<a href="https://colab.research.google.com/github/Bayhaqieee/SearchSort_OlistDatasets/blob/main/SearchSort_Olist_Team_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


# Binary Search - Counting Sort (note: the sorting step below uses pandas `sort_values`, not a hand-written counting sort)

In [None]:
import pandas as pd
import time
from tabulate import tabulate

# Locations of the two Olist CSV files on the mounted Google Drive
GEO_URL = '/content/drive/MyDrive/Kuliah/Analgo/Dataset/olist_geolocation_dataset.csv'
SELLER_URL = '/content/drive/MyDrive/Kuliah/Analgo/Dataset/olist_sellers_dataset.csv'

# Load both CSV files. If either file is missing, print a hint (Indonesian for
# "make sure the dataset is in the correct location") and fall back to empty
# DataFrames for BOTH datasets.
try:
    geo_df = pd.read_csv(GEO_URL)
    seller_df = pd.read_csv(SELLER_URL)
except FileNotFoundError:
    print("Pastikan dataset berada di lokasi yang benar.")
    geo_df = pd.DataFrame()
    seller_df = pd.DataFrame()

# Fungsi Binary Search
def benchmark_binary_search(df, column):
    """Time one binary search for the middle element of a sorted column.

    The column's non-null values are sorted in their native ordering and the
    search compares values in that same ordering. Comparing ``str()`` forms
    against natively sorted data would break the binary-search invariant for
    numeric columns (numerically 9 < 10, but "9" > "10").

    Only the search loop is timed; sorting and target selection are excluded.

    Args:
        df: DataFrame holding the column.
        column: Name of the column to search.

    Returns:
        Elapsed seconds rounded to 6 decimals, or "-" when the frame is
        empty, the column does not exist, or the column has no non-null
        values.
    """
    if df.empty or column not in df.columns:
        return "-"
    data = df[column].dropna().sort_values().to_numpy()
    if len(data) == 0:
        return "-"
    # The middle element is guaranteed to be present, so the loop always hits.
    target = data[len(data) // 2]
    low, high = 0, len(data) - 1
    start = time.perf_counter()
    while low <= high:
        mid = (low + high) // 2
        value = data[mid]
        if value == target:
            break
        if value < target:
            low = mid + 1
        else:
            high = mid - 1
    elapsed = time.perf_counter() - start
    return round(elapsed, 6)

# Jalankan benchmarking pada semua kolom
def run_full_benchmark(df, dataset_name):
    """Benchmark a pandas sort and a binary search for every column of *df*.

    Failures in either step are reported on stdout and recorded as "-" so one
    problematic column does not abort the whole run.

    Args:
        df: DataFrame whose columns are benchmarked.
        dataset_name: Label stored in the first field of every result row.

    Returns:
        A list of ``[dataset_name, column, sort_time, search_time, total]``
        rows. Each time is a float in seconds (rounded to 6 decimals) or "-";
        ``total`` is "-" unless both times are floats. An empty frame yields
        an empty list.
    """
    results = []
    if df.empty:
        print(f"Skipping benchmark for {dataset_name} as dataframe is empty.")
        return []
    for col in df.columns:
        sort_time = "-"
        search_time = "-"
        try:
            start = time.perf_counter()
            df.sort_values(by=col, inplace=False)
            sort_time = round(time.perf_counter() - start, 6)
        except Exception as exc:
            # Best effort: keep going, but say why this column has no timing.
            print(f"[SORT] Error on {col}: {exc}")
        try:
            search_time = benchmark_binary_search(df, col)
        except Exception as exc:
            print(f"[SEARCH] Error on {col}: {exc}")
        if isinstance(sort_time, float) and isinstance(search_time, float):
            total = round(sort_time + search_time, 6)
        else:
            total = "-"
        results.append([
            dataset_name, col, sort_time, search_time, total
        ])
    return results

# Run the benchmark on both datasets
geo_benchmark = run_full_benchmark(geo_df, "Geolocation Dataset")
seller_benchmark = run_full_benchmark(seller_df, "Sellers Dataset")

# Combine the per-dataset rows into one table and print it as a grid
all_results = geo_benchmark + seller_benchmark
df_summary = pd.DataFrame(all_results, columns=["Dataset", "Column", "Sort Time", "Binary Search Time", "Total Time"])
print(tabulate(df_summary, headers='keys', tablefmt='grid'))

+----+---------------------+-----------------------------+-------------+----------------------+--------------+
|    | Dataset             | Column                      |   Sort Time |   Binary Search Time |   Total Time |
|  0 | Geolocation Dataset | geolocation_zip_code_prefix |    0.130619 |             0.215962 |     0.346581 |
+----+---------------------+-----------------------------+-------------+----------------------+--------------+
|  1 | Geolocation Dataset | geolocation_lat             |    0.123619 |             0.825702 |     0.949321 |
+----+---------------------+-----------------------------+-------------+----------------------+--------------+
|  2 | Geolocation Dataset | geolocation_lng             |    0.117843 |             0.8057   |     0.923543 |
+----+---------------------+-----------------------------+-------------+----------------------+--------------+
|  3 | Geolocation Dataset | geolocation_city            |    0.873691 |             0.054933 |     0.928624 |
+

# Jump Search - Heap Sort

In [None]:
import pandas as pd
import time
import math
from tabulate import tabulate
import heapq

# Locations of the two Olist CSV files on the mounted Google Drive
GEO_URL = '/content/drive/MyDrive/Kuliah/Analgo/Dataset/olist_geolocation_dataset.csv'
SELLER_URL = '/content/drive/MyDrive/Kuliah/Analgo/Dataset/olist_sellers_dataset.csv'

# Read both files; there is no fallback here, so a missing file raises.
geo_df = pd.read_csv(GEO_URL)
seller_df = pd.read_csv(SELLER_URL)

# Heap Sort
def benchmark_heap_sort(df, column):
    """Time a heap sort (heapify, then pop everything) of one column.

    Args:
        df: DataFrame holding the column.
        column: Name of the column whose non-null values are sorted.

    Returns:
        Elapsed seconds rounded to 6 decimals, or "-" when the values cannot
        be ordered (for example a column mixing numbers and strings).
    """
    data = df[column].dropna().tolist()
    try:
        start = time.perf_counter()
        heapq.heapify(data)
        # Popping the minimum until the heap is empty yields ascending order;
        # the sorted output is built as part of the measured work and then
        # discarded because only the elapsed time is reported.
        _ = [heapq.heappop(data) for _ in range(len(data))]
        elapsed = time.perf_counter() - start
    except Exception:
        # Deliberately best-effort, but no longer traps KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` does.
        return "-"
    return round(elapsed, 6)

# Jump Search
def jump_search(arr, x):
    """Locate *x* in the ascending sequence *arr* using jump search.

    The sequence is scanned in blocks of ``int(sqrt(len(arr)))`` elements
    until a block whose last element is not smaller than *x* is found; that
    block is then scanned linearly.

    Returns:
        The index of a matching element, or -1 when *x* is not found.
    """
    size = len(arr)
    stride = int(math.sqrt(size))
    block_start, block_end = 0, stride
    # Skip whole blocks while the last element of the current block is < x.
    while block_start < size and arr[min(block_end, size) - 1] < x:
        block_start = block_end
        block_end += stride
        if block_start >= size:
            return -1
    # Linear scan inside the candidate block.
    for idx in range(block_start, min(block_end, size)):
        if arr[idx] == x:
            return idx
    return -1

def benchmark_jump_search(df, column, target):
    """Time one jump search for *target* in a column.

    The column's non-null values are converted to strings and sorted before
    the timer starts, so only the search itself is measured. *target* is
    converted with ``str()`` so it is compared in the same ordering.

    Args:
        df: DataFrame holding the column.
        column: Name of the column to search.
        target: Value to look for.

    Returns:
        Elapsed seconds rounded to 6 decimals, or "-" if preparing the data
        or searching raised an exception (for example an unknown column).
    """
    try:
        data = sorted(df[column].dropna().astype(str).values)
        start = time.perf_counter()
        jump_search(data, str(target))
        elapsed = time.perf_counter() - start
    except Exception:
        # Deliberately best-effort, but no longer traps KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` does.
        return "-"
    return round(elapsed, 6)

# Jalankan benchmark untuk semua kolom
def run_detailed_benchmark(df, dataset_name):
    """Benchmark heap sort and jump search for every column of *df*.

    The search target is the middle element of the column's non-null values.
    The index is derived from the length of that filtered series rather than
    ``len(df)``, so columns containing missing values still get a valid
    target instead of failing with an out-of-range index.

    Args:
        df: DataFrame whose columns are benchmarked.
        dataset_name: Label stored in the first field of every result row.

    Returns:
        A list with two rows per column:
        ``[dataset_name, column, "Heap Sort", sort_time]`` and
        ``[dataset_name, column, "Jump Search", search_time]``.
        The search time is "-" for a column without non-null values.
    """
    results = []
    for col in df.columns:
        sort_time = benchmark_heap_sort(df, col)
        values = df[col].dropna()
        if values.empty:
            # Nothing to search for in a column with only missing values.
            search_time = "-"
        else:
            target = values.iloc[len(values) // 2]
            search_time = benchmark_jump_search(df, col, target)
        results.append([dataset_name, col, "Heap Sort", sort_time])
        results.append([dataset_name, col, "Jump Search", search_time])
    return results

# Tampilkan ringkasan dan total waktu
def summarize_results(results):
    """Pivot per-operation timing rows into one row per (dataset, column).

    Args:
        results: Rows of ``[dataset, column, operation, time]`` where ``time``
            is a number of seconds or the placeholder "-".

    Returns:
        A DataFrame with the columns "Dataset" and "Column", one column per
        operation, and "Total Time (s)". The total is the rounded sum of
        "Heap Sort" and "Jump Search", or "-" when either one is missing.
    """
    df = pd.DataFrame(results, columns=["Dataset", "Column", "Operation", "Time (s)"])
    # "-" placeholders turn the column into object dtype, which the mean
    # aggregation inside pivot_table cannot handle; coerce them to NaN first.
    df["Time (s)"] = pd.to_numeric(df["Time (s)"], errors="coerce")
    pivot = df.pivot_table(index=["Dataset", "Column"], columns="Operation", values="Time (s)").reset_index()

    # Compute the total only when both timings are present
    def calculate_total(row):
        sort_time = row.get("Heap Sort")
        search_time = row.get("Jump Search")
        # A missing timing shows up as NaN (or None if the whole operation
        # column is absent); NaN is a float, so an isinstance check is not
        # enough to detect it.
        if pd.isna(sort_time) or pd.isna(search_time):
            return "-"
        return round(float(sort_time + search_time), 6)

    pivot["Total Time (s)"] = [calculate_total(row) for _, row in pivot.iterrows()]
    return pivot

# Run the benchmark on both datasets
geo_results = run_detailed_benchmark(geo_df, "Geolocation Dataset")
seller_results = run_detailed_benchmark(seller_df, "Sellers Dataset")  # switched from the products dataset to the sellers dataset

# Combine the rows of both datasets and pivot them into a summary table
all_results = geo_results + seller_results
summary_df = summarize_results(all_results)

# Print the summary as a grid table without the index column
print(tabulate(summary_df, headers='keys', tablefmt='grid', showindex=False))


+---------------------+-----------------------------+-------------+---------------+------------------+
| Dataset             | Column                      |   Heap Sort |   Jump Search |   Total Time (s) |
| Geolocation Dataset | geolocation_city            |    0.690356 |      0.000412 |         0.690768 |
+---------------------+-----------------------------+-------------+---------------+------------------+
| Geolocation Dataset | geolocation_lat             |    0.983726 |      0.000388 |         0.984114 |
+---------------------+-----------------------------+-------------+---------------+------------------+
| Geolocation Dataset | geolocation_lng             |    0.913887 |      0.000283 |         0.91417  |
+---------------------+-----------------------------+-------------+---------------+------------------+
| Geolocation Dataset | geolocation_state           |    1.10639  |      0.000279 |         1.10667  |
+---------------------+-----------------------------+-------------+------

# Jump Search - Merge Sort

In [None]:
import pandas as pd
import time
import math
from tabulate import tabulate

# Locations of the two Olist CSV files on the mounted Google Drive
GEO_URL = '/content/drive/MyDrive/Kuliah/Analgo/Dataset/olist_geolocation_dataset.csv'
SELLER_URL = '/content/drive/MyDrive/Kuliah/Analgo/Dataset/olist_sellers_dataset.csv'

# Read both files; there is no fallback here, so a missing file raises.
geo_df = pd.read_csv(GEO_URL)
seller_df = pd.read_csv(SELLER_URL)

# Merge Sort
def merge_sort(arr):
    """Return the elements of *arr* ordered by top-down merge sort.

    A sequence of length 0 or 1 is returned as-is (the same object); longer
    inputs are split in half, sorted recursively and combined with ``merge``.
    """
    size = len(arr)
    if size > 1:
        half = size // 2
        lower_half = merge_sort(arr[:half])
        upper_half = merge_sort(arr[half:])
        return merge(lower_half, upper_half)
    return arr

def merge(left, right):
    """Merge two sorted lists into one, ordering elements by their ``str()`` form.

    On ties the element from *left* is taken first. The original elements
    (not their string forms) are placed in the returned list.
    """
    combined = []
    li, ri = 0, 0
    left_len, right_len = len(left), len(right)
    while li < left_len and ri < right_len:
        take_left = str(left[li]) <= str(right[ri])
        if take_left:
            combined.append(left[li])
            li += 1
        else:
            combined.append(right[ri])
            ri += 1
    # At most one of the two tails is non-empty at this point.
    combined.extend(left[li:])
    combined.extend(right[ri:])
    return combined

def benchmark_merge_sort(df, column):
    """Return the seconds ``merge_sort`` takes on the non-null values of *column*."""
    values = df[column].dropna().tolist()
    started = time.time()
    merge_sort(values)
    finished = time.time()
    elapsed = finished - started
    return elapsed

# Jump Search
def jump_search(arr, x):
    """Find *x* in the ascending sequence *arr* with jump search.

    Blocks of ``int(sqrt(len(arr)))`` elements are skipped while the last
    element of the current block is smaller than *x*; the first block that
    could contain *x* is then scanned element by element.

    Returns:
        The index of a matching element, or -1 when there is none.
    """
    length = len(arr)
    jump = int(math.sqrt(length))
    lower, upper = 0, jump
    while lower < length:
        if not arr[min(upper, length) - 1] < x:
            break
        lower, upper = upper, upper + jump
        if lower >= length:
            return -1
    window = range(lower, min(upper, length))
    return next((pos for pos in window if arr[pos] == x), -1)

def benchmark_jump_search(df, column, target):
    """Return the seconds one jump search for *target* takes on *column*.

    The non-null values are converted to strings and sorted before the timer
    starts; *target* is converted with ``str()`` for the lookup.
    """
    as_text = df[column].dropna().astype(str).values
    ordered = sorted(as_text)
    needle = str(target)
    started = time.time()
    jump_search(ordered, needle)
    finished = time.time()
    return finished - started

# Benchmark semua kolom dalam dataset
def run_full_benchmark(df, dataset_name):
    """Benchmark merge sort and jump search for every column of *df*.

    The search target is the middle element of the column's non-null values.
    The index is derived from the length of that filtered series rather than
    ``len(df)``, so columns with missing values still get a valid target.
    Failures are reported on stdout and recorded as "-".

    Args:
        df: DataFrame whose columns are benchmarked.
        dataset_name: Label stored in the first field of every result row.

    Returns:
        A list of ``[dataset_name, column, sort, search, total]`` rows where
        the three timings are strings formatted with six decimals, or "-"
        when the corresponding measurement is unavailable.
    """
    results = []
    for col in df.columns:
        try:
            sort_time = benchmark_merge_sort(df, col)
        except Exception as exc:
            print(f"[SORT] Error on {col}: {exc}")
            sort_time = "-"
        try:
            values = df[col].dropna().astype(str)
            target = values.iloc[len(values) // 2]
            search_time = benchmark_jump_search(df, col, target)
        except Exception as exc:
            print(f"[SEARCH] Error on {col}: {exc}")
            search_time = "-"
        have_sort = isinstance(sort_time, float)
        have_search = isinstance(search_time, float)
        results.append([
            dataset_name,
            col,
            f"{sort_time:.6f}" if have_sort else "-",
            f"{search_time:.6f}" if have_search else "-",
            f"{(sort_time + search_time):.6f}" if have_sort and have_search else "-"
        ])
    return results

# Run the benchmark on both datasets
geo_results = run_full_benchmark(geo_df, "Geolocation Dataset")
seller_results = run_full_benchmark(seller_df, "Sellers Dataset")  # switched from the products dataset to the sellers dataset

# Combine the rows of both datasets into one DataFrame
columns = ["Dataset", "Column", "Merge Sort Time", "Jump Search Time", "Total Time"]
benchmark_df = pd.DataFrame(geo_results + seller_results, columns=columns)

# Print the table as a grid
print(tabulate(benchmark_df, headers='keys', tablefmt='grid'))

+----+---------------------+-----------------------------+-------------------+--------------------+--------------+
|    | Dataset             | Column                      |   Merge Sort Time |   Jump Search Time |   Total Time |
|  0 | Geolocation Dataset | geolocation_zip_code_prefix |          6.40993  |           0.000318 |     6.41025  |
+----+---------------------+-----------------------------+-------------------+--------------------+--------------+
|  1 | Geolocation Dataset | geolocation_lat             |         34.6054   |           0.000501 |    34.6059   |
+----+---------------------+-----------------------------+-------------------+--------------------+--------------+
|  2 | Geolocation Dataset | geolocation_lng             |         32.6697   |           0.000259 |    32.6699   |
+----+---------------------+-----------------------------+-------------------+--------------------+--------------+
|  3 | Geolocation Dataset | geolocation_city            |          5.84117  |  

# Hash Search - Selection Sort

In [None]:
import pandas as pd
import time
from tabulate import tabulate

# Locations of the two Olist CSV files on the mounted Google Drive
GEO_URL = '/content/drive/MyDrive/Kuliah/Analgo/Dataset/olist_geolocation_dataset.csv'
SELLER_URL = '/content/drive/MyDrive/Kuliah/Analgo/Dataset/olist_sellers_dataset.csv'

# Read both files; there is no fallback here, so a missing file raises.
geo_df = pd.read_csv(GEO_URL)
seller_df = pd.read_csv(SELLER_URL)

# Auto sample function
def auto_sample(df, max_rows=10000):
    """Return at most *max_rows* rows of *df*.

    Frames that already fit are returned as a copy; larger frames are
    down-sampled with a fixed ``random_state`` of 42 so repeated calls pick
    the same rows.
    """
    if len(df) <= max_rows:
        return df.copy()
    return df.sample(n=max_rows, random_state=42)

# Selection sort
def selection_sort(arr):
    """Return a copy of *arr* sorted by selection sort on the ``str()`` form of each element.

    The input is left untouched. For each position, the first element with
    the smallest string form among the remaining ones is swapped into place.
    """
    ordered = arr.copy()
    total = len(ordered)
    for slot in range(total):
        # min() returns the first minimal index, matching a strict "<" scan.
        smallest = min(range(slot, total), key=lambda pos: str(ordered[pos]))
        ordered[slot], ordered[smallest] = ordered[smallest], ordered[slot]
    return ordered

def benchmark_selection_sort(df, column):
    """Return the seconds ``selection_sort`` takes on the non-null values of *column*."""
    values = df[column].dropna().tolist()
    started = time.time()
    selection_sort(values)
    finished = time.time()
    elapsed = finished - started
    return elapsed

# Hash search
def hash_search(data_dict, target):
    """Look up *target* in *data_dict*; return its value, or None when absent."""
    found = data_dict.get(target)
    return found

def benchmark_hash_search(df, column, target):
    """Return the seconds one dictionary lookup of *target* takes.

    The lookup table maps each non-null value (as a string) to the position
    of its last occurrence. It is built before the timer starts, so only the
    lookup itself is measured.
    """
    keys = df[column].dropna().astype(str).tolist()
    lookup = dict(zip(keys, range(len(keys))))
    needle = str(target)
    started = time.time()
    hash_search(lookup, needle)
    finished = time.time()
    return finished - started

# Run benchmark on all columns
def run_full_benchmark(df, dataset_name):
    """Benchmark selection sort and hash search for every column of *df*.

    The frame is first reduced with ``auto_sample``. The search target is the
    middle element of the column's non-null values. The index is derived from
    the length of that filtered series rather than ``len(df)``, so columns
    with missing values still get a valid target.
    Failures are reported on stdout and recorded as "-".

    Args:
        df: DataFrame whose columns are benchmarked.
        dataset_name: Label stored in the first field of every result row.

    Returns:
        A list of ``[dataset_name, column, sort, search, total]`` rows where
        the three timings are strings formatted with six decimals, or "-"
        when the corresponding measurement is unavailable.
    """
    df = auto_sample(df)
    results = []

    for col in df.columns:
        try:
            sort_time = benchmark_selection_sort(df, col)
        except Exception as e:
            print(f"[SORT] Error on {col}: {e}")
            sort_time = "-"

        try:
            values = df[col].dropna()
            target = values.iloc[len(values) // 2]
            search_time = benchmark_hash_search(df, col, target)
        except Exception as e:
            print(f"[HASH] Error on {col}: {e}")
            search_time = "-"

        if isinstance(sort_time, float) and isinstance(search_time, float):
            total_time = sort_time + search_time
        else:
            total_time = "-"

        results.append([
            dataset_name,
            col,
            f"{sort_time:.6f}" if isinstance(sort_time, float) else "-",
            f"{search_time:.6f}" if isinstance(search_time, float) else "-",
            f"{total_time:.6f}" if isinstance(total_time, float) else "-"
        ])
    return results

# Run the benchmark on both datasets
geo_results = run_full_benchmark(geo_df, "Geolocation Dataset")
seller_results = run_full_benchmark(seller_df, "Sellers Dataset")  # switched from the products dataset to the sellers dataset

# Combine the rows of both datasets into one DataFrame and print it as a grid
final_results = geo_results + seller_results
df_result = pd.DataFrame(final_results, columns=["Dataset", "Column", "Selection Sort Time", "Hash Search Time", "Total Time"])

print(tabulate(df_result, headers='keys', tablefmt='grid'))

+----+---------------------+-----------------------------+-----------------------+--------------------+--------------+
|    | Dataset             | Column                      |   Selection Sort Time |   Hash Search Time |   Total Time |
|  0 | Geolocation Dataset | geolocation_zip_code_prefix |             12.1058   |            7e-06   |    12.1058   |
+----+---------------------+-----------------------------+-----------------------+--------------------+--------------+
|  1 | Geolocation Dataset | geolocation_lat             |             83.2189   |            1.2e-05 |    83.2189   |
+----+---------------------+-----------------------------+-----------------------+--------------------+--------------+
|  2 | Geolocation Dataset | geolocation_lng             |             81.5352   |            8e-06   |    81.5352   |
+----+---------------------+-----------------------------+-----------------------+--------------------+--------------+
|  3 | Geolocation Dataset | geolocation_city   