# Imports

In [24]:
import csv
import os
import re
import sqlite3

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Helper Functions

In [None]:
# Read the CSV file as a database
def read_csvs_as_db(csv_files):
    """Load CSV files into a fresh in-memory SQLite database, one table per file.

    Each table is named after the file's base name up to its first dot
    (``data/Dish.csv`` -> ``Dish``) and gets one untyped column per header
    field, so values are inserted exactly as the strings read from the file.

    Args:
        csv_files: Iterable of paths to UTF-8 encoded CSV files whose first
            row is the header. Files without a header row are skipped.

    Returns:
        sqlite3.Connection: Open connection to the populated database. The
        caller is responsible for closing it.

    Raises:
        OSError: If a file cannot be opened.
        sqlite3.Error: If a table cannot be created or a row's field count
            does not match the table's column count.
    """
    def quote_identifier(identifier):
        # Double embedded quotes so arbitrary file/header names are valid SQL identifiers.
        return '"' + identifier.replace('"', '""') + '"'

    conn = sqlite3.connect(':memory:')
    try:
        cursor = conn.cursor()
        for csv_file in csv_files:
            table_name = os.path.basename(csv_file).split('.')[0]
            print(f'Processing {csv_file} into table {table_name}')
            # newline='' lets the csv module handle line endings itself, which is
            # required for quoted fields that contain newlines; utf-8-sig drops a
            # leading BOM so it does not end up in the first column name.
            with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
                reader = csv.reader(f)
                headers = next(reader, None)
                if not headers:
                    # Empty file or blank header line: nothing to build a table from.
                    print(f'Skipping {csv_file}: no header row found')
                    continue
                table = quote_identifier(table_name)
                # Columns are declared without a type; no affinity is applied.
                columns = ', '.join(quote_identifier(h) for h in headers)
                cursor.execute(f'CREATE TABLE IF NOT EXISTS {table} ({columns})')
                for row in reader:
                    if not row:
                        # csv.reader yields [] for blank lines; "VALUES ()" is invalid SQL.
                        continue
                    placeholders = ', '.join('?' * len(row))
                    cursor.execute(f'INSERT INTO {table} VALUES ({placeholders})', row)
        conn.commit()
    except Exception:
        # Do not leak the connection when loading fails part-way through.
        conn.close()
        raise
    return conn

# Normalize the dish names
def normalize_dish_names(csv_file):
    """Clean the ``name`` column of a dish CSV and write the result to a new file.

    Names are lower-cased, stripped of double and curly quotes, stripped of
    leading punctuation/whitespace, and have runs of whitespace collapsed.
    Rows whose name is missing, or empty after cleaning, are dropped.

    The output is written in the same directory as the input, as
    ``Normalized_<input file name>``.

    Args:
        csv_file: Path to a CSV file containing a ``name`` column.

    Raises:
        KeyError: If the file has no ``name`` column.
    """
    print(f'Normalizing dish names in {csv_file}')
    df = pd.read_csv(csv_file)
    # Only keep rows with a non-empty 'name'; copy so the column assignment
    # below does not write into a slice of the original frame.
    df = df[df['name'].notna()].copy()
    # Compiled so Python's Unicode-aware \W is always used: accented letters
    # count as alphanumeric and are kept, while '_' is still stripped.
    leading_non_alnum = re.compile(r'^[\W_]+')
    # Apply normalization using pandas string methods and regex
    df['name'] = (
        df['name']
        .astype(str) # Names that read_csv parsed as numbers still need .str
        .str.lower() # Convert to lowercase
        .str.replace(r'["“”‘’]+', '', regex=True) # Remove quotes
        .str.replace(leading_non_alnum, '', regex=True) # Remove leading non-alphanumeric characters
        .str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
        .str.strip() # Strip leading/trailing spaces
    )
    # Remove rows where 'name' is empty or 'nan'
    df = df[(df['name'] != '') & (df['name'] != 'nan')]
    # Save to new CSV; prefix only the file name so inputs given with a
    # directory (e.g. 'data/Dish.csv') are written beside the source file.
    directory, filename = os.path.split(csv_file)
    df.to_csv(os.path.join(directory, f'Normalized_{filename}'), index=False)

# Run Cleaning Functions

In [26]:
# Writes Normalized_Dish.csv to the working directory; the clustering cells below read that file.
normalize_dish_names('Dish.csv')

Normalizing dish names in Dish.csv


# Using SKLearn for Clustering

In [None]:
# Step 1: Load cleaned CSV
df = pd.read_csv('Normalized_Dish.csv')

# Step 2: Drop rows with empty or null 'name'
has_name = df['name'].notna() & (df['name'].str.strip() != "")
df_clean = df[has_name].copy()

# Step 3: Vectorize with character n-grams
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
X = vectorizer.fit_transform(df_clean['name'])

# Step 4: Apply KMeans (roughly four rows per cluster; never fewer than one cluster)
k = max(1, len(df_clean) // 4)
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df_clean['name_cluster'] = kmeans.fit_predict(X)

# Step 5: Merge back with original DataFrame
# df_clean has one row per dish, so the same name can appear many times.
# Merging on 'name' against that frame is a many-to-many join that multiplies
# rows (m*m rows for a name that occurs m times). Identical names produce
# identical TF-IDF vectors and therefore the same cluster, so one row per
# name is a complete lookup table.
name_to_cluster = df_clean[['name', 'name_cluster']].drop_duplicates(subset='name')
df = df.merge(name_to_cluster, on='name', how='left', validate='many_to_one')
# Rows without a usable name get no cluster; a nullable integer dtype keeps the
# ids as integers instead of letting the missing values turn them into floats.
df['name_cluster'] = df['name_cluster'].astype('Int64')

# Step 6: Save to CSV
df.to_csv("Clustered_Dish.csv", index=False)


In [28]:
# Remove the 'description' column if the frame has one, then write the result out
columns_to_remove = ['description']
df = df.drop(columns=columns_to_remove, errors='ignore')
df.to_csv("Cleaned_Dish.csv", index=False)

# Clustering We Wanted to Run (Took Too Long)
### Stopped after 6hrs

In [None]:
import pandas as pd
from rapidfuzz import fuzz, process
from concurrent.futures import ThreadPoolExecutor, as_completed

# Parameters
MAX_WORKERS = 16  # Use as many threads as your CPU can handle
SIMILARITY_THRESHOLD = 90  # Adjust based on how strict you want matching

# Shared results, filled in while the clusters are built
clusters = []
visited = set()

# Load your cleaned CSV and collect the distinct, non-null dish names
df = pd.read_csv('Normalized_Dish.csv')  # Adjust path as needed
names = df['name'].dropna().unique()

def find_matches(name):
    """Return every known name whose similarity to ``name`` meets the threshold.

    Compares ``name`` against the module-level ``names`` collection using
    ``fuzz.token_sort_ratio`` and keeps matches scoring at least
    ``SIMILARITY_THRESHOLD``.

    Args:
        name: The dish name to find near-duplicates of.

    Returns:
        list: The matching names, or an empty list if ``name`` is already in
        the module-level ``visited`` set.
    """
    if name in visited:
        return []
    matches = process.extract(
        name,
        names,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=SIMILARITY_THRESHOLD,
        limit=None,  # the default limit of 5 would silently cap every cluster at five names
    )
    # Each match is a (choice, score, index) tuple; keep only the matched string.
    # NOTE(review): every call scans all of `names`, so total work grows
    # quadratically with the number of unique names.
    return [match[0] for match in matches]

# Step 1: Build clusters in parallel
print(f"Clustering {len(names)} unique names with {MAX_WORKERS} threads...")
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # executor.map yields results in submission order. Consuming them in the
    # order threads happen to finish would make the winner of two overlapping
    # clusters depend on thread timing, so repeated runs could disagree.
    for cluster in executor.map(find_matches, names):
        if not cluster:
            continue
        # Only keep a cluster if none of its members already belongs to one
        if visited.isdisjoint(cluster):
            clusters.append(cluster)
            visited.update(cluster)

print(f"Identified {len(clusters)} clusters")

# Step 2: Build canonical mapping (the shortest member represents its cluster)
name_to_canonical = {}
for members in clusters:
    shortest = min(members, key=len)
    name_to_canonical.update(dict.fromkeys(members, shortest))

# Step 3: Apply mapping; names that belong to no cluster keep their own value
canonical_names = df['name'].map(name_to_canonical)
df['normalized_name'] = canonical_names.fillna(df['name'])

# Step 4: Save result
df.to_csv('Fuzzy_Cleaned_Dish.csv', index=False)
print("✅ Saved clustered version to Fuzzy_Cleaned_Dish.csv")