# Extreme Sparsity

This notebook seeks to investigate the effect of extreme sparsity on the performance of the model. This should be the "raw" approach to the problem without any assumptions other than in regards to the model.
Because of the extreme sparsity the choice of clustering model is important. The model should be able to handle the extreme sparsity and be able to cluster the data in a way that is meaningful. The model should also be able to handle the large number of features. 

The considered and tested models are:
- K-Means with sparse initialization.
- 

In [15]:
import pandas as pd
import numpy as np
import json
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from typing import Dict
from scipy.sparse.csr import csr_matrix


  from scipy.sparse.csr import csr_matrix


In [16]:
# load data from json
with open('../data/raw/train.json') as f:
    data = json.load(f)

# number of cuisines
cuisines = [d['cuisine'] for d in data]
number_of_cuisines = len(set(cuisines))
print(f"Number of cuisines: {number_of_cuisines}")


Number of cuisines: 20


In [17]:
print(set(cuisines))

{'russian', 'greek', 'french', 'jamaican', 'indian', 'japanese', 'italian', 'chinese', 'filipino', 'moroccan', 'vietnamese', 'spanish', 'mexican', 'brazilian', 'british', 'thai', 'korean', 'irish', 'cajun_creole', 'southern_us'}


In [18]:
# 1. Generate the corpus of all ingredients.
updated_data = []
corpus = []
for i, entry in enumerate(data):
    entry_ingredients = [
        ingredient.replace(" ", "") for ingredient in entry["ingredients"]
    ]
    corpus.append(" ".join(entry_ingredients))
    entry["representation_idx"] = i
    updated_data.append(entry.copy())

# 2. Create the count vectors.
vectorizer = CountVectorizer(min_df=1)
one_hot_representation = vectorizer.fit_transform(corpus)
vocab = vectorizer.vocabulary_

In [19]:
assert one_hot_representation.shape[0] == len(data)

In [20]:
from collections import defaultdict
from sklearn import metrics
from time import time

evaluations = []
evaluations_std = []


def fit_and_evaluate(km, X, labels,n_runs=5):
    name = km.__class__.__name__

    train_times = []
    scores = defaultdict(list)
    for seed in range(n_runs):
        km.set_params(random_state=seed)
        t0 = time()
        km.fit(X)
        train_times.append(time() - t0)
        scores["Homogeneity"].append(metrics.homogeneity_score(labels, km.labels_))
        scores["Completeness"].append(metrics.completeness_score(labels, km.labels_))
        scores["V-measure"].append(metrics.v_measure_score(labels, km.labels_))
        scores["Adjusted Rand-Index"].append(
            metrics.adjusted_rand_score(labels, km.labels_)
        )
        scores["Silhouette Coefficient"].append(
            metrics.silhouette_score(X, km.labels_, sample_size=2000)
        )
    train_times = np.asarray(train_times)

    print(f"clustering done in {train_times.mean():.2f} ± {train_times.std():.2f} s ")
    evaluation = {
        "estimator": name,
        "train_time": train_times.mean(),
    }
    evaluation_std = {
        "estimator": name,
        "train_time": train_times.std(),
    }
    for score_name, score_values in scores.items():
        mean_score, std_score = np.mean(score_values), np.std(score_values)
        print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")
        evaluation[score_name] = mean_score
        evaluation_std[score_name] = std_score
    evaluations.append(evaluation)
    evaluations_std.append(evaluation_std)

In [21]:
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=number_of_cuisines,
    max_iter=100,
    n_init=5,
)

fit_and_evaluate(kmeans, one_hot_representation, cuisines)

clustering done in 2.42 ± 0.61 s 
Homogeneity: 0.174 ± 0.013
Completeness: 0.159 ± 0.011
V-measure: 0.166 ± 0.012
Adjusted Rand-Index: 0.058 ± 0.006
Silhouette Coefficient: -0.002 ± 0.002


In [22]:
# Run the same experiment but with regions instead
with open("../data/regions.json") as infile:
    regions = json.load(infile)

region_labels = [regions[d["cuisine"]] for d in data]

from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=len(set(region_labels)),
    max_iter=100,
    n_init=5,
)

fit_and_evaluate(kmeans, one_hot_representation, region_labels)

clustering done in 0.50 ± 0.09 s 
Homogeneity: 0.103 ± 0.031
Completeness: 0.094 ± 0.029
V-measure: 0.099 ± 0.030
Adjusted Rand-Index: 0.035 ± 0.013
Silhouette Coefficient: 0.024 ± 0.003


In [23]:
# for each cuisine, find the 10 most common ingredients
cuisine_ingredients = defaultdict(list)
for entry in updated_data:
    cuisine = entry["cuisine"]
    ingredients = entry["ingredients"]
    cuisine_ingredients[cuisine].extend(ingredients)

ingredients_to_use = list()

for cuisine, ingredients in cuisine_ingredients.items():
    from collections import Counter
    c = Counter(ingredients)
    most_common = c.most_common(10)
    ingredients_to_use.extend([ingredient for ingredient, _ in most_common])

# remove duplicates
ingredients_to_use = list(set(ingredients_to_use))


In [24]:
# 1. Generate the corpus of all ingredients.
updated_data = []
corpus = []
for i, entry in enumerate(data):
    entry_ingredients = [
        ingredient.replace(" ", "") for ingredient in entry["ingredients"]
    ]
    corpus.append(" ".join(entry_ingredients))
    entry["representation_idx"] = i
    updated_data.append(entry.copy())

# 2. Create the count vectors.
vectorizer = CountVectorizer(vocabulary=ingredients_to_use)
one_hot_representation = vectorizer.fit_transform(corpus)
vocab = vectorizer.vocabulary_

In [25]:
one_hot_representation.shape

(39774, 63)

In [26]:
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=len(set(region_labels)),
    max_iter=100,
    n_init=5,
)

fit_and_evaluate(kmeans, one_hot_representation, cuisines)

clustering done in 0.19 ± 0.07 s 
Homogeneity: 0.028 ± 0.004
Completeness: 0.045 ± 0.006
V-measure: 0.035 ± 0.004
Adjusted Rand-Index: 0.022 ± 0.005
Silhouette Coefficient: 0.212 ± 0.008


In [27]:
# Run the same experiment but with regions instead
with open("../data/regions.json") as infile:
    regions = json.load(infile)

region_labels = [regions[d["cuisine"]] for d in data]

from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=len(set(region_labels)),
    max_iter=100,
    n_init=5,
)

fit_and_evaluate(kmeans, one_hot_representation, region_labels)

clustering done in 0.20 ± 0.05 s 
Homogeneity: 0.021 ± 0.004
Completeness: 0.019 ± 0.003
V-measure: 0.020 ± 0.003
Adjusted Rand-Index: 0.013 ± 0.002
Silhouette Coefficient: 0.209 ± 0.007


In [28]:
# Run the same 
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=len(set(cuisines))*10,
    max_iter=100,
    n_init=5,
)

fit_and_evaluate(kmeans, one_hot_representation, cuisines)

clustering done in 4.98 ± 0.07 s 
Homogeneity: 0.152 ± 0.002
Completeness: 0.091 ± 0.001
V-measure: 0.114 ± 0.001
Adjusted Rand-Index: 0.024 ± 0.000
Silhouette Coefficient: 0.621 ± 0.011


In [29]:
# collect top ten ingredients for each region
cuisine_ingredients = defaultdict(list)
for entry in updated_data:
    cuisine = entry["cuisine"]
    ingredients = entry["ingredients"]
    cuisine_ingredients[regions[cuisine]].extend(ingredients)

ingredients_to_use = list()

for cuisine, ingredients in cuisine_ingredients.items():
    from collections import Counter
    c = Counter(ingredients)
    most_common = c.most_common(10)
    ingredients_to_use.extend([ingredient for ingredient, _ in most_common])

# remove duplicates
ingredients_to_use = list(set(ingredients_to_use))

len(ingredients_to_use)

30

In [30]:
# 1. Generate the corpus of all ingredients.
updated_data = []
corpus = []
for i, entry in enumerate(data):
    entry_ingredients = [
        ingredient.replace(" ", "") for ingredient in entry["ingredients"]
    ]
    corpus.append(" ".join(entry_ingredients))
    entry["representation_idx"] = i
    updated_data.append(entry.copy())

# 2. Create the count vectors.
vectorizer = CountVectorizer(vocabulary=ingredients_to_use)
one_hot_representation = vectorizer.fit_transform(corpus)
vocab = vectorizer.vocabulary_
one_hot_representation.shape

(39774, 30)

In [31]:
# Run the same 
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=len(set(region_labels))*2,
    max_iter=100,
    n_init=5,
)

fit_and_evaluate(kmeans, one_hot_representation, region_labels)

clustering done in 0.26 ± 0.03 s 
Homogeneity: 0.036 ± 0.004
Completeness: 0.024 ± 0.002
V-measure: 0.029 ± 0.003
Adjusted Rand-Index: 0.013 ± 0.001
Silhouette Coefficient: 0.456 ± 0.016
