# Exploratory Data Analysis of Epicurious Scrape in a JSON file

This is an idealized workflow for how Aaron Chen approaches data science problems. It likely isn't the best path, and he has not rigidly applied or stuck to this ideal, but he wishes that he worked this way more frequently.

## Purpose: Work through some exploratory data analysis of the Epicurious scrape on stream. Try to write some functions to help process the data.

### Author: Aaron Chen


---

### If needed, run shell commands here

In [1]:
# !python -m spacy download en_core_web_sm
# !python -c "import tkinter"

---

## External Resources

List out references or documentation that has helped you with this notebook

### Code
Regex Checker: https://regex101.com/

#### Scikit-learn
1. https://scikit-learn.org/stable/modules/decomposition.html#latent-dirichlet-allocation-lda
2. 

### Data

For this notebook, the data is stored in the repo base folder/data/raw

### Process

Are there steps or tutorials you are following? Those are things I try to list in Process

___

## Import necessary libraries

In [2]:
from adjustText import adjust_text
from bokeh import palettes
from bokeh.models import ColumnDataSource, HoverTool, Label, LabelSet
from bokeh.plotting import figure, output_file, save, show
from bokeh.io import output_notebook
from datetime import datetime
from joblib import dump, load
import matplotlib.pyplot as plt
import matplotlib.text as mlt
import numpy as np
from openTSNE import TSNE
import pandas as pd
from sklearn import tree
from sklearn.base import TransformerMixin
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import spacy
from tkinter import N
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm
from turtle import color
from typing import Any
# import umap



---

## Define helper functions

My workflow is to try things with code cells, then when the code cells get messy and repetitive, to convert into helper functions that can be called.

When helper functions are used a lot, it is usually better to convert them into scripts or classes that can be imported and called or instantiated.

In [3]:
# def plot_top_words(model, feature_names, n_top_words, title):
#     fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
#     axes = axes.flatten()
#     for topic_idx, topic in enumerate(model.components_):
#         top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
#         top_features = [feature_names[i] for i in top_features_ind]
#         weights = topic[top_features_ind]

#         ax = axes[topic_idx]
#         ax.barh(top_features, weights, height=0.7)
#         ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
#         ax.invert_yaxis()
#         ax.tick_params(axis="both", which="major", labelsize=20)
#         for i in "top right left".split():
#             ax.spines[i].set_visible(False)
#         fig.suptitle(title, fontsize=40)

In [4]:
# def concat_matrices_to_df(df, vectorized_ingred_matrix, cv):
#     """This function takes in a dataframe and concats the matrix generated by either CountVectorizer or TFIDF-Transformer onto the records so that the recipes can be used for classification purposes.

#     Args: 
#         df: preprocessed dataframe from preprocess_dataframe
#         vectorized_ingred_matrix: sparse csr matrix created from doing fit_transform on the recipe_megalist
     
#     Returns:
#         A pandas dataframe with the vectorized_ingred_matrix appended as columns to df
#     """
#     repo_tfidf_df = pd.DataFrame(vectorized_ingred_matrix.toarray(), columns=cv.get_feature_names_out(), index=df.index)
#     return pd.concat([df, repo_tfidf_df], axis=1)

In [5]:
# def plot_3d(points, points_color, title):
#     x, y, z = points.T

#     fig, ax = plt.subplots(
#         figsize=(6, 6),
#         facecolor="white",
#         tight_layout=True,
#         subplot_kw={"projection": "3d"},
#     )
#     fig.suptitle(title, size=16)
#     col = ax.scatter(x, y, z, c=points_color, s=50, alpha=0.8)
#     ax.view_init(azim=-60, elev=9)
#     ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
#     ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
#     ax.zaxis.set_major_locator(ticker.MultipleLocator(1))

#     fig.colorbar(col, ax=ax, orientation="horizontal", shrink=0.6, aspect=60, pad=0.01)
#     plt.show()

In [6]:
# def add_2d_scatter(ax, points, points_color, title=None):
#     x, y = points.T
#     ax.scatter(x, y, c=points_color, s=50, alpha=0.8)
#     ax.set_title(title)
#     ax.xaxis.set_major_formatter(ticker.NullFormatter())
#     ax.yaxis.set_major_formatter(ticker.NullFormatter())

In [7]:
# def plot_2d(points, points_color, title):
#     fig, ax = plt.subplots(figsize=(3, 3), facecolor="white", constrained_layout=True)
#     fig.suptitle(title, size=16)
#     add_2d_scatter(ax, points, points_color)
#     plt.show()

### Import local script

I have started grouping local script imports with the library imports, placing them at the bottom of the import list.

In [8]:
import project_path

import src.dataframe_preprocessor as dfpp
import src.nlp_processor as nlp_proc

---

## Define global variables 
### Remember to refactor these out, not ideal

In [9]:
output_notebook()

In [10]:
# Every cached artifact for this run is read from one dated joblib folder.
# The base path ends with '/', so file names are appended to it directly.
joblib_basepath = '../../joblib/2022.08.23/'

cv_path = f'{joblib_basepath}countvec.joblib'
tfidf_path = f'{joblib_basepath}tfidf.joblib'
full_df_path = f'{joblib_basepath}recipes_with_cv.joblib'
reduced_df_path = f'{joblib_basepath}reduced_df.joblib'
rfc_path = f'{joblib_basepath}rfc_clf.joblib'
X_path = f'{joblib_basepath}X.joblib'
y_path = f'{joblib_basepath}y.joblib'
truncSVD_path = f'{joblib_basepath}truncSVD.joblib'
truncSVD_transformed_path = f'{joblib_basepath}truncSVD_transformed.joblib'
svd_numpy_path = f'{joblib_basepath}SVD_numpy.joblib'
to_plot_path = f'{joblib_basepath}to_plot.joblib'
tsne_path = f'{joblib_basepath}tsne.joblib'
tsne_vis_path = f'{joblib_basepath}tsne_vis.joblib'
kmeans_path = f'{joblib_basepath}kmeans.joblib'

---

## Running Commentary

1. I used numbered lists to keep track of things I noticed

### To Do

1. Try to determine consistency of nested data structures
   1. Is the photoData or number of things inside photoData the same from record to record
   2. What about for tag?

Data wasn't fully consistent but logic in helper function helped handle nulls

2. How to handle nulls?
   1. Author      Filled in with "Missing Author"
   2. Tag         Filled in with "Missing Cuisine"
3. ~~Convert pubDate to actual timestamp~~  
4. ~~Convert ScrapeDate to actual timestamp~~
   1. This was ignored as the datestamp was not useful (generally within minutes of the origin of UNIX time)
   
**5. Append new columns for relevant nested structures and unfold them**

6. Determine actual types of `ingredients` and `prepSteps`
7. Continue working through test example of single recipe to feed into spaCy and then sklearn.feature_extraction.text stack
8. Will need to remove numbers, punctuation

---

## Importing and viewing the data as a dataframe

In [11]:
# Restore every cached object from its joblib file.
cv = load(cv_path)
tfidf = load(tfidf_path)

# Both recipe frames are re-indexed by their 'id' column (the column itself
# is dropped), so later cells address recipes through the index.
recipes_with_cv = load(full_df_path).set_index('id', drop=True)
reduced_df = load(reduced_df_path).set_index('id', drop=True)

rfc_clf = load(rfc_path)
X = load(X_path)
y = load(y_path)

truncSVD = load(truncSVD_path)
X_train_svdTransform = load(truncSVD_transformed_path)
transformed_np = load(svd_numpy_path)

to_plot_tsne = load(to_plot_path)
t_sne = load(tsne_path)
vis_t_sne = load(tsne_vis_path)
kmeans_12 = load(kmeans_path)

In [12]:
# Split X and y into train/test partitions. The split is stratified on y
# (stratify=y) and the shuffle is pinned with random_state=240.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=240, stratify=y)

In [13]:
recipes_with_cv

Unnamed: 0_level_0,dek,hed,aggregateRating,ingredients,prepSteps,reviewsCount,willMakeAgainPct,cuisine_name,photo_filename,photo_credit,...,zest pith,zest vegetable,zinfandel,ziti,zucchini,zucchini blossom,zucchini crookneck,zucchini squash,árbol,árbol pepper
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54a2b6b019925f464b373351,How does fried chicken achieve No. 1 status? B...,Pickle-Brined Fried Chicken,3.11,"[1 tablespoons yellow mustard seeds, 1 tablesp...",[Toast mustard and coriander seeds in a dry me...,7,100,Missing Cuisine,51247610_fried-chicken_1x1.jpg,Michael Graydon and Nikole Herriott,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408a019925f464b3733bc,Spinaci all'Ebraica,Spinach Jewish Style,3.22,"[3 pounds small-leaved bulk spinach, Salt, 1/2...",[Remove the stems and roots from the spinach. ...,5,80,Italian,EP_12162015_placeholders_rustic.jpg,"Photo by Chelsea Kyle, Prop Styling by Anna St...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408a26529d92b2c003631,"This majestic, moist, and richly spiced honey ...",New Year’s Honey Cake,3.62,"[3 1/2 cups all-purpose flour, 1 tablespoon ba...",[I like this cake best baked in a 9-inch angel...,105,88,Kosher,EP_09022015_honeycake-2.jpg,"Photo by Chelsea Kyle, Food Styling by Anna St...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408a66529d92b2c003638,The idea for this sandwich came to me when my ...,The B.L.A.Bagel with Lox and Avocado,4.00,"[1 small ripe avocado, preferably Hass (see No...","[A short time before serving, mash avocado and...",7,100,Kosher,EP_12162015_placeholders_casual.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a408a719925f464b3733cc,"In 1930, Simon Agranat, the chief justice of t...",Shakshuka a la Doktor Shakshuka,2.71,"[2 pounds fresh tomatoes, unpeeled and cut in ...","[1. Place the tomatoes, garlic, salt, paprika,...",7,83,Kosher,EP_12162015_placeholders_formal.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59541a31bff3052847ae2107,Buttering the bread before you waffle it ensur...,Waffled Ham and Cheese Melt with Maple Butter,0.00,"[1 tablespoon unsalted butter, at room tempera...","[Preheat the waffle iron on low., Spread a thi...",0,0,Missing Cuisine,waffle-ham-and-cheese-melt-062817.jpg,"Photo by Maes Studio, Inc.",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5954233ad52ca90dc28200e7,"Spread this easy compound butter on waffles, p...",Maple Butter,0.00,"[8 tablespoons (1 stick) salted butter, at roo...",[Combine the ingredients in a medium-size bowl...,0,0,Missing Cuisine,EP_12162015_placeholders_bright.jpg,"Photo by Chelsea Kyle, Prop Styling by Anna St...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
595424c2109c972493636f83,Leftover mac and cheese is not exactly one of ...,Waffled Macaroni and Cheese,0.00,"[3 tablespoons unsalted butter, plus more for ...",[Preheat the oven to 375°F. Butter a 9x5-inch ...,0,0,Missing Cuisine,waffle-mac-n-cheese-062816.jpg,"Photo by Maes Studio, Inc.",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5956638625dc3d1d829b7166,A classic Mexican beer cocktail you can sip al...,Classic Michelada,0.00,"[Coarse salt, 2 lime wedges, 2 ounces tomato j...",[Place about 1/4 cup salt on a small plate. Ru...,0,0,Missing Cuisine,Classic Michelada 07292017.jpg,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Columns removed from recipes_with_cv to form `sparse`; every other column
# is carried over unchanged.
non_feature_columns = [
    'dek',
    'hed',
    'aggregateRating',
    'ingredients',
    'prepSteps',
    'reviewsCount',
    'willMakeAgainPct',
    'cuisine_name',
    'photo_filename',
    'photo_credit',
    'author_name',
    'date_published',
    'recipe_url',
]
sparse = recipes_with_cv.drop(columns=non_feature_columns)
# No set_index call is needed here: recipes_with_cv is already indexed by 'id'
# when it is loaded, and drop() keeps that index.

In [15]:
def _top_five_positions(row):
    """Return the positions of the last five entries of the row's argsort.

    argsort orders ascending, so these are the positions of the five largest
    values, with the largest one last.
    """
    ascending_order = row.argsort()
    return ascending_order[-5:].values.tolist()


# One list of five column positions per recipe row.
important_ingreds_indices = sparse.apply(_top_five_positions, axis=1)

In [16]:
important_ingreds_indices

id
54a2b6b019925f464b373351     [1240, 704, 1976, 1980, 2684]
54a408a019925f464b3733bc    [2881, 2329, 2697, 2037, 2330]
54a408a26529d92b2c003631    [2959, 3027, 2579, 1557, 3280]
54a408a66529d92b2c003638      [2596, 133, 1519, 167, 1781]
54a408a719925f464b3733cc     [2175, 860, 3065, 1358, 3055]
                                         ...              
59541a31bff3052847ae2107    [1466, 1281, 1282, 3236, 2904]
5954233ad52ca90dc28200e7     [1111, 469, 2981, 1821, 1824]
595424c2109c972493636f83      [2180, 3236, 2904, 638, 648]
5956638625dc3d1d829b7166    [3306, 3307, 2633, 1746, 1679]
59566daa25dc3d1d829b7169     [2560, 256, 2847, 1441, 1440]
Length: 34656, dtype: object

In [17]:
# Translate each recipe's list of column positions into the matching column
# labels of `sparse`.
#
# The labels are read straight from sparse.columns rather than materialising
# every row with sparse.loc[idx] first: the result is the same (a row's index
# is exactly sparse.columns), but this skips one label lookup and one row copy
# per recipe and no longer depends on the ids being unique.
# important_ingreds_indices was produced by a row-wise apply over `sparse`, so
# iterating it visits the recipes in the same order as sparse.index.
feature_names = sparse.columns
important_ingredients = pd.DataFrame(
    data={
        'important_ingredients': [
            feature_names[positions].tolist()
            for positions in important_ingreds_indices
        ]
    },
    index=important_ingreds_indices.index,
)

In [18]:
important_ingredients

Unnamed: 0_level_0,important_ingredients
id,Unnamed: 1_level_1
54a2b6b019925f464b373351,"[flaky salt, chicken vegetable, mustard, musta..."
54a408a019925f464b3733bc,"[spinach, pine, seedless, nut, pine nut]"
54a408a26529d92b2c003631,"[sugar sugar, tea, rye, honey sugar, whisky]"
54a408a66529d92b2c003638,"[salmon, avocado, hass, bagel, lox]"
54a408a719925f464b3733cc,"[paprika, clove garlic, tomato paste, garlic s..."
...,...
59541a31bff3052847ae2107,"[gruyère cheese, forest, forest ham, waffle, s..."
5954233ad52ca90dc28200e7,"[dijon mustard, butter, syrup, maple, maple sy..."
595424c2109c972493636f83,"[parmesan, waffle, standard, cheese, cheese pa..."
5956638625dc3d1d829b7166,"[worcestershire, worcestershire sauce, sauce, ..."


In [19]:
to_plot_tsne

Unnamed: 0_level_0,cuisine_name,x,y,cuisine_id_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
54a45bfb6529d92b2c023f25,French,-1.214609,-7.310046,4391.0
54a4638719925f464b395c16,Kosher,0.168884,19.857890,2055.0
54a441126529d92b2c01b5f2,Asian,-11.234784,-4.876577,5991.0
54a409bb19925f464b37380a,American,1.254743,-16.597135,6943.0
54a42e1019925f464b3818d4,Southwestern,-7.332714,1.062514,134.0
...,...,...,...,...
54a436266529d92b2c018767,Italian,1.654323,6.660523,2388.0
54a466e16529d92b2c026f67,French,0.585036,-12.183883,4391.0
54a451cd6529d92b2c01eefd,American,1.435304,-19.972876,6943.0
54a44e026529d92b2c01d6dd,American,-0.097224,13.647158,6943.0


In [20]:
# Remove the cuisine_id_num column from the plotting frame.
to_plot_tsne = to_plot_tsne.drop(columns=['cuisine_id_num'])

In [21]:
# Inner-join the per-recipe ingredient lists onto the plotting frame by index,
# keeping only the index values that are present in both frames.
to_plot_tsne = to_plot_tsne.join(important_ingredients, how='inner')

In [22]:
to_plot_tsne

Unnamed: 0_level_0,cuisine_name,x,y,important_ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
54a45bfb6529d92b2c023f25,French,-1.214609,-7.310046,"[onion clove, crouton, beurre, manié, beurre m..."
54a4638719925f464b395c16,Kosher,0.168884,19.857890,"[bittersweet, bittersweet chocolate, brandy, v..."
54a441126529d92b2c01b5f2,Asian,-11.234784,-4.876577,"[serrano, root, cilantro lime, shallot root, r..."
54a409bb19925f464b37380a,American,1.254743,-16.597135,"[salt chicken broth, sage, turkey, root, shall..."
54a42e1019925f464b3818d4,Southwestern,-7.332714,1.062514,"[baby, chili, paprika, barbecue, barbecue sauce]"
...,...,...,...,...
54a436266529d92b2c018767,Italian,1.654323,6.660523,"[mascarpone, mascarpone cheese, butter vegetab..."
54a466e16529d92b2c026f67,French,0.585036,-12.183883,"[roquefort, brie, roquefort cheese, vegetable ..."
54a451cd6529d92b2c01eefd,American,1.435304,-19.972876,"[spinach, baby spinach, tortilla, cilantro lim..."
54a44e026529d92b2c01d6dd,American,-0.097224,13.647158,"[egg water, cream vanilla, almond extract, mil..."


In [None]:
# Reproducible random subset of 200 rows from the plotting frame.
random_200 = to_plot_tsne.sample(n=200, random_state=313)

# kmeans_12 = KMeans(n_clusters=12, random_state=30, verbose=50).fit(random_200.drop(['cuisine_name', 'cuisine_id_num'], axis=1))

# Mesh step size: a smaller step gives a finer grid of points to label.
h = 0.02

# The mesh covers the sampled points plus a margin of 1 on every side.
margin = 1
sample_x, sample_y = random_200['x'], random_200['y']
x_min, x_max = sample_x.min() - margin, sample_x.max() + margin
y_min, y_max = sample_y.min() - margin, sample_y.max() + margin
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Ask the loaded KMeans model for the cluster label of every mesh point, then
# fold the flat label vector back into the shape of the mesh.
mesh_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = kmeans_12.predict(mesh_points).reshape(xx.shape)

centroids = kmeans_12.cluster_centers_

Possible next steps: add the cluster colors determined by Z above to random_200, color the KMeans centroids to match, check whether Bokeh's PolyAnnotation could work for the cluster regions, add labels, and add a hover tooltip showing the ingredient vectors.

In [None]:
# Bokeh data sources: the 200 sampled recipes and the KMeans centroids.
kebab = ColumnDataSource(random_200)
centroids_cds = ColumnDataSource(pd.DataFrame(data=centroids, columns=['x', 'y']))

# Hover shows the cuisine label and the top ingredient terms of a recipe.
HOVER_TOOLTIPS = [
    ('Cuisine', '@cuisine_name'),
    ('Ingredients', '@important_ingredients'),
]

p = figure(title='KMeans, tSNE, Bokeh', tooltips=HOVER_TOOLTIPS)
r = p.dot(x='x', y='y', size=15, source=kebab, color='black')

# Only the recipe dots respond to hover; the centroid markers and the
# background image have no tooltip columns.
p.hover.renderers = [r]

p.square_pin(centroids_cds.data['x'], centroids_cds.data['y'], size=20, color='white', fill_color=None, line_width=4)

# Background image of the cluster label at every mesh point. It is anchored at
# the mesh's lower-left corner (xx.min(), yy.min()) and spans the mesh's own
# width and height. The previous version used xx for the y anchor and for part
# of the height, which misplaced and mis-scaled the image vertically.
p.image(
    image=[Z],
    x=xx.min(),
    y=yy.min(),
    dw=xx.max() - xx.min(),
    dh=yy.max() - yy.min(),
    palette="Category20_20",
    level="image",
)

# Optional text labels, from
# https://docs.bokeh.org/en/latest/docs/user_guide/annotations.html#userguide-annotations
# labels = LabelSet(x='x', y='y', text='cuisine_name', source=kebab)
# p.add_layout(labels)

# Optional: write the figure to a standalone HTML file instead of showing it.
# output_file(filename="KMeans on tSNE in Bokeh, 200 recipes, 12 clusters.html", title="KMeans on tSNE in Bokeh, 200 recipes, 12 clusters")
# save(p)

show(p)

In [23]:
# Same figure as the 200-recipe version, but built from every recipe in
# to_plot_tsne and saved to HTML rather than shown inline.
all_kebab = ColumnDataSource(to_plot_tsne)

# Mesh step size: a smaller step gives a finer grid of points to label.
h = 0.02

# The mesh covers all plotted points plus a margin of 1 on every side.
x_min, x_max = to_plot_tsne['x'].min() - 1, to_plot_tsne['x'].max() + 1
y_min, y_max = to_plot_tsne['y'].min() - 1, to_plot_tsne['y'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Cluster label of every mesh point from the loaded KMeans model, reshaped
# back into the mesh layout.
Z = kmeans_12.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
centroids = kmeans_12.cluster_centers_

centroids_cds = ColumnDataSource(pd.DataFrame(data=centroids, columns=['x', 'y']))

# Hover shows the cuisine label and the top ingredient terms of a recipe.
HOVER_TOOLTIPS = [
    ('Cuisine', '@cuisine_name'),
    ('Ingredients', '@important_ingredients'),
]

ppp = figure(title='KMeans, tSNE, Bokeh', tooltips=HOVER_TOOLTIPS)
r_whole = ppp.dot(x='x', y='y', size=15, source=all_kebab, color='black')

# Only the recipe dots respond to hover.
ppp.hover.renderers = [r_whole]

ppp.square_pin(centroids_cds.data['x'], centroids_cds.data['y'], size=20, color='white', fill_color=None, line_width=4)

# Background image anchored at the mesh's lower-left corner and spanning the
# mesh's own width and height. The previous version used xx for the y anchor
# and for part of the height, which misplaced and mis-scaled the image.
ppp.image(
    image=[Z],
    x=xx.min(),
    y=yy.min(),
    dw=xx.max() - xx.min(),
    dh=yy.max() - yy.min(),
    palette="Category20_20",
    level="image",
)

# Optional text labels, from
# https://docs.bokeh.org/en/latest/docs/user_guide/annotations.html#userguide-annotations
# labels = LabelSet(x='x', y='y', text='cuisine_name', source=all_kebab)
# ppp.add_layout(labels)

# The HTML title now matches the file name: this plot holds all recipes, not
# the 200-recipe sample.
output_file(filename="KMeans on tSNE in Bokeh, all recipes, 12 clusters.html", title="KMeans on tSNE in Bokeh, all recipes, 12 clusters")

save(ppp)

# show(ppp)



'/home/awchen/Repos/Projects/MeaLeon/notebooks/exploratory/KMeans on tSNE in Bokeh, all recipes, 12 clusters.html'

We can't usefully plot all points at once: there are too many, too close together, to get value or meaning out of the plot. Its value is already somewhat obscured because two large dimension reductions were needed to make the plot work at all.

In [None]:
# NOTE(review): apply() is called without axis=1, so the lambda receives one
# column at a time rather than one recipe row, while important_ingreds_indices
# holds one list of column positions per recipe. This looks like an abandoned
# attempt superseded by the list-comprehension cell earlier in the notebook;
# confirm before running.
important_ingredients = sparse.apply(lambda x: x.iloc[important_ingreds_indices])

In [None]:
# For every recipe, pull out the entries of its row that sit at that recipe's
# top-ingredient column positions.
#
# Rows are addressed by position (enumerate) instead of by the values of
# sparse.index: those values are recipe-id strings once the frame is indexed
# by id, and .iloc only accepts integer positions. Positional access behaves
# the same whether `sparse` carries the id index or a default RangeIndex.
# important_ingreds_indices comes from a row-wise apply over `sparse`, so
# iterating it visits the recipes in row order.
#
# NOTE: this rebinds important_ingredients to a plain list of Series, replacing
# any DataFrame of the same name created by an earlier cell.
important_ingredients = [
    sparse.iloc[row_position].iloc[column_positions]
    for row_position, column_positions in enumerate(important_ingreds_indices)
]

The following blocks only work when sparse's index is set to recipes_with_cv's id column

In [None]:
for i in sparse.index[0:5]: print(important_ingreds_indices.iloc[i])

In [None]:
sparse.index

In [None]:
for i in sparse.index[0:5]: print(list(sparse.columns)[i])

In [None]:
sparse.nlargest(5, columns=sparse.index, keep='all')

In [None]:
sparse.columns.tolist()

In [None]:
important_ingredients = sparse.apply(lambda x: pd.DataFrame(x).nlargest(5, columns=sparse.columns.tolist(), keep='all'))

In [None]:
sparse.loc['54a408a66529d92b2c003638']

In [None]:
sparse.loc['54a408a66529d92b2c003638'].argsort()[-5:][::-1]

In [None]:
type(sparse.loc['54a408a66529d92b2c003638'].argsort()[-5:][::-1])

In [None]:
sparse.loc['54a408a66529d92b2c003638'].argsort()[-5:][::-1].index.tolist()

In [None]:
sparse.loc['54a408a66529d92b2c003638'].loc['árbol']

In [None]:
sparse.loc['54a408a66529d92b2c003638']

In [None]:
sparse.loc['54a408a66529d92b2c003638'].argsort()

In [None]:
# 'id' is the index of recipes_with_cv (set, with drop=True, when the frame is
# loaded), so there is no 'id' column to compare against. Select the recipe by
# index label instead; passing a list keeps the result a DataFrame.
recipes_with_cv.loc[['54a408a66529d92b2c003638']]

In [None]:
sparse.loc['54a408a66529d92b2c003638'][sparse.loc['54a408a66529d92b2c003638'].nonzero()]

In [None]:
sparse.loc['54a408a66529d92b2c003638'].to_numpy().nonzero()[0].tolist()

In [None]:
sparse.loc['54a408a66529d92b2c003638'].iloc[[133, 167, 562, 1519, 1712, 1781, 2085, 2273, 2596, 2603, 2614, 3055]]

In [None]:
sparse.loc['54a408a66529d92b2c003638'].argsort()[-5:].values.tolist()

In [None]:
sparse.loc['54a408a66529d92b2c003638'].argmax()

In [None]:
sparse.loc['54a408a66529d92b2c003638'].iloc[[2596, 133, 1519, 167, 1781]]

These three cells may not work, no surprise

In [None]:
recipes_with_cv['important_ingreds_indices'] = recipes_with_cv['id'].apply(lambda x: sparse.loc[x].argsort()[-5:].values.tolist())

In [None]:
recipes_with_cv['important_ingreds_indices']

In [None]:
recipes_with_cv['important_ingreds'] = recipes_with_cv.apply(lambda x: sparse.loc[x['id']].iloc[x['important_ingreds_indices']], axis=1)

In [None]:
sparse.shape

In [None]:
sparse

In [None]:
sparse['important_ingreds_indices']

In [None]:
sparse['important_ingreds_indices'][sparse['important_ingreds_indices'].List.contains(-1) == False]

In [None]:
sparse.iloc[0].iloc[[704, 1976, 1980, 2684, -1]]

In [None]:
recipes_with_cv.iloc[0]['ingredients']

In [None]:
# 'id' is the index of recipes_with_cv (set, with drop=True, when the frame is
# loaded), so there is no 'id' column to compare against. Select the recipe by
# index label instead; passing a list keeps the result a DataFrame.
recipes_with_cv.loc[['54a408a66529d92b2c003638']]

In [None]:
sparse.iloc[3]

In [None]:
sparse.head()

In [None]:
recipes_with_cv.apply(lambda x: x['important_ingreds_indices'], axis=1)

In [None]:
recipes_with_cv.apply(lambda x: sparse.loc[x['id']], axis=1)

In [None]:
type(recipes_with_cv['id'])

In [None]:
print(kebab)

In [None]:
kebab.data['x']

In [None]:
kebab.selected

Add back some ingredients from the sparse word vectors, say the top 5-10 words based on tfidf score

Also display the cuisine label

Based on the answer here https://stackoverflow.com/questions/70027225/tooltips-hover-over-shows-python-bokeh

In [None]:
# this is matplotlib
# Static matplotlib scatter of the t-SNE coordinates, coloured by cuisine id.
# NOTE(review): this colours by the 'cuisine_id_num' column, which an earlier
# cell drops from to_plot_tsne in place — presumably this cell was run before
# that drop; confirm the column exists before re-running.
plt.style.use('ggplot')
to_plot_tsne.plot.scatter(x='x', y='y', c='cuisine_id_num', colormap='tab20', figsize=(30,20), facecolors="#101010");

In [None]:
# Static matplotlib version of the KMeans-on-tSNE figure for the 200 sampled
# recipes: cluster regions as a background image, recipes as small black dots,
# centroids as white crosses, and one cuisine text label per recipe.
plt.figure(num=1, figsize=(25, 15))
plt.clf()

# Background: the cluster label of every mesh point, stretched over the mesh
# extent. origin="lower" puts the first row of Z at the bottom of the axes.
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

plt.plot(random_200['x'], random_200['y'], "k.", markersize=2)

# Plot the centroids as a white X
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)

# One text label per sampled recipe. The values are iterated directly instead
# of being looked up with random_200['x'][i]: random_200 is indexed by recipe
# id, so integer keys only worked through pandas' deprecated positional
# fallback for Series.__getitem__.
Texts = [
    plt.text(x_coord, y_coord, cuisine, ha='center', va='center')
    for x_coord, y_coord, cuisine in zip(
        random_200['x'], random_200['y'], random_200['cuisine_name']
    )
]
# Nudge overlapping labels apart and connect each to its point with an arrow.
adjust_text(Texts, arrowprops=dict(arrowstyle='->', color='red'))

plt.title(
    "K-means clustering on the 200 random recipes after SVD dimension reduction into tSNE\n"
    "Centroids are marked with white cross"
)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# The file name records the cluster count and random_state of the KMeans model.
kmeans_params = kmeans_12.get_params()
plt.savefig(f"{kmeans_params['n_clusters']}_clusters-{kmeans_params['random_state']}_rand-state.png")
plt.show()

In [None]:
# Draw only the cluster-label image, anchored at the origin and sized to the
# mesh's width and height, without recipe points or centroids.
mesh_width = xx.max() - xx.min()
mesh_height = yy.max() - yy.min()

p2 = figure(title='KMeans, tSNE, Bokeh')
p2.image(
    image=[Z],
    x=0,
    y=0,
    dw=mesh_width,
    dh=mesh_height,
    palette="Purples256",
    level="image",
)

show(p2)

In [None]:
Z

In [None]:
# Synthetic test image: sin(x) * cos(y) sampled on a 250 x 250 grid over
# [0, 10] x [0, 10].
x1, y1 = (np.linspace(0, 10, 250) for _ in range(2))
xx1, yy1 = np.meshgrid(x1, y1)
d = np.multiply(np.sin(xx1), np.cos(yy1))

In [None]:
d

In [None]:
# Render the synthetic sin/cos image `d` to check how Bokeh's image glyph
# lays out a 2-D array.
p = figure(width=400, height=400)
p.x_range.range_padding = p.y_range.range_padding = 0

# Size the image from the grid it was computed on (xx1 / yy1). The previous
# version took the width and height from xx / yy, the t-SNE mesh, which has
# nothing to do with `d` and stretched the image to an unrelated extent.
p.image(
    image=[d],
    x=0,
    y=0,
    dw=xx1.max() - xx1.min(),
    dh=yy1.max() - yy1.min(),
    palette="Purples256",
    level="image",
)
p.grid.grid_line_width = 0.5
show(p)