# Cuisine Style Transfer EDA

This notebook file contains exploratory data analysis for the recipes 
from https://www.kaggle.com/shuyangli94/food-com-recipes-and-user-interactions

Author: Aaron W Chen

---

# Import necessary libraries

In [40]:
import json
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
import string
import ast
import gensim 
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from operator import itemgetter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.manifold import TSNE
from sklearn.manifold.t_sne import (_joint_probabilities,
                                    _kl_divergence)
from sklearn.utils.extmath import _ravel
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

---

# Define functions needed for later work

In [2]:
def _flatten(list_of_lists):
    for x in list_of_lists:
        if hasattr(x, '__iter__') and not isinstance(x, str):
            for y in _flatten(x):
                yield y
        else:
            yield x

In [3]:
def dummy_fun(doc):
    return doc

In [4]:
def tokenizer(doc):
    for sen in doc:
        tokens = [word_tokenize(word) for word in sen]
        flat_tkns = list(_flatten(tokens))
        stopped_tokens = [w.lower() for w in flat_tkns if w not in stopwords_list]
        lemma_tokens = [lemmatizer.lemmatize(token) for token in stopped_tokens]
        token_recipes.append(lemma_tokens)

---

# Import raw recipes csv into a dataframe

In [5]:
raw_file_str = "../food-com-recipes-and-user-interactions/RAW_recipes.csv"
df_file = pd.read_csv(raw_file_str)

In [6]:
df_file.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


# Begin preparation to turn ingredient lists into vectors

In [7]:
# Using some stopwords from https://github.com/AlludedCrabb/sound-tasty
cooking_stop_words = list(set([
        'canned', 'cans', 'drained', 'and', 'halved', 'cup', 'cups',
        'teaspoon', 'tablespoon', 'teaspoons', 'tablespoons',
        'finely', 'freshly', 'fresh', 'thickcut', 'to', 'taste',
        'grated', 'cut', 'into', 'wedges', 'pounds', 'unpeeled', 'large',
        'minced', 'slice', 'slices', 'sliced', 'thick-cut', 'cut',
        'crosswise', 'pieces', 'toothpicks', 'low-fat', 'chopped', 'or',
        'taste', 'cooked', 'dry', 'shredded', 'beaten', 'dried', 'melted',
        'stems', 'removed', 'diced', 'ounce', 'ounces', 'packages',
        'softened', 'such', 'RedHot®', 'RedHot', 'Franks', "Frank's",
        'crumbled', 'Old', 'Bay®', 'Bay', 'pinch', 'for', 'garnish', 'slice',
        'slices', 'needed', 'inch', 'cubes', 'cooking', 'spray', 'ground',
        'rotisserie', 'lowfat', 'as', 'quarteres', 'cloves', 'more', 'can',
        'package', 'frozen', 'thawed', 'packet', 'reducedfat', 'Knorr',
        'container', 'pound', 'peeled', 'deveined', 'seeded', 'ripe',
        'English', 'juiced', 'plus', 'more', 'Hass', 'cubed', 'Mexicanstyle',
        'hearts', 'prepared', 'party', 'pitted', 'mashed',
        'roma', 'optional', 'chunk', 'Hot', 'bunch', 'cleaned', 'box',
        'chickenflavored', 'Golden', 'delicious', 'cored', 'any', 'flavor',
        'flavored', 'whole', 'allpurpose', 'all', 'purpose', 'deep', 'frying',
        'dash', 'packed', 'in', 'French', 'jar', 'small', 'head', 'little',
        'smokie', 'seasoned', 'Boston', 'Bibb', 'leaves', 'lean', 'pickled',
        'Asian', 'dark', 'flaked', 'rolled', 'packed', 'jellied',
        'thirds', 'with', 'attached', 'skewers', 'skinless', 'boneless',
        'half', 'kernels', 'rinsed', 'quart', 'quarts', 'kernel',
        'Italianstyle', 'unpopped', 'lightly', 'coating', 'SAUCE',
        'lengthwise', 'miniature', 'semisweet', 'rinsed', 'round',
        'squeezed', 'stewed', 'raw', 'the', 'liquid', 'reserved', 'medium',
        'instant', 'solid', 'pack', 'refrigerated', 'halves', 'distilled',
        'loaf', 'extra', 'virgin', 'crushed', 'kosher', 'toasted', 'buttery',
        'TM', 'panko', 'Japanese', 'regular', 'bottle', 'bottles', 'thin',
        'peel', 'paper', 'thick', 'circles', 'unbleached',
        'breast', 'breasts', 'wings', 'strips', 'jumbo', 'giant', 'chunks',
        'quickcooking', 'sweetened', 'flakes', 'Ranchstyle', 'snipped',
        'food', 'ROTEL', 'Italian', 'sticks', 'stick', 'crescent', 'thinly',
        'boiled', 'Genoa', 'roasted', 'thin', 'extrasharp', 'pressed',
        'sifted', 'split', 'tips', 'discarded', 'mini', 'deli', 'drain',
        'reserve', 'diameter', 'Greek', 'Thai', 'drops', 'square', 'crusty',
        'American', 'selfrising', 'imitation', 'Wings', 'apart', 'at',
        'joints', 'wing', 'tips', 'discarded', 'parts',
        'tops', 'seperated', 'blend', 'coarsely', 'sweet', 'stalk', 'heads',
        'husked', 'divided', 'pats', 'unsalted', 'active', 'warm', 'sea',
        'separated', 'herb', 'overripe', 'degrees', 'F', 'C', 'room',
        'temperature', 'machine', 'very', 'pint', 'puree', 'coarse',
        'envelopes', 'lukewarm', 'creamstyle', 'unsweetened',
        'lite', 'of', 'chilled', 'freezer', 'cold', 'brushing', 'nonfat',
        'squares', 'tails', 'thigh', 'quarters', 'Masterpiece', 'KC', 'from',
        'El', 'Paso', 'bulk', 'Hunts', 'Roma', 'light', 'fluid', 'lagerstyle',
        'stalks', 'quartered', 'undrained', 'drained', 'Tony', 'Chacheres',
        'lump', 'uncooked', 'cube', 'bits', 'hair', 'angel', 'trimmed',
        'stew', 'spaghetti', 'brisket', 'bitesized', 'matchstick', 'Chobani',
        'unbaked', 'crust', 'torn', 'bonein', 'pounded', 'bitesize',
        'granules', 'boiling', 'yolk', 'coloring', 'pinch', 'a', 'blender',
        'fine', 'which', 'extralarge', 'use', 'will', 'make', 'garnish',
        'barely', 'moistened', 'about', 'right', 'before', 'serving', 'mix',
        
    ]))

In [8]:
unhelpful = list(set(['fresh', 'ripe', 'cracked', 'cooking', 'coarse', 'light', 
             'mild', 'hot', 'minced', 'dark roast', 'unsifted', 'canned', 
             'cans', 'drained', 'halved', 'finely', 'freshly', 'thickcut', 
             'grated', 'cut', 'unpeeled', 'large', 'minced', 'slice', 
             'slices', 'sliced', 'chopped','shredded', 'beaten', 'dried', 
             'melted', 'stems', 'softened', 'packages', 'crumbled', 'ground',
             'low-fat', 'rotisserie', 'lowfat', 'can', 'thawed', 'packet', 
             'reducedfat', 'small', 'pats', 'regular', 'lukewarm', 'mashed', 
             'stalk', 'breast', 'breasts', 'juiced', 'halves', 'extrasharp', 
             'sharp', 'extra sharp', 'frozen', 'raw', 'warm', 'divided', 
             'little', 'squares', 'thinly', 'thick', 'rinsed', 'toasted', 
             'bitesize', 'chunks', 'refrigerated', 'kernel', 'kernels', 
             'jar', 'lengthwise', 'unpeeled', 'cleaned', 'paper', 'melted', 
             'separated', 'seperated', 'deveined', 'party', 'bunch', 'overripe', 
             'boiled', 'chunk', 'container', 'bitesized', 'sweet', 'strips', 
             'sifted', 'roma', 'very', 'undrained', 'stewed', 'thawed', 'lean', 
             'roasted', 'extra', 'lite', 'coarsely', 'pressed', 'square', 
             'jumbo', 'yolk', 'yolks', 'barely', 'pitted', 'cored', 'puree', 
             'cubes', 'angel', 'hair', 'angelhair', 'giant', 'husked', 'chilled', 
             'thigh', 'trimmed', 'thin', 'lightly', 'cubed', 'drops', 'grated', 
             'boneless', 'unsalted', 'pieces', 'skinless', 'pounded', 
             'chickenflavored', 'extralarge', 'medium', 'reserve', 'unbaked', 
             'crushed', 'wings', 'crosswise', 'cold', 'bonein', 'bone in', 
             'squeezed', 'kosher', 'miniature', 'tails', 'quarters', 'attached', 
             'loaf', 'dry', 'more', 'head', 'removed', 'packed', 'hearts', 
             'matchstick', 'unbleached', 'heads', 'stems', 'sea', 'diced', 
             'mini', 'cut', 'unpopped', 'box', 'uncooked', 'freezer', 'stalks', 
             'shredded', 'halved', 'snipped', 'thick-cut', 'split', 'seeded', 
             'sweetened', 'discarded', 'lump', 'boiling', 'whole', 'semisweet', 
             'semi-sweet', 'quartered', 'moistened', 'reserved', 'prepared', 
             'fresh', 'ripe', 'cracked', 'cooking', 'coarse', 'light', 'mild', 
             'hot', 'minced', 'dark roast', 'unsifted', 'quaker', 'raw', 'frozen', 
             'calore-wise', 'ziploc bag', 'real', 'lite', 'crisp', 'decaffeinated', 
             'canned', 'processed', 'cooked', 'unpeeled', ]))

In [9]:
brands = ['rotel', 'absolut', 'betty crocker', 'jello', 'diana', 'ener-g', 
          'del-monte', "hunt's", 'martha', 'goya', 'cracker barrel', 
          'hamburger helper', "mccormick's", 'pepperidge farm', 'knorr', 
          'godiva', 'hidden valley', 'tabasco', 'branston', "kellogg's", 
          'hodgson mill', 'kraft', 'johnsonville', 'jim beam', 'mccormick', 
          'equal', 'jell-o', 'jimmy dean', 'country bob', "smucker's", 
          'toblerone', 'gerber', 'nestle', 'nestl', 'malt-o-meal', 'triscuit', 
          'ragu', 'campbell', 'hormel', 'earth balance', 'pillsbury', 
          "bird's eye", "campbell's", "betty crocker's", 'gold medal', 
          'crystal light', 'milnot', "land o' lakes", 'herb-ox', 'quaker',
          'coffee-mate', 'contadina', 'j&d', 'fantastic foods', 'bacardi', 
          'eckrich', 'little smokies', 'snickers', 'ortega', 'bayou blast', 
          "annie's", 'mrs. dash', 'mori-nu', 'old el paso', 'original supreme',
          'morton', 'nabisco', 'rice-a-roni', 'stolichnaya', "lawry's", 
          'st. germain', "eggland's best", 'club house "lagrille"', 'hostess',
          'giada de laurentiis genovese', '*available in most target stores', 
          'jarlsberg', 'pillsbury plus', 'ro-tel', 'pillsbury grands', 
          'shilling', 'hershey', 'hershey carb alternatives', 'pasta roni', 
          'pastaroni', 'torani', 'v8', 'v8 fusion', 'ghiradelli', 'oscar mayer',
          "bird's", 'smithfield', 'cadbury', 'sun-maid', 'karo', 
          'wishbone deluxe', 'vochelle', 'laughing cow', 'omega', 'stirrings',
          'duncan hines', 'barilla',
         ]

In [10]:
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)
stopwords_list += unhelpful
stopwords_list += brands
lemmatizer = WordNetLemmatizer()

In [11]:
flattened = _flatten(df_file['ingredients'])

In [12]:
big_list = []
for row in df_file['ingredients']:
    big_list.append(ast.literal_eval(row))
    

In [13]:
big_list[:5]

[['winter squash',
  'mexican seasoning',
  'mixed spice',
  'honey',
  'butter',
  'olive oil',
  'salt'],
 ['prepared pizza crust',
  'sausage patty',
  'eggs',
  'milk',
  'salt and pepper',
  'cheese'],
 ['ground beef',
  'yellow onions',
  'diced tomatoes',
  'tomato paste',
  'tomato soup',
  'rotel tomatoes',
  'kidney beans',
  'water',
  'chili powder',
  'ground cumin',
  'salt',
  'lettuce',
  'cheddar cheese'],
 ['spreadable cheese with garlic and herbs',
  'new potatoes',
  'shallots',
  'parsley',
  'tarragon',
  'olive oil',
  'red wine vinegar',
  'salt',
  'pepper',
  'red bell pepper',
  'yellow bell pepper'],
 ['tomato juice',
  'apple cider vinegar',
  'sugar',
  'salt',
  'pepper',
  'clove oil',
  'cinnamon oil',
  'dry mustard']]

In [14]:
tags = []
for row in df_file['tags']:
    tags.append(ast.literal_eval(row))

all_tags = list(_flatten(tags))
all_tags

['60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'side-dishes',
 'vegetables',
 'mexican',
 'easy',
 'fall',
 'holiday-event',
 'vegetarian',
 'winter',
 'dietary',
 'christmas',
 'seasonal',
 'squash',
 '30-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'breakfast',
 'main-dish',
 'pork',
 'american',
 'oven',
 'easy',
 'kid-friendly',
 'pizza',
 'dietary',
 'northeastern-united-states',
 'meat',
 'equipment',
 'time-to-make',
 'course',
 'preparation',
 'main-dish',
 'chili',
 'crock-pot-slow-cooker',
 'dietary',
 'equipment',
 '4-hours-or-less',
 '60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'preparation',
 'occasion',
 'side-dishes',
 'eggs-dairy',
 'potatoes',
 'vegetables',
 'oven',
 'easy',
 'dinner-party',
 'holiday-event',
 'easter',
 'cheese',
 'stove-top',
 'dietary',
 'christmas',
 'new-ye

In [None]:
with open('../write_data/all_tags.txt', 'w') as filehandle:
    for listitem in all_tags:
        filehandle.write('%s\n' % listitem)

In [15]:
token_recipes = []
bigram_recipes = []
trigram_recipes = []
for recipe in big_list:
    tokens = [word_tokenize(word) for word in recipe]
    flat_tkns = list(_flatten(tokens))
    stopped_tokens = [w.lower() for w in flat_tkns if w not in stopwords_list]
    lemma_tokens = [lemmatizer.lemmatize(token) for token in stopped_tokens]
    token_recipes.append(lemma_tokens)
    bigram_recipes.append(list(ngrams(lemma_tokens, 2)))
    trigram_recipes.append(list(ngrams(lemma_tokens, 3)))

In [16]:
token_recipes

[['winter',
  'squash',
  'mexican',
  'seasoning',
  'mixed',
  'spice',
  'honey',
  'butter',
  'olive',
  'oil',
  'salt'],
 ['pizza',
  'crust',
  'sausage',
  'patty',
  'egg',
  'milk',
  'salt',
  'pepper',
  'cheese'],
 ['beef',
  'yellow',
  'onion',
  'tomato',
  'tomato',
  'paste',
  'tomato',
  'soup',
  'tomato',
  'kidney',
  'bean',
  'water',
  'chili',
  'powder',
  'cumin',
  'salt',
  'lettuce',
  'cheddar',
  'cheese'],
 ['spreadable',
  'cheese',
  'garlic',
  'herb',
  'new',
  'potato',
  'shallot',
  'parsley',
  'tarragon',
  'olive',
  'oil',
  'red',
  'wine',
  'vinegar',
  'salt',
  'pepper',
  'red',
  'bell',
  'pepper',
  'yellow',
  'bell',
  'pepper'],
 ['tomato',
  'juice',
  'apple',
  'cider',
  'vinegar',
  'sugar',
  'salt',
  'pepper',
  'clove',
  'oil',
  'cinnamon',
  'oil',
  'mustard'],
 ['milk', 'vanilla', 'ice', 'cream', 'apple', 'juice', 'concentrate', 'apple'],
 ['fennel',
  'seed',
  'green',
  'olive',
  'olive',
  'garlic',
  'peppe

In [17]:
bigram_recipes

[[('winter', 'squash'),
  ('squash', 'mexican'),
  ('mexican', 'seasoning'),
  ('seasoning', 'mixed'),
  ('mixed', 'spice'),
  ('spice', 'honey'),
  ('honey', 'butter'),
  ('butter', 'olive'),
  ('olive', 'oil'),
  ('oil', 'salt')],
 [('pizza', 'crust'),
  ('crust', 'sausage'),
  ('sausage', 'patty'),
  ('patty', 'egg'),
  ('egg', 'milk'),
  ('milk', 'salt'),
  ('salt', 'pepper'),
  ('pepper', 'cheese')],
 [('beef', 'yellow'),
  ('yellow', 'onion'),
  ('onion', 'tomato'),
  ('tomato', 'tomato'),
  ('tomato', 'paste'),
  ('paste', 'tomato'),
  ('tomato', 'soup'),
  ('soup', 'tomato'),
  ('tomato', 'kidney'),
  ('kidney', 'bean'),
  ('bean', 'water'),
  ('water', 'chili'),
  ('chili', 'powder'),
  ('powder', 'cumin'),
  ('cumin', 'salt'),
  ('salt', 'lettuce'),
  ('lettuce', 'cheddar'),
  ('cheddar', 'cheese')],
 [('spreadable', 'cheese'),
  ('cheese', 'garlic'),
  ('garlic', 'herb'),
  ('herb', 'new'),
  ('new', 'potato'),
  ('potato', 'shallot'),
  ('shallot', 'parsley'),
  ('parsley',

In [18]:
trigram_recipes

[[('winter', 'squash', 'mexican'),
  ('squash', 'mexican', 'seasoning'),
  ('mexican', 'seasoning', 'mixed'),
  ('seasoning', 'mixed', 'spice'),
  ('mixed', 'spice', 'honey'),
  ('spice', 'honey', 'butter'),
  ('honey', 'butter', 'olive'),
  ('butter', 'olive', 'oil'),
  ('olive', 'oil', 'salt')],
 [('pizza', 'crust', 'sausage'),
  ('crust', 'sausage', 'patty'),
  ('sausage', 'patty', 'egg'),
  ('patty', 'egg', 'milk'),
  ('egg', 'milk', 'salt'),
  ('milk', 'salt', 'pepper'),
  ('salt', 'pepper', 'cheese')],
 [('beef', 'yellow', 'onion'),
  ('yellow', 'onion', 'tomato'),
  ('onion', 'tomato', 'tomato'),
  ('tomato', 'tomato', 'paste'),
  ('tomato', 'paste', 'tomato'),
  ('paste', 'tomato', 'soup'),
  ('tomato', 'soup', 'tomato'),
  ('soup', 'tomato', 'kidney'),
  ('tomato', 'kidney', 'bean'),
  ('kidney', 'bean', 'water'),
  ('bean', 'water', 'chili'),
  ('water', 'chili', 'powder'),
  ('chili', 'powder', 'cumin'),
  ('powder', 'cumin', 'salt'),
  ('cumin', 'salt', 'lettuce'),
  ('salt

# TFIDF with individual word vectors

In [19]:
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(analyzer='word', 
                                                        tokenizer=dummy_fun, 
                                                        preprocessor=dummy_fun, 
                                                        token_pattern=None, 
                                                        stop_words=stopwords_list, 
                                                        min_df=2,
                                                       )
tfidf.fit(token_recipes)
response = tfidf.transform(token_recipes)

  'stop_words.' % sorted(inconsistent))


In [20]:
nlpskl = pd.DataFrame(response.toarray(), 
                      columns=tfidf.get_feature_names(), 
                      index=df_file['name'])
nlpskl

Unnamed: 0_level_0,'','s,1,10,10-inch,10-minute,100,100-calorie,10x,12-inch,...,zatarians,zero,zest,zesty,zinfandel,zinger,ziploc,ziti,zucchini,zwieback
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
arriba baked winter squash mexican style,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
a bit different breakfast pizza,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
all in the kitchen chili,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
alouette potatoes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
amish tomato ketchup for canning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
apple a day milk shake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
aww marinated olives,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
backyard style barbecued ribs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
bananas 4 ice cream pie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
beat this banana bread,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [21]:
train, test = train_test_split(nlpskl)

In [22]:
kmeans = KMeans(n_clusters=200, max_iter=6000, algorithm = 'auto')
kmeans.fit(train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=6000,
    n_clusters=200, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [24]:
labels = kmeans.predict(train)

In [28]:
labels.shape

(173727,)

In [36]:
labels

array([ 11,  50, 121, ..., 131, 141,  82], dtype=int32)

In [37]:
train['labels'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
train['labels']

name
cherry shredded pork wrap                                 11
low carb stuffed cabbage casserole                        50
easy  grilled  burgers italiano for the bbq              121
california soft tacos                                     58
kesari bhat                                               98
baked whiting with bok choy and vermouth                 107
red river rub                                            144
super easy spicy beef minestrone soup                     11
kentucky slaw                                             50
watermelon salad with feta and mint                       91
chocolate chewy brownies                                 141
extremely tasty hot and spicy tropical barbecue sauce    161
venison meat  canned                                      64
apple cinnamon muffins                                   173
cranberry  fruit coleslaw                                 50
austrian raspberry blueberry shortbread                  158
baby bok choy salad

In [None]:
proj = TSNE(random_state=15).fit_transform(train.drop('labels', axis=1))

# Cosine Similarity

In [None]:
nlpskl.iloc[:5]

In [None]:
print(cosine_similarity(nlpskl.iloc[:5], nlpskl.iloc[:5]))
#print(cosine_similarity(nlpskl.iloc[:5], nlpskl.iloc[5:10]))

# Soft Cosine Similarity

# TFIDF with bigrams instead of individual word tokens

In [None]:
tfidf2 = sklearn.feature_extraction.text.TfidfVectorizer(analyzer='word', 
                                                        tokenizer=dummy_fun, 
                                                        preprocessor=dummy_fun, 
                                                        token_pattern=None)
tfidf2.fit(bigram_recipes)
response2 = tfidf2.transform(bigram_recipes)

In [None]:
nlpskl2 = pd.DataFrame(response2.toarray(), 
                       columns=tfidf2.get_feature_names(), 
                       index=df_file['name'])
nlpskl2

---

# Tokenize the names to see if similarities exist between titles

In [None]:
token_names = []
for row in df_file['name']:
    row_str = str(row)
    tokens_n = word_tokenize(row_str)
    stopped_tokens_n = [w.lower() for w in tokens_n if w not in stopwords_list]
    lemma_tokens_n = [lemmatizer.lemmatize(token) for token in stopped_tokens_n]
    token_names.append(lemma_tokens_n)

In [None]:
token_names

---

In [None]:
wv = Word2Vec()

---

In [None]:
big_list_names = [word_tokenize(row) for row in df_file['name']]


In [None]:
big_list_tokenized = [[lemmatizer.lemmatize(w) for w in token] for token in stopped_tokens]

In [None]:
flattened = _flatten(big_list)

In [None]:
one_list = list(flattened)

In [None]:
one_list

In [None]:
len(one_list)

In [None]:
uniques = set(one_list)

In [None]:
len(list(uniques))

In [None]:
uniques

In [None]:
with open('../write_data/unique_ingred.txt', mode='w', encoding='utf-8') as myfile:
    for ingred in list(uniques):
        myfile.write('%s\n' % ingred)

In [None]:
def process_article(article):
    tokens = nltk.word_tokenize(article)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return stopwords_removed 

In [None]:
ingred_freqdist = FreqDist(one_list)

In [None]:
ingred_freqdist.most_common(200)

In [None]:
ingred_freqdist.most_common()[:500:-1]

In [None]:
def count_vectorize(ingredients, vocab=None):
    if vocab:
        unique_words = vocab
    else:
        unique_words = list(set(ingredients))
    
    ingred_dict = {i:0 for i in unique_words}
    
    for word in ingredients:
        ingred_dict[word] += 1
        
    return ingred_dict

In [None]:
test = count_vectorize(one_list)
test

In [None]:
for key, value in sorted(test.items(), key = itemgetter(1), reverse = True):
    print(key, value)

---

In [None]:
stopped_tokens = []
for item in one_list:
    tokens = word_tokenize(item)
    stopped_tokens.append(w.lower() for w in tokens if w not in stopwords_list)

In [None]:
stopped_tokens

In [None]:
list(stopped_tokens[0])

In [None]:
lemma_tokens = [lemmatizer.lemmatize(w) for token in stopped_tokens for w in token]

In [None]:
lemma_tokens

In [None]:
all_tokens = [_flatten(list(stopped_token)) for stopped_token in stopped_tokens]

In [None]:
all_tokens

In [None]:
tokenized = []

for listed in one_list:
    tokenized.append(simple_preprocess(listed, min_len=2))

In [None]:
tokenized

In [None]:
flat_tokenized = list(_flatten(tokenized))

In [None]:
ingred_tokens_freqdist = FreqDist(flat_tokenized)

In [None]:
ingred_tokens_freqdist.most_common(200)

In [None]:
ingred_tokens_freqdist.most_common()[:200:-1]

In [None]:
flat_tokenized

---

In [None]:
big_list_names = []
for row in df_file['name']:
    big_list_names.append(ast.literal_eval(row))
one_list_names = list(_flatten(big_list))
one_list_names
len(one_list_names)
len(set(one_list_names))

In [None]:
all_names = list(df_file['name'].values)

In [None]:
replace = {'rotel': 'canned',
          'red bell pepper': 'bell_pepper',
          'yellow bell pepper': 'bell_pepper',
          'green bell pepper': 'bell_pepper',
          }

In [None]:
impute = ['vegan', 'vegetarian']

---

# Test cells

In [None]:
test= df_file['ingredients'][1]
test

In [None]:
x = ast.literal_eval(test)
x

In [None]:
x[0]

In [None]:
x2 = [n.strip() for n in x]
x2

In [None]:
list(flattened)

In [None]:
big_list[0]

In [None]:
test2 = _flatten(big_list[0])
list(big_list[0])

In [None]:
tknz = []
for item in big_list[0]:
    tknz.append(word_tokenize(item))
list(_flatten(tknz))

In [None]:
test3 = _flatten(big_list)
list(test3)

In [None]:
testfidf = sklearn.feature_extraction.text.TfidfVectorizer(analyzer='word', 
                                                        token_pattern=None, 
                                                        stop_words=stopwords_list, 
                                                        min_df=2,
                                                       )

In [None]:
testfidf.fit(big_list)