In [16]:
import gensim as gensim
import networkx
import numpy as np  
import os
import pandas as pd  
import glob
import json
import gc
import random
import re
import nltk
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
stop_words = nltk.corpus.stopwords.words('english')
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from nltk import flatten
from pickle import load
from pickle import dump

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# Document normalize function
def normalize_document(txt):
    txt = txt.lower()
    txt = txt.strip()
    tokens = nltk.word_tokenize(txt)
    clean_tokens = [t for t in tokens if t not in stop_words]
    wordnet_lem = [WordNetLemmatizer().lemmatize(w) for w in clean_tokens]
    stems = [nltk.stem.SnowballStemmer('english').stem(w) for w in wordnet_lem]
    return ' '.join(stems)

In [0]:
# Returning dataframe with normalized data
def normalizedf(df_ini):
  df_ini = df_ini
  x = []
  for i in df_ini['ingredients']:
      i = ' '.join(i)
      #print(i)
      x.append(normalize_document(i))
  df = df_ini
  df["normalized_ing"] = x
  return df

In [0]:
# Converting into vector form
def tfidfvecinput(input,df):
  input = input
  joined_input = []
  input = ','.join(input)
  joined_input.append(input)
  input_normalized = []
  input_normalized.append(normalize_document(joined_input[0]))
  y = []
  for every in df["normalized_ing"]:
      y.append(every)
  y.insert(0,input_normalized[0])
  vectorizer = TfidfVectorizer(stop_words='english')
  tfidf_matrix_train = vectorizer.fit_transform(y).todense()
  return tfidf_matrix_train

In [0]:
# Finding cosine similarity score with respect to input
# Reference :- http://carrefax.com/new-blog/2017/7/4/cosine-similarity
def cosinesim(mat,df):
  simi = cosine_similarity(mat[0:1], mat).flatten()
  #print(len(simi[1:]))
  #print(len(df))
  df["score"] = simi[1:]
  return df

In [0]:
# Getting Top N ID's that are related to our input
def getcloseid(df,N):
  final_df = df.sort_values(by=["score"],ascending=False)
  result = final_df[["id","score"]].head(N)
  return result

In [0]:
# Saving model
def savemodel(df):
  vectorizer = TfidfVectorizer(stop_words='english')
  tfidf_matrix = vectorizer.fit_transform(df["normalized_ing"]).todense()
  cuisins = df["cuisine"]
  lb = LabelEncoder()
  Y = lb.fit_transform(df["cuisine"])
  X = tfidf_matrix
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
  clf = RandomForestClassifier()
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  accuracy = accuracy_score(y_test,y_pred)*100  
  dump(clf, open('Rfcmodel.pkl', 'wb'))
  print(accuracy)
  return accuracy

In [0]:
# Predicting cuisine based upon the input ingredients
def predictcuisine(input,model,df):
  joined_input = []
  input = ','.join(input)
  joined_input.append(input)
  joi = []
  joi.append(normalize_document(joined_input[0]))
  vectorizer = TfidfVectorizer(stop_words='english')
  tfidf_matrix = vectorizer.fit_transform(df["normalized_ing"]).todense()
  ini = vectorizer.transform([joi[0]]).todense()
  y_predin = model.predict(ini)
  lb = LabelEncoder()
  Y = lb.fit_transform(df["cuisine"])
  return lb.inverse_transform(y_predin)
 

## ***Execution*** 


In [25]:

input= ["coriander powder","ground turmeric","red pepper flakes","japanese eggplants","plums","grated parmesan cheese","fresh parsley","tomatoes with juice"]
df_ini = pd.read_json('yummly.json')
norm_df = normalizedf(df_ini)
mat = tfidfvecinput(input,norm_df)
updated_df = cosinesim(mat,norm_df)
N = 40
if not os.path.exists('Rfcmodel.pkl'):
  accu = savemodel(updated_df)
else:
  print("model is already executed with",accu)
model = load(open('Rfcmodel.pkl', 'rb'))
print("Top10 ID's for given input")
print(getcloseid(updated_df,N))
print("Cuisine for given input",predictcuisine(input,model,df_ini))

model is already executed with 75.14770584538026
Top10 ID's for given input
          id     score
26586  39414  0.527524
25495  15840  0.507473
25454  22654  0.492945
34394  36213  0.484154
26117  24176  0.461936
14124  14472  0.460664
25018   1500  0.459376
3214   40638  0.446175
30326  17469  0.433155
19777  46787  0.429784
13985  37784  0.420847
21633  14661  0.419719
32624  22292  0.418027
35776  26639  0.414957
9290    1131  0.411977
34723  35183  0.410843
15372  20021  0.407432
37774  18966  0.404435
9351   35415  0.401782
32712  35985  0.399665
39704   1923  0.399622
38668  43074  0.399397
38808  12592  0.397190
1444   20429  0.396928
19909  18122  0.394279
5907   48958  0.392626
9479     740  0.391821
23027   6974  0.391534
39691  33890  0.391122
2022   40370  0.389360
9169   20096  0.388881
36142  11114  0.386660
16720  29324  0.381516
14816  37172  0.378523
32961  20125  0.377804
13     41995  0.376809
38885  21193  0.376413
32890  19138  0.376034
37048  14833  0.373698
2856