# Tagsplanation
## Loading stuff

In [54]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [3]:
file_name_filtered = "filtered_df"
file_name_recipes = "RAW_recipes"
csv_extension = ".csv"
directory = "datasets/" 

In [4]:
id = 'id'
tags = 'tags'
tags_norm = 'tags_normalized'
tags_vec = 'tags_vectorized'
FILTERED_COLUMNS = ["user_id", "recipe_id", "date", "rating", "review"]
RECIPE_COLUMNS = [id, tags]

In [5]:
df_filtered = pd.read_csv(directory + file_name_filtered + csv_extension)
df_recipe = pd.read_csv(directory + file_name_recipes + csv_extension)
df_filtered = df_filtered[FILTERED_COLUMNS]
df_recipe = df_recipe[RECIPE_COLUMNS]
df_filtered.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
1,124416,120345,2011-08-06,0,"Just an observation, so I will not rate. I fo..."
2,76535,134728,2005-09-02,4,Very good!
3,255338,134728,2008-04-11,5,First time using liquid smoke in a recipe. Mad...
4,136726,197160,2006-11-25,5,I used this mix to make meat balls.Very simple...


In [6]:
df_recipe.head()

Unnamed: 0,id,tags
0,137739,"['60-minutes-or-less', 'time-to-make', 'course..."
1,31490,"['30-minutes-or-less', 'time-to-make', 'course..."
2,112140,"['time-to-make', 'course', 'preparation', 'mai..."
3,59389,"['60-minutes-or-less', 'time-to-make', 'course..."
4,44061,"['weeknight', 'time-to-make', 'course', 'main-..."


## Fixing tags data type and structure

In [7]:
forbidden_characters = [',', '[', ']', ' ', '', '\'']
all_tags = []
df_recipe[tags_norm] = ''
for id, value in df_recipe.iterrows():
    curr_tag = value[tags]
    curr_word = str()
    for curr_char in curr_tag:
        if curr_char not in forbidden_characters:
            curr_word+=curr_char
        else:
            if curr_word != '':
                all_tags.append(curr_word.lower())
                # this method might create an error if you have a different
                # version of pandas than me:
                df_recipe.set_value(id, tags_norm, all_tags)
                curr_word = str()
    all_tags = []

  from ipykernel import kernelapp as app


In [8]:
df_recipe.head()

Unnamed: 0,id,tags,tags_normalized
0,137739,"['60-minutes-or-less', 'time-to-make', 'course...","[60-minutes-or-less, time-to-make, course, mai..."
1,31490,"['30-minutes-or-less', 'time-to-make', 'course...","[30-minutes-or-less, time-to-make, course, mai..."
2,112140,"['time-to-make', 'course', 'preparation', 'mai...","[time-to-make, course, preparation, main-dish,..."
3,59389,"['60-minutes-or-less', 'time-to-make', 'course...","[60-minutes-or-less, time-to-make, course, mai..."
4,44061,"['weeknight', 'time-to-make', 'course', 'main-...","[weeknight, time-to-make, course, main-ingredi..."


## Word2vec

In [9]:
model1 = gensim.models.Word2Vec(df_recipe.tags_normalized, min_count = 1)

In [10]:
model1.wv.most_similar('weeknight')

[('60-minutes-or-less', 0.8781963586807251),
 ('30-minutes-or-less', 0.7874325513839722),
 ('time-to-make', 0.7398199439048767),
 ('lactose', 0.7229251861572266),
 ('course', 0.6985160708427429),
 ('main-ingredient', 0.6447389125823975),
 ('15-minutes-or-less', 0.6116815805435181),
 ('cuisine', 0.604736328125),
 ('preparation', 0.515422523021698),
 ('south-west-pacific', 0.5001165866851807)]

In [11]:
model1.wv.similarity('weeknight', 'celebrity')

0.4921438

In [12]:
all_vec = []
df_recipe[tags_vec] = ''
for id, value in df_recipe.iterrows():
    for tag in value[tags_norm]:
        all_vec.append(model1.wv[tag])
    df_recipe.set_value(id, tags_vec, all_vec)# might generate error, see above
    all_vec = []

  


## Train ML model

In [13]:
df_filtered.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
1,124416,120345,2011-08-06,0,"Just an observation, so I will not rate. I fo..."
2,76535,134728,2005-09-02,4,Very good!
3,255338,134728,2008-04-11,5,First time using liquid smoke in a recipe. Mad...
4,136726,197160,2006-11-25,5,I used this mix to make meat balls.Very simple...


In [14]:
df_recipe.rename({id:'recipe_id'}, axis=1, inplace=True)

In [15]:
print(len(df_recipe))
df_recipe = df_recipe[df_recipe['id'].isin(df_filtered['recipe_id'].to_list())]
print(len(df_recipe))
df_recipe.reset_index(inplace=True, drop=True)
df_recipe.head()

231637
103662


Unnamed: 0,id,tags,tags_normalized,tags_vectorized
0,31490,"['30-minutes-or-less', 'time-to-make', 'course...","[30-minutes-or-less, time-to-make, course, mai...","[[-0.008353826, -2.0104654, -1.60594, 2.132089..."
1,67888,"['weeknight', 'time-to-make', 'course', 'main-...","[weeknight, time-to-make, course, main-ingredi...","[[1.7261446, -2.657304, -2.0291271, 0.72138834..."
2,70971,"['weeknight', 'time-to-make', 'course', 'main-...","[weeknight, time-to-make, course, main-ingredi...","[[1.7261446, -2.657304, -2.0291271, 0.72138834..."
3,75452,"['weeknight', 'time-to-make', 'course', 'main-...","[weeknight, time-to-make, course, main-ingredi...","[[1.7261446, -2.657304, -2.0291271, 0.72138834..."
4,67547,"['weeknight', 'time-to-make', 'course', 'main-...","[weeknight, time-to-make, course, main-ingredi...","[[1.7261446, -2.657304, -2.0291271, 0.72138834..."


In [16]:
df_full = pd.merge(df_filtered, df_recipe, left_on='recipe_id', right_on='id', how='left')

In [17]:
del df_full['tags']
#del df_full['tags_normalized']
del df_full['review']
del df_full['date']

In [18]:
df_full.head()

Unnamed: 0,user_id,recipe_id,rating,id,tags_normalized,tags_vectorized
0,57222,85009,5,85009,"[15-minutes-or-less, time-to-make, course, mai...","[[-0.18661752, -2.6211035, 0.21846467, 2.33371..."
1,124416,120345,0,120345,"[15-minutes-or-less, time-to-make, course, mai...","[[-0.18661752, -2.6211035, 0.21846467, 2.33371..."
2,76535,134728,4,134728,"[60-minutes-or-less, time-to-make, main-ingred...","[[1.0269203, -1.972872, -2.1189754, 2.0314367,..."
3,255338,134728,5,134728,"[60-minutes-or-less, time-to-make, main-ingred...","[[1.0269203, -1.972872, -2.1189754, 2.0314367,..."
4,136726,197160,5,197160,"[30-minutes-or-less, time-to-make, course, mai...","[[-0.008353826, -2.0104654, -1.60594, 2.132089..."


In [19]:
def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in doc if word in list(model1.wv.index_to_key)]
    #doc = [word for level in doc for word in level if word in list(model1.wv.index_to_key)]
    #print('doc', doc)
    return np.mean(model1.wv[doc], axis=0)

In [20]:
doc_1 = df_recipe['tags_normalized'][0]

In [49]:
df_full['doc_vec'] = ''
for index, value in df_full.iterrows():
    if value['tags_normalized'] != '':
        curr_arr = np.asarray(document_vector(value['tags_normalized']))
        if len(curr_arr) == 100:
            df_full.set_value(index, 'doc_vec', curr_arr)
    else:
        df_full.set_value(index, 'doc_vec', np.asarray(np.zeros(100)))
df_full.head()

  
  


Unnamed: 0,user_id,recipe_id,rating,id,tags_normalized,tags_vectorized,doc_vec
0,57222,85009,5,85009,"[15-minutes-or-less, time-to-make, course, mai...","[[-0.18661752, -2.6211035, 0.21846467, 2.33371...","[-0.1545624, -0.92504364, 0.27831176, 0.014024..."
1,124416,120345,0,120345,"[15-minutes-or-less, time-to-make, course, mai...","[[-0.18661752, -2.6211035, 0.21846467, 2.33371...","[-0.464551, -0.9952115, 0.29176545, -0.0407484..."
2,76535,134728,4,134728,"[60-minutes-or-less, time-to-make, main-ingred...","[[1.0269203, -1.972872, -2.1189754, 2.0314367,...","[-0.94547033, -0.6833449, -0.2319088, -0.12324..."
3,255338,134728,5,134728,"[60-minutes-or-less, time-to-make, main-ingred...","[[1.0269203, -1.972872, -2.1189754, 2.0314367,...","[-0.94547033, -0.6833449, -0.2319088, -0.12324..."
4,136726,197160,5,197160,"[30-minutes-or-less, time-to-make, course, mai...","[[-0.008353826, -2.0104654, -1.60594, 2.132089...","[-0.43313128, -1.2558981, -0.39539507, 0.08085..."


In [33]:
test_array = df_full['doc_vec'].to_list()
Xnew = np.array(test_array).reshape((1,-1))
Xnew = np.zeros(len(df_full))
rg = XGBRegressor()
y = df_full['rating'].to_list()
Ynew = np.array(y).reshape((1,-1))
mdl = rg.fit(Xnew, Ynew)

  


ValueError: Please reshape the input data into 2-dimensional matrix.

In [51]:
df_full_ = df_full.drop(labels=10412, axis=0)
np.vstack(df_full_['doc_vec'])

array([[-0.1545624 , -0.92504364,  0.27831176, ..., -0.08729316,
         0.48941162, -0.05660954],
       [-0.464551  , -0.99521148,  0.29176545, ...,  0.4024446 ,
        -0.09688257,  0.46474418],
       [-0.94547033, -0.6833449 , -0.2319088 , ...,  0.36478165,
        -0.68412608,  0.11156012],
       ...,
       [ 0.66840112, -0.6241017 , -0.10890383, ..., -0.54412818,
         0.60159576, -0.1591617 ],
       [-0.37391147, -0.85031307,  0.00650825, ...,  0.71367639,
        -0.02232931,  0.44661844],
       [-0.37391147, -0.85031307,  0.00650825, ...,  0.71367639,
        -0.02232931,  0.44661844]])

In [55]:
train, test = train_test_split(df_full_, test_size=0.2)
rg.fit(np.vstack(train['doc_vec']),train.rating)
predicted = rg.predict(np.vstack(test['doc_vec']))
print(predicted)

[4.530293  4.6345816 4.5018263 ... 4.648133  4.5150285 4.43743  ]


In [56]:
df_full_

Unnamed: 0,user_id,recipe_id,rating,id,tags_normalized,tags_vectorized,doc_vec
0,57222,85009,5,85009,"[15-minutes-or-less, time-to-make, course, mai...","[[-0.18661752, -2.6211035, 0.21846467, 2.33371...","[-0.1545624, -0.92504364, 0.27831176, 0.014024..."
1,124416,120345,0,120345,"[15-minutes-or-less, time-to-make, course, mai...","[[-0.18661752, -2.6211035, 0.21846467, 2.33371...","[-0.464551, -0.9952115, 0.29176545, -0.0407484..."
2,76535,134728,4,134728,"[60-minutes-or-less, time-to-make, main-ingred...","[[1.0269203, -1.972872, -2.1189754, 2.0314367,...","[-0.94547033, -0.6833449, -0.2319088, -0.12324..."
3,255338,134728,5,134728,"[60-minutes-or-less, time-to-make, main-ingred...","[[1.0269203, -1.972872, -2.1189754, 2.0314367,...","[-0.94547033, -0.6833449, -0.2319088, -0.12324..."
4,136726,197160,5,197160,"[30-minutes-or-less, time-to-make, course, mai...","[[-0.008353826, -2.0104654, -1.60594, 2.132089...","[-0.43313128, -1.2558981, -0.39539507, 0.08085..."
...,...,...,...,...,...,...,...
237995,594923,249924,5,249924,"[time-to-make, course, main-ingredient, cuisin...","[[0.50824, -2.4250808, -1.4971544, 2.0699348, ...","[-0.27955064, -0.9678705, -0.084901325, -0.056..."
237996,1269180,257796,4,257796,"[main-ingredient, preparation, occasion, poult...","[[0.37609527, -2.531526, 0.5628416, -0.4621942...","[0.013922434, -0.658329, 0.20771731, -0.253216..."
237997,126435,166739,5,166739,"[30-minutes-or-less, time-to-make, course, pre...","[[-0.008353826, -2.0104654, -1.60594, 2.132089...","[0.6684011, -0.6241017, -0.108903825, -0.86413..."
237998,199020,82303,5,82303,"[15-minutes-or-less, time-to-make, course, pre...","[[-0.18661752, -2.6211035, 0.21846467, 2.33371...","[-0.37391147, -0.85031307, 0.006508245, 0.0760..."
