# Imports

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold, cross_validate

import warnings
warnings.simplefilter(action = 'ignore', category=FutureWarning)

In [11]:
df = pd.read_csv('./Data/tfidf_df.csv')

In [12]:
df.head()

Unnamed: 0,rest_cost,rest_name,rest_rating,address_only,rest_zip_code,11,115,11am,11pm,11th,...,cuisine_type_Vegetarian,cuisine_type_Vietnamese,cuisine_type_Wine Bar,rest_borough_Bronx,rest_borough_Brooklyn,rest_borough_Jersey City,rest_borough_Manhattan,rest_borough_Queens,rest_borough_Staten Island,rest_borough_Westchester
0,2.0,Mama’s Too,8.3,"2750 Broadway, New York, NY 10025",10025,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1,4.0,Omakase Room By Tatsu,7.7,"14 Christopher St, New York, NY 10014",10014,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,4.0,Sushi Azabu,8.5,"428 Greenwich St., New York, NY 10013",10013,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3,3.0,Saint Julivert Fisherie,7.7,"264 Clinton St, New York, NY 11201",11201,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,2.0,Farida,8.0,"498 9th Ave, New York, NY 10018",10018,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0


## Set up variables

We are going to attempt to create a model that uses all the features to predict restaurant rating using the vectorized review, cuisine type, restaurant cost, and borough.

In [14]:
X = df.drop(['rest_name', 'rest_rating', 'address_only', 'rest_zip_code'], axis = 1)

In [15]:
y = df['rest_rating']

### Train Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Cross validate

In [22]:
cross_val_score(LinearRegression(), X_train, y_train, cv = 5)

array([0.20971155, 0.12986339, 0.25369291, 0.17441142, 0.20165996])

In [23]:
cross_val_score(RidgeCV(), X_train, y_train, cv = 5)

array([0.19944967, 0.13179645, 0.24872328, 0.1663748 , 0.19573784])

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [27]:
cross_val_score(RidgeCV(), X_train_sc, y_train, cv = 5).mean()

0.14643851927769536

In [28]:
cross_val_score(LassoCV(), X_train_sc, y_train, cv = 5).mean()



0.13295882514032123

### Fit and Evaluate Model

In [36]:
lr = LinearRegression()

model = lr.fit(X_train, y_train)

In [40]:
coefs = model.coef_

In [38]:
model.score(X_test, y_test)

0.21126579083437913

In [39]:
model.score(X_train, y_train)

1.0

In [44]:
coef_df = pd.DataFrame(coefs, columns = ['coefficients'])

In [47]:
coef_df['words'] = np.array(X.columns)

In [51]:
coef_df.sort_values('coefficients', ascending = False).head(20)

Unnamed: 0,coefficients,words
3842,1.766249,excellent
1145,1.422642,best
12143,1.372648,yes
3160,1.212234,dining
10051,1.206173,soba
462,1.193789,also
1353,1.180708,boon
3815,1.166788,every
4769,1.044397,gramercy
12155,1.044077,york


In [53]:
coef_df.sort_values('coefficients').head(20)

Unnamed: 0,coefficients,words
12130,-2.401047,yebisu
8249,-2.174347,porchetta
11476,-2.003477,unfortunately
903,-1.906108,bad
3421,-1.719054,dry
844,-1.670062,average
6452,-1.64608,macao
8085,-1.644338,pizz
10831,-1.57097,tartinery
7753,-1.567346,papatzul


# Try with the count vectorized dataframe

In [29]:
count = pd.read_csv('./Data/data_for_reccommender_1.csv')

In [30]:
count.head()

Unnamed: 0,rest_cost,rest_name,rest_rating,11,11th,12,13,14,14th,15,...,cuisine_type_Spanish,cuisine_type_Steaks,cuisine_type_Sushi,cuisine_type_Tacos,cuisine_type_Taiwanese,cuisine_type_Tex-Mex,cuisine_type_Thai,cuisine_type_Vegetarian,cuisine_type_Vietnamese,cuisine_type_Wine Bar
0,4.0,Omakase Room By Tatsu,7.7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,Sushi Azabu,8.5,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3.0,Saint Julivert Fisherie,7.7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,Farida,8.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,U-Gu,7.7,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Set up variables

In [31]:
X_count = count.drop(['rest_name', 'rest_rating'], axis = 1)

In [32]:
y_count = count['rest_rating']

### Train Test Split

In [33]:
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_count, y_count, random_state = 42)

### Cross Validate

In [34]:
cross_val_score(LinearRegression(), X_train_cv, y_train_cv, cv = 5)

array([ 0.23714746,  0.17496504, -0.00556331,  0.06512839,  0.18882754])