In [3]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
df = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv')

***Overfitting the model*** 

To Solve: 
1) Cleaning-up feature sets - to reduce the noise 
2) Removing the nulls - could get some improvements 
3) Regression problem and turn it into a classifier


To include: 
1) Transform regression into binary classifier 
2) Whether or not to include nutritional information (30 most valuable features). 


Consider (critical thinking): 
1) Bias 
2) Anything that can make the dataset biased



There is. Several things in fact, but most glaringly is that we don't actually have a random sample. It could be, and probably is, that the people more likely to choose some kinds of recipes are more likely to give high reviews.



In [5]:
df.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df.sum()

title                    Lentil, Apple, and Turkey Wrap Boudin Blanc Te...
rating                                                             74482.5
calories                                                       1.00756e+08
protein                                                        1.59156e+06
fat                                                             5.5046e+06
sodium                                                         9.91985e+07
#cakeweek                                                                6
#wasteless                                                               1
22-minute meals                                                         17
3-ingredient recipes                                                    27
30 days of groceries                                                     7
advance prep required                                                  109
alabama                                                                  3
alaska                   

In [8]:
# Clean up data and transform into a binary dataset 

X = df.drop(['rating', 'title', 'calories', 'protein', 'fat', 'sodium'], 1)

Y = np.where(df.rating <= .5,"LT_0.5",np.where(df.rating <= 1.5,"0.75_to_1.5",np.where(df.rating<=2.5,"1.75_to_2.5",np.where(df.rating <=3.5,"2.75_to_3.5",np.where(df.rating <=4.5,"3.75_to_4.5","4.75+")))))
df['bin_rating'] =  np.where(df.rating <= 4,0,1)
df1 = df.drop(['rating', 'title', 'calories', 'protein', 'fat', 'sodium'], 1)

In [9]:
# Define the training and test sizes.
trainsize = int(df1.shape[0] / 2)
df_test = df1.iloc[trainsize:, :].copy()
df_train = df1.iloc[:trainsize, :].copy()

y_train = df_train['bin_rating'].values.reshape(-1,1)
X_train = df_train.loc[:, ~(df_train.columns).isin(['bin_rating'])]

Y_test = df_test['bin_rating'].values.reshape(-1,1)
X_test = df_test.loc[:, ~(df_train.columns).isin(['bin_rating'])]

***Lasso Regression model***

Will reducing features make a difference in the model? 

In [10]:
from sklearn.linear_model import LogisticRegression
llr = LogisticRegression(penalty='l1',C=0.05) # I used 0.05 to get only 30 var w/coeff <> 0
llr.fit(X_train, y_train)
print(llr.score(X_train, y_train))
lasso_params = llr.coef_
print(lasso_params)
print('\nR-squared  - Lasso:')
print(llr.score(X_test, Y_test))

  y = column_or_1d(y, warn=True)


0.5810891681627768
[[ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.21646498  0.          0.          0.          0.          0.
   0.          0.          0.          0.08747397  0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.34294538  0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.0015

In [11]:
coefficients = pd.concat([pd.DataFrame(X_train.columns),pd.DataFrame(np.transpose(llr.coef_))], axis = 1)
coefficients.columns = ['ind_var','coefficient']
new_df = coefficients[(coefficients['coefficient'] != 0)]

new_df.describe()

Unnamed: 0,coefficient
count,34.0
mean,0.019647
std,0.17598
min,-0.437622
25%,-0.032411
50%,0.025476
75%,0.113137
max,0.377952


In [12]:
print(new_df)


               ind_var  coefficient
30        backyard bbq     0.216465
39                beef     0.087474
57         bon appétit     0.342945
88                cake     0.001534
116            chicken    -0.139280
124          christmas     0.114946
134     cocktail party    -0.022516
152           cornmeal    -0.003594
171         dairy free    -0.035570
179             dinner     0.113745
186              drink    -0.437622
204               fall     0.053469
218     fourth of july     0.026651
228              fruit     0.030289
237        goat cheese     0.133085
240            gourmet     0.094283
251     grill/barbecue     0.255503
277     house & garden    -0.291333
306    kidney friendly     0.022690
321        leafy green     0.023227
343            low fat    -0.264240
446              pasta    -0.247085
453        peanut free     0.170635
457             pepper     0.040177
502       quick & easy    -0.022936
520              roast     0.249430
543            sausage     0

In [13]:
x_columns = new_df['ind_var']
y_column = ['bin_rating']

***KNN Modeling***

In [14]:
from sklearn.neighbors import KNeighborsClassifier
# Create the knn model. Look at the five closest neighbors.
knn = KNeighborsClassifier(n_neighbors=5,weights = 'distance')

In [15]:
# Fit the model on the training data.

knn.fit(X_train[x_columns], y_train.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='distance')

In [16]:

# Make point predictions on the test set using the fit model.
predictions = knn.predict(X_test[x_columns])

# Get the actual values for the test set.
actual = Y_test

In [17]:
print(pd.crosstab(predictions.ravel(), actual.ravel()))


col_0     0     1
row_0            
0      2639  2407
1      2056  2924


In [18]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(knn,X_train[x_columns],y_train.ravel(), cv=5)
print(score)
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

[0.55383848 0.555334   0.5521197  0.56109726 0.54091816]
Unweighted Accuracy: 0.55 (+/- 0.01)


***SVM classifier***

In [19]:
from sklearn.svm import SVC
# this takes a while
clf = SVC()
clf.fit(X_train[x_columns], y_train.ravel())



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [20]:

# Make point predictions on the test set using the fit model.
predictions = clf.predict(X_test[x_columns])

# Get the actual values for the test set.
actual = Y_test

In [21]:
print(pd.crosstab(predictions.ravel(), actual.ravel()))


col_0     0     1
row_0            
0      1573  1083
1      3122  4248


In [22]:
score = cross_val_score(clf,X_train[x_columns],y_train.ravel(), cv=5)




In [23]:
print(score)


[0.57876371 0.5777667  0.59052369 0.59201995 0.5743513 ]


In [24]:
print("SVM Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))


SVM Accuracy: 0.58 (+/- 0.01)


***Very little improvement***

In [25]:
clf.fit(X_train, y_train.ravel())




SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [26]:

# Make point predictions on the test set using the fit model.
predictions = clf.predict(X_test)

# Get the actual values for the test set.
actual = Y_test

In [27]:
print(pd.crosstab(predictions.ravel(), actual.ravel()))


col_0     0     1
row_0            
0       303   152
1      4392  5179


In [28]:
score = cross_val_score(clf,X_train,y_train.ravel(), cv=5)





***Conclusion***


Our Lasso regression model gave a R-squared (0.5839816477159385).

**KNN model: **
    crosstabs
    
col_0     0     1
row_0            
0       303   152
1      4392  5179

[0.55383848 0.555334   0.5521197  0.56109726 0.54091816]
Unweighted Accuracy: 0.55 (+/- 0.01)

**SVM classifier:**
col_0     0     1
row_0            
0      1573  1083
1      3122  4248

[0.57876371 0.5777667  0.59052369 0.59201995 0.5743513 ]
SVM Accuracy: 0.58 (+/- 0.01)


***Overall***

There is very little improvement in the evaluators (accuracy with 0.55 and 0.58) with a R-squared value of 0.58. The data is imbalanced and likely has bias within how the data was collected. More additional data sets should be evaluated relating to this dataset. 