In [34]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.svm import SVR

In [2]:
raw_data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv')

Transform this regression problem into a binary classifier and clean up the feature set. 

In [3]:
raw_data.describe()

Unnamed: 0,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,30 days of groceries,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
count,20052.0,15935.0,15890.0,15869.0,15933.0,20052.0,20052.0,20052.0,20052.0,20052.0,...,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0
mean,3.714467,6322.958,100.160793,346.8775,6225.975,0.000299,5e-05,0.000848,0.001346,0.000349,...,0.001247,0.026332,5e-05,0.000299,0.014861,0.00015,0.000349,0.001396,0.000948,0.022741
std,1.340829,359046.0,3840.318527,20456.11,333318.2,0.017296,0.007062,0.029105,0.036671,0.018681,...,0.035288,0.160123,0.007062,0.017296,0.121001,0.012231,0.018681,0.037343,0.030768,0.14908
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.75,198.0,3.0,7.0,80.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.375,331.0,8.0,17.0,294.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.375,586.0,27.0,33.0,711.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,30111220.0,236489.0,1722763.0,27675110.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
non_numeric_columns = raw_data.select_dtypes(['object']).columns
print(non_numeric_columns)
numeric_columns = raw_data.select_dtypes(['int64', 'float64']).columns
print(numeric_columns)

Index(['title'], dtype='object')
Index(['rating', 'calories', 'protein', 'fat', 'sodium', '#cakeweek',
       '#wasteless', '22-minute meals', '3-ingredient recipes',
       '30 days of groceries',
       ...
       'yellow squash', 'yogurt', 'yonkers', 'yuca', 'zucchini', 'cookbooks',
       'leftovers', 'snack', 'snack week', 'turkey'],
      dtype='object', length=679)


In [8]:
for col in numeric_columns:
    q75, q25 = np.percentile(raw_data[col], [75 ,25])
    iqr = q75 - q25

    min_val = q25 - (iqr*1.5)
    max_val = q75 + (iqr*1.5)

    print("The number of outliers for {} is: {}.".format(col, len((np.where((raw_data[col] > max_val) 
                      | (raw_data[col] < min_val))[0]))))

The number of outliers for rating is: 2656.
The number of outliers for calories is: 0.
The number of outliers for protein is: 0.
The number of outliers for fat is: 0.
The number of outliers for sodium is: 0.
The number of outliers for #cakeweek is: 6.
The number of outliers for #wasteless is: 1.
The number of outliers for 22-minute meals is: 17.
The number of outliers for 3-ingredient recipes is: 27.
The number of outliers for 30 days of groceries is: 7.
The number of outliers for advance prep required is: 109.
The number of outliers for alabama is: 3.
The number of outliers for alaska is: 2.
The number of outliers for alcoholic is: 835.
The number of outliers for almond is: 584.
The number of outliers for amaretto is: 39.
The number of outliers for anchovy is: 45.
The number of outliers for anise is: 116.
The number of outliers for anniversary is: 111.
The number of outliers for anthony bourdain is: 2.
The number of outliers for aperitif is: 12.
The number of outliers for appetizer is

  interpolation=interpolation)



The number of outliers for cook like a diner is: 4.
The number of outliers for cookbook critic is: 3.
The number of outliers for cookie is: 69.
The number of outliers for cookies is: 273.
The number of outliers for coriander is: 131.
The number of outliers for corn is: 371.
The number of outliers for cornmeal is: 271.
The number of outliers for costa mesa is: 1.
The number of outliers for cottage cheese is: 37.
The number of outliers for couscous is: 93.
The number of outliers for crab is: 156.
The number of outliers for cranberry is: 370.
The number of outliers for cranberry sauce is: 4.
The number of outliers for cream cheese is: 375.
The number of outliers for créme de cacao is: 21.
The number of outliers for crêpe is: 1.
The number of outliers for cr��me de cacao is: 1.
The number of outliers for cuba is: 3.
The number of outliers for cucumber is: 379.
The number of outliers for cumin is: 93.
The number of outliers for cupcake is: 3.
The number of outliers for currant is: 108.
The

Far too many of the explanatory variables have outliers to winsorize them individually. Is there a way to winsorize all at once? 

Just drop the columns with missing values for this challenge

Get 30 most important features:

In [21]:
X = raw_data.drop(['rating', 'calories', 'protein', 'fat', 'sodium'], 1).sample(frac=0.3, replace=True, random_state=1)
Y = raw_data.rating.sample(frac=0.3, replace=True, random_state=1)

In [24]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2
#selectKbest to choose most important features
best_features = SelectKBest(score_func=f_classif, k=30)
fit = best_features.fit(X,Y)
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)

# concatenate 
feature_scores = pd.concat([df_columns, df_scores],axis=1)
feature_scores.columns = ['Feature Name','Score'] 
print(feature_scores.nlargest(30,'Score'))

             Feature Name      Score
186                 drink  91.662159
8               alcoholic  76.595048
277        house & garden  53.692140
235                   gin  51.214793
133              cocktail  28.147424
580                spirit  27.728376
134        cocktail party  25.829784
195                 egypt  23.694640
50                bitters  22.719865
262         harpercollins  21.393078
333               liqueur  18.706376
32                   bake  16.359707
57            bon appétit  16.166810
544                 sauté  14.243621
240               gourmet  13.288225
542                 sauce  12.816267
65            breadcrumbs  11.340736
526                   rum  11.288065
152              cornmeal  10.515694
92                campari   9.892434
442                 party   9.816259
407         non-alcoholic   9.249772
160        créme de cacao   9.206547
119          chile pepper   9.203508
502          quick & easy   8.996261
577        sparkling wine   8.992224
3

 234 265 269 278 279 280 283 285 288 290 294 295 302 308 319 335 336 338
 349 351 372 373 381 386 389 394 397 400 404 420 428 431 461 479 489 495
 506 536 541 571 576 585 591 599 612 614 638 643 652 653 659 666 669 670] are constant.
  f = msb / msw


Above lists the 30 most important features. Transform into a binary classifer and train the model

In [25]:
#rating needs to be binary classifer (according to Challenge guidelines). I'll just do above average or not above average

raw_data['rating'].describe()

count    20052.000000
mean         3.714467
std          1.340829
min          0.000000
25%          3.750000
50%          4.375000
75%          4.375000
max          5.000000
Name: rating, dtype: float64

In [26]:
raw_data['above_mean'] = np.where(raw_data['rating'] >= 3.714467, 1, 0)

In [27]:
raw_data.head()

Unnamed: 0,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,30 days of groceries,...,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey,above_mean
0,2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,5.0,,,,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [30]:
X = raw_data[['drink','alcoholic' ,'house & garden','gin' ,'cocktail','spirit' ,'cocktail party' ,'egypt','bitters' , 'harpercollins' ,'liqueur' , 'bake'  ,'bon appétit' ,'sauté' ,'gourmet'  ,'sauce' ,'breadcrumbs' ,        'rum' , 'cornmeal' , 'campari' ,'party' , 'non-alcoholic' ,  'créme de cacao' ,'chile pepper' , 'quick & easy' ,  'sparkling wine' , '3-ingredient recipes', 'pasta'
  ,    'philippines', 'fruit juice']]   

In [31]:
Y= raw_data['above_mean']

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [37]:
svc = SVC()
svc.fit(X,Y)
svc.score(X, Y)
cross_val_score(svc, X, Y, cv=5)



array([0.80254301, 0.80279232, 0.80374065, 0.80723192, 0.80598504])

Ok these scores are MUCH better than the ones in the checkpoint. Clearly this works better as a classifer than a regression.  
I'm sure the scores could be improved even more if I removed some outliers, addressing the missing values in the nutrition columns and included those variables, created 3 categories for rating (high, low, medium) and performed a multiclass classifer, etc. 