<a href="https://colab.research.google.com/github/arewelearningyet/DS-Unit-2-Linear-Models/blob/master/module4-logistic-regression/LS_DS_214_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Lambda School Data Science

*Unit 2, Sprint 1, Module 4*

---

# Logistic Regression


## Assignment 🌯

You'll use a [**dataset of 400+ burrito reviews**](https://srcole.github.io/100burritos/). How accurately can you predict whether a burrito is rated 'Great'?

> We have developed a 10-dimensional system for rating the burritos in San Diego. ... Generate models for what makes a burrito great and investigate correlations in its dimensions.

- [ ] Do train/validate/test split. Train on reviews from 2016 & earlier. Validate on 2017. Test on 2018 & later.
- [ ] Begin with baselines for classification.
- [ ] Use scikit-learn for logistic regression.
- [ ] Get your model's validation accuracy. (Multiple times if you try multiple iterations.)
- [ ] Get your model's test accuracy. (One time, at the end.)
- [ ] Commit your notebook to your fork of the GitHub repo.


## Stretch Goals

- [ ] Add your own stretch goal(s) !
- [ ] Make exploratory visualizations.
- [ ] Do one-hot encoding.
- [ ] Do [feature scaling](https://scikit-learn.org/stable/modules/preprocessing.html).
- [ ] Get and plot your coefficients.
- [ ] Try [scikit-learn pipelines](https://scikit-learn.org/stable/modules/compose.html).

In [0]:
%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/'
    !pip install category_encoders==2.*

# If you're working locally:
else:
    DATA_PATH = '../data/'

In [0]:
# Load data downloaded from https://srcole.github.io/100burritos/
import pandas as pd
df = pd.read_csv(DATA_PATH+'burritos/burritos.csv')

In [0]:
# Derive binary classification target:
# We define a 'Great' burrito as having an
# overall rating of 4 or higher, on a 5 point scale.
# Drop unrated burritos.
df = df.dropna(subset=['overall'])
df['Great'] = df['overall'] >= 4

In [0]:
# Clean/combine the Burrito categories
df['Burrito'] = df['Burrito'].str.lower()

california = df['Burrito'].str.contains('california')
asada = df['Burrito'].str.contains('asada')
surf = df['Burrito'].str.contains('surf')
carnitas = df['Burrito'].str.contains('carnitas')

df.loc[california, 'Burrito'] = 'California'
df.loc[asada, 'Burrito'] = 'Asada'
df.loc[surf, 'Burrito'] = 'Surf & Turf'
df.loc[carnitas, 'Burrito'] = 'Carnitas'
df.loc[~california & ~asada & ~surf & ~carnitas, 'Burrito'] = 'Other'

In [0]:
# Drop some high cardinality categoricals
df = df.drop(columns=['Notes', 'Location', 'Reviewer', 'Address', 'URL', 'Neighborhood'])

In [0]:
# Drop some columns to prevent "leakage"
df = df.drop(columns=['Rec', 'overall'])

In [7]:
print(df.shape)
df.head()

(421, 59)


Unnamed: 0,Burrito,Date,Yelp,Google,Chips,Cost,Hunger,Mass (g),Density (g/mL),Length,Circum,Volume,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,Unreliable,NonSD,Beef,Pico,Guac,Cheese,Fries,Sour cream,Pork,Chicken,Shrimp,Fish,Rice,Beans,Lettuce,Tomato,Bell peper,Carrots,Cabbage,Sauce,Salsa.1,Cilantro,Onion,Taquito,Pineapple,Ham,Chile relleno,Nopales,Lobster,Queso,Egg,Mushroom,Bacon,Sushi,Avocado,Corn,Zucchini,Great
0,California,1/18/2016,3.5,4.2,,6.49,3.0,,,,,,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,,,x,x,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
1,California,1/24/2016,3.5,3.3,,5.45,3.5,,,,,,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0,,,x,x,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
2,Carnitas,1/24/2016,,,,4.85,1.5,,,,,,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,,,,x,x,,,,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
3,Asada,1/24/2016,,,,5.25,2.0,,,,,,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0,,,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
4,California,1/27/2016,4.0,3.8,x,6.59,4.0,,,,,,4.0,5.0,4.0,3.5,4.5,5.0,2.5,4.5,4.0,,,x,x,,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True


In [0]:
#  Do train/validate/test split. 
df['Date']=df.Date.astype('M')
# Train on reviews from 2016 & earlier. 
train=df.loc[(df.Date.dt.year<=2016)]
# Validate on 2017. 
val=df.loc[df.Date.dt.year==2017]
# Test on 2018 & later.
test=df.loc[(df.Date.dt.year>=2018)]

In [9]:
train.describe()

Unnamed: 0,Yelp,Google,Cost,Hunger,Mass (g),Density (g/mL),Length,Circum,Volume,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,Queso
count,71.0,71.0,292.0,297.0,0.0,0.0,175.0,174.0,174.0,298.0,283.0,288.0,297.0,292.0,296.0,278.0,296.0,296.0,0.0
mean,3.897183,4.142254,6.896781,3.445286,,,19.829886,22.042241,0.77092,3.472315,3.70636,3.551215,3.519024,3.52887,3.395946,3.32464,3.540203,3.955068,
std,0.47868,0.371738,1.211412,0.85215,,,2.081275,1.685043,0.137833,0.797606,0.991897,0.869483,0.850348,1.040457,1.089044,0.971226,0.922426,1.167341,
min,2.5,2.9,2.99,0.5,,,15.0,17.0,0.4,1.4,1.0,1.0,1.0,0.5,1.0,0.0,1.0,0.0,
25%,3.5,4.0,6.25,3.0,,,18.5,21.0,0.6625,3.0,3.0,3.0,3.0,3.0,2.5,2.5,3.0,3.5,
50%,4.0,4.2,6.85,3.5,,,19.5,22.0,0.75,3.5,4.0,3.5,3.5,4.0,3.5,3.5,3.75,4.0,
75%,4.0,4.4,7.5,4.0,,,21.0,23.0,0.87,4.0,4.5,4.0,4.0,4.0,4.0,4.0,4.0,5.0,
max,4.5,4.9,11.95,5.0,,,26.0,27.0,1.24,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,


In [10]:
train.shape, val.shape, test.shape

((298, 59), (85, 59), (38, 59))

In [0]:
test=test.fillna(value=0, axis='columns')

In [18]:
xtest

Unnamed: 0,Cost,Hunger,Meat,Fillings,Synergy,Google,Salsa,Volume,Tortilla,Uniformity
77,8.0,4.0,5.0,5.0,5.0,,3.0,,4.5,5.0
386,7.25,4.0,4.0,5.0,4.0,,3.0,,4.0,3.0
387,4.19,3.0,2.0,2.0,3.0,3.8,4.0,,3.0,1.0
388,7.0,5.0,5.0,5.0,5.0,4.3,4.0,,5.0,5.0
389,8.5,4.0,3.0,3.5,3.0,,3.0,0.92,4.0,2.0
390,7.2,3.0,4.0,3.0,3.0,,4.0,1.0,4.0,3.0
391,5.99,3.0,4.3,3.5,3.8,,3.0,0.84,3.5,4.0
392,5.99,3.5,5.0,4.5,4.5,,4.0,0.8,4.0,4.0
393,5.99,2.0,4.5,4.0,4.0,,3.0,0.91,2.0,2.0
394,8.99,4.0,4.0,4.0,4.0,,3.5,1.05,4.5,4.0


In [11]:
#  Begin with baselines for classification.
# define target variable
target='Great'
# define target train set
ytrain=train[target]
# look at majority class of train set
train[target].value_counts(normalize=True)

False    0.590604
True     0.409396
Name: Great, dtype: float64

In [12]:
# define the majority class
majorityclass=ytrain.mode()[0]
# run a preliminary prediction based on majority class
ypred=[majorityclass]*len(ytrain)
# import accuracy metric function
from sklearn.metrics import accuracy_score
# measure accuracy of preliminary prediction based on majority class
accuracy_score(ytrain, ypred)

0.5906040268456376

In [13]:
# define target validation set
yval=val[target]
# update preliminary prediction with target of validation set
ypred=[majorityclass] *len(yval)
# measure accuracy of majority class baseline by validation set standards
accuracy_score(yval,ypred)

0.5529411764705883

In [0]:
features = ['Cost', 'Hunger', 'Meat', 'Fillings', 'Synergy', 'Google', 'Salsa', 'Volume', 'Tortilla', 'Uniformity']
xtrain=train[features]
xval=val[features]

In [15]:
#  Use scikit-learn for logistic regression.
'''     STRETCH
   Do feature scaling.
'''
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
# instantiate imputer
imputer=SimpleImputer()
# apply to train and validation feature sets
xtrainimp=imputer.fit_transform(xtrain)
xvalimp=imputer.transform(xval)
# instantiate scaler
scaler=StandardScaler()
# apply to imputed train and validation feature sets
xtrainscaled=scaler.fit_transform(xtrainimp)
xvalscaled=scaler.transform(xvalimp)
# instantiate logistic regression 
model=LogisticRegressionCV()
# fit to scaled training feature set, training target set
model.fit(xtrainscaled, ytrain)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [16]:
#  Get your model's validation accuracy. (Multiple times if you try multiple iterations.)
print(f'Validation Accuracy: {model.score(xvalscaled, yval)}')

Validation Accuracy: 0.8588235294117647


In [27]:
#  Get your model's test accuracy. (One time, at the end.)
xtest=test[features]
ytest=test[target]
print(f'Test Accuracy: {model.score(xtest, ytest)}')

Test Accuracy: 0.5789473684210527


In [0]:
'''     STRETCH
    Get and plot your coefficients.
'''
coefficients=pd.Series(model.coef_[0], xtrain.columns)
coefficients.sort_values().plot.barh();

In [0]:
'''    STRETCH
   Do one-hot encoding.
'''
exclude=['Unreliable', 'Cost', 'Carrots', 'Ham', 'Nopales', 'Lobster', 'Egg', 'Mushroom', 'Bacon', 'Sushi', 'Avocado', 'Zucchini', 'Burrito', 'Chips', 'Yelp', 'Google', 'Hunger', 'Mass (g)', 'Density (g/mL)', 'Length', 'Circum', 'Volume', 'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling', 'Uniformity', 'Salsa', 'Synergy', 'Wrap', 'Queso']
mod=df.drop(exclude, axis=1)
mod=mod.fillna(value=0, axis='columns')
mod.replace('X', 'x', inplace=True)
mod.replace('x', 1, inplace=True)
# Train on reviews from 2016 & earlier. 
train=mod.loc[(mod.Date.dt.year<=2016)]
# Validate on 2017. 
val=mod.loc[mod.Date.dt.year==2017]
# Test on 2018 & later.
test=mod.loc[(mod.Date.dt.year>=2018)]
X_train = train.drop(['Date', 'Great'], axis=1)
y_train = train[target]
X_val = val.drop(['Date', 'Great'], axis=1)
y_val = val[target]
X_train.shape, y_train.shape, X_val.shape, y_val.shape

In [0]:
encoder=ce.OneHotEncoder(use_cat_names=True)
X_train_encoded=encoder.fit_transform(X_train)
X_val_encoded=encoder.transform(X_val)

imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_val_imputed = imputer.transform(X_val_encoded)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

model = LogisticRegressionCV()
model.fit(X_train_scaled, y_train)
print('Validation Accuracy', model.score(X_val_scaled, y_val))

In [0]:
X_test=test.drop(['Date', 'Great'], axis=1)
y_test=test[target]
print(f'Test Accuracy: {model.score(X_test, y_test)}')

In [0]:
#  Add your own stretch goal(s) !

#  Make exploratory visualizations.

#  Try scikit-learn pipelines.
