<a href="https://colab.research.google.com/github/Benjamin1118/DS-Unit-2-Linear-Models/blob/master/module_4_Benjamin_Bishop_assignment_regression_classification_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Lambda School Data Science

*Unit 2, Sprint 1, Module 4*

---

# Logistic Regression


## Assignment 🌯

You'll use a [**dataset of 400+ burrito reviews**](https://srcole.github.io/100burritos/). How accurately can you predict whether a burrito is rated 'Great'?

> We have developed a 10-dimensional system for rating the burritos in San Diego. ... Generate models for what makes a burrito great and investigate correlations in its dimensions.

- [ ] Do train/validate/test split. Train on reviews from 2016 & earlier. Validate on 2017. Test on 2018 & later.
- [ ] Begin with baselines for classification.
- [ ] Use scikit-learn for logistic regression.
- [ ] Get your model's validation accuracy. (Multiple times if you try multiple iterations.)
- [ ] Get your model's test accuracy. (One time, at the end.)
- [ ] Commit your notebook to your fork of the GitHub repo.


## Stretch Goals

- [ ] Add your own stretch goal(s) !
- [ ] Make exploratory visualizations.
- [ ] Do one-hot encoding.
- [ ] Do [feature scaling](https://scikit-learn.org/stable/modules/preprocessing.html).
- [ ] Get and plot your coefficients.
- [ ] Try [scikit-learn pipelines](https://scikit-learn.org/stable/modules/compose.html).

In [0]:
%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/'
    !pip install category_encoders==2.*

# If you're working locally:
else:
    DATA_PATH = '../data/'

In [0]:
# Load data downloaded from https://srcole.github.io/100burritos/
import pandas as pd
df = pd.read_csv(DATA_PATH+'burritos/burritos.csv')

In [0]:
# Derive binary classification target:
# We define a 'Great' burrito as having an
# overall rating of 4 or higher, on a 5 point scale.
# Drop unrated burritos.
df = df.dropna(subset=['overall'])
df['Great'] = df['overall'] >= 4

In [0]:
# Clean/combine the Burrito categories
df['Burrito'] = df['Burrito'].str.lower()

california = df['Burrito'].str.contains('california')
asada = df['Burrito'].str.contains('asada')
surf = df['Burrito'].str.contains('surf')
carnitas = df['Burrito'].str.contains('carnitas')

df.loc[california, 'Burrito'] = 'California'
df.loc[asada, 'Burrito'] = 'Asada'
df.loc[surf, 'Burrito'] = 'Surf & Turf'
df.loc[carnitas, 'Burrito'] = 'Carnitas'
df.loc[~california & ~asada & ~surf & ~carnitas, 'Burrito'] = 'Other'

In [0]:
# Drop some high cardinality categoricals
df = df.drop(columns=['Notes', 'Location', 'Reviewer', 'Address', 'URL', 'Neighborhood'])

In [0]:
# Drop some columns to prevent "leakage"
df = df.drop(columns=['Rec', 'overall'])

In [0]:
#drop columns with nan values ie Mass, Density, and Queso
df=df.drop(columns=['Mass (g)', 'Density (g/mL)', 'Queso'], axis=1)


In [105]:
df.head()

Unnamed: 0,Burrito,Date,Yelp,Google,Chips,Cost,Hunger,Length,Circum,Volume,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,Unreliable,NonSD,Beef,Pico,Guac,Cheese,Fries,Sour cream,Pork,Chicken,Shrimp,Fish,Rice,Beans,Lettuce,Tomato,Bell peper,Carrots,Cabbage,Sauce,Salsa.1,Cilantro,Onion,Taquito,Pineapple,Ham,Chile relleno,Nopales,Lobster,Egg,Mushroom,Bacon,Sushi,Avocado,Corn,Zucchini,Great
0,California,1/18/2016,3.5,4.2,,6.49,3.0,,,,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,,,x,x,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
1,California,1/24/2016,3.5,3.3,,5.45,3.5,,,,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0,,,x,x,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
2,Carnitas,1/24/2016,,,,4.85,1.5,,,,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,,,,x,x,,,,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
3,Asada,1/24/2016,,,,5.25,2.0,,,,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0,,,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
4,California,1/27/2016,4.0,3.8,x,6.59,4.0,,,,4.0,5.0,4.0,3.5,4.5,5.0,2.5,4.5,4.0,,,x,x,,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True


In [106]:
df.shape

(421, 56)

In [0]:
#Setting up train validate and test data.
train = df[df['Date'].str.contains('2016')]
val = df[df['Date'].str.contains('2017')]
test = train = df[df['Date'].str.contains('2018')]

In [0]:
# find baseline mean for train.
target= 'Cost'
y_train = train[target]
y_val = val[target]
y_test = test[target]

In [109]:
#set up model
#import estimator class
from sklearn.linear_model import LinearRegression

#instanciate
lin_reg = LinearRegression()

#arrange x features matricies
features = ['Meat', 'Uniformity', 'Temp']
X_train = train[features]
X_val = val[features]

#impute missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)

#fit models
lin_reg.fit(X_train_imputed, y_train)

#Apply models to new data
lin_reg.predict(X_val_imputed)

array([7.80373224, 8.43761231, 7.34552563, 7.74144938, 7.58273026,
       7.78207261, 6.77574907, 6.9366061 , 5.94029883, 6.78532478,
       7.02516399, 8.01032992, 7.00322089, 7.98220045, 6.97587691,
       7.10612698, 7.87169946, 6.86939364, 7.38941183, 6.68612222,
       7.3411937 , 8.30689402, 7.99626519, 7.89562889, 8.10676618,
       7.58953967, 7.45169469, 7.14106586, 7.81701149, 7.0863779 ,
       7.38153347, 7.74144938, 7.27382415, 7.3411937 , 7.12436655,
       6.8927199 , 7.32342543, 7.37720154, 7.89184011, 8.35263466,
       6.8949897 , 7.30625482, 6.8707768 , 7.06550375, 7.02516399,
       7.14106586, 7.24475745, 8.70034029, 7.74144938, 8.17551888,
       7.14106586, 7.49310341, 7.40574637, 7.49537322, 7.90663834,
       8.05405903, 8.38245613, 8.88640339, 6.74081018, 7.92176896,
       8.85146451, 7.8265872 , 8.56249531, 7.58500006, 7.38260242,
       7.48607105, 7.89335909, 7.0863779 , 7.21662797, 8.24539664,
       7.43104556, 7.14555489, 6.95500276, 7.56582335, 8.37564

In [110]:
# Get coef
pd.Series(lin_reg.coef_, features)

Meat         -0.579509
Uniformity    0.400256
Temp         -0.275690
dtype: float64

In [0]:
#logistic Regression
from sklearn.linear_model import LogisticRegressionCV
import category_encoders as ce 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [112]:
df.head()

Unnamed: 0,Burrito,Date,Yelp,Google,Chips,Cost,Hunger,Length,Circum,Volume,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,Unreliable,NonSD,Beef,Pico,Guac,Cheese,Fries,Sour cream,Pork,Chicken,Shrimp,Fish,Rice,Beans,Lettuce,Tomato,Bell peper,Carrots,Cabbage,Sauce,Salsa.1,Cilantro,Onion,Taquito,Pineapple,Ham,Chile relleno,Nopales,Lobster,Egg,Mushroom,Bacon,Sushi,Avocado,Corn,Zucchini,Great
0,California,1/18/2016,3.5,4.2,,6.49,3.0,,,,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,,,x,x,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
1,California,1/24/2016,3.5,3.3,,5.45,3.5,,,,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0,,,x,x,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
2,Carnitas,1/24/2016,,,,4.85,1.5,,,,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,,,,x,x,,,,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
3,Asada,1/24/2016,,,,5.25,2.0,,,,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0,,,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
4,California,1/27/2016,4.0,3.8,x,6.59,4.0,,,,4.0,5.0,4.0,3.5,4.5,5.0,2.5,4.5,4.0,,,x,x,,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True


In [0]:
features = ['Meat', 'Temp', 'Uniformity', 'Salsa']
target = 'Cost'
X_train = train[features]
X_val = val[features]
y_train = train[target]
y_val = val[target]



In [114]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((27, 4), (27,), (85, 4), (85,))

In [0]:
encoder =ce.OneHotEncoder(use_cat_names = True)
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.fit_transform(X_val)

In [116]:
X_train_encoded.head()

Unnamed: 0,Meat,Temp,Uniformity,Salsa
386,4.0,5.0,3.0,3.0
387,2.0,5.0,1.0,4.0
388,5.0,5.0,5.0,4.0
389,3.0,4.0,2.0,3.0
390,4.0,5.0,3.0,4.0


In [117]:
X_val_encoded.head()

Unnamed: 0,Meat,Temp,Uniformity,Salsa
301,4.0,4.5,5.0,1.5
302,,2.0,4.6,4.2
303,4.1,4.5,4.0,4.3
304,4.0,4.0,4.5,4.0
305,3.0,4.5,3.0,2.0


In [0]:
imputer= SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_val_imputed = imputer.fit_transform(X_val_encoded)

In [0]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.fit_transform(X_train_imputed)


In [0]:
model = LogisticRegressionCV(cv=5, n_jobs=-1, random_state=42)
model.fit(X_train_scaled, y_train)
print('Validation Accuracy' , model.score(X_val_scaled, y_val))

In [0]:
%matplotlib inline
coef= pd.Series(model.coef_[0], X_train_encoded.columns)
coef.sort_values().plot.barh();