In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('winemag-data-130k-v2 2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
len(df)

129971

In [4]:
df.variety.describe()

count         129970
unique           707
top       Pinot Noir
freq           13272
Name: variety, dtype: object

In [5]:
nullvals = df.variety.isnull().values
len(nullvals)

129971

## Find out how many null values exist and get rid of the features with the most null vals

In [6]:

df.isna().sum()

Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [7]:
# region_2 and twitter handle have the most null values so dropping the column
df_cleaned1 = df.drop(['Unnamed: 0','region_2','taster_twitter_handle'],axis=1)
df_cleaned1

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,Roger Voss,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach
8,Germany,Savory dried thyme notes accent sunnier flavor...,Shine,87,12.0,Rheinhessen,,Anna Lee C. Iijima,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel
9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [8]:
df_cleaned2 = df_cleaned1.dropna()

In [9]:
len(df_cleaned2) 

54170

In [10]:
len(df) - len(df_cleaned2) #number of lost data points - lost over half our data points

75801

In [11]:
df_cleaned2

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam
10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Virginie Boone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
16,Argentina,"Baked plum, molasses, balsamic vinegar and che...",Felix,87,30.0,Other,Cafayate,Michael Schachner,Felix Lavaque 2010 Felix Malbec (Cafayate),Malbec,Felix Lavaque
17,Argentina,Raw black-cherry aromas are direct and simple ...,Winemaker Selection,87,13.0,Mendoza Province,Mendoza,Michael Schachner,Gaucho Andino 2011 Winemaker Selection Malbec ...,Malbec,Gaucho Andino
18,Spain,"Desiccated blackberry, leather, charred wood a...",Vendimia Seleccionada Finca Valdelayegua Singl...,87,28.0,Northern Spain,Ribera del Duero,Michael Schachner,Pradorey 2010 Vendimia Seleccionada Finca Vald...,Tempranillo Blend,Pradorey
20,US,Ripe aromas of dark berries mingle with ample ...,Vin de Maison,87,23.0,Virginia,Virginia,Alexander Peartree,Quiévremont 2012 Vin de Maison Red (Virginia),Red Blend,Quiévremont


In [12]:
list(df_cleaned2.columns.values)

['country',
 'description',
 'designation',
 'points',
 'price',
 'province',
 'region_1',
 'taster_name',
 'title',
 'variety',
 'winery']

In [13]:
#double check no null vals
df_cleaned2.isna().sum()

country        0
description    0
designation    0
points         0
price          0
province       0
region_1       0
taster_name    0
title          0
variety        0
winery         0
dtype: int64

In [14]:
rwo_df = pd.read_excel('Types of Wine - O, R, W.xlsx')
rwo_df.head()

Unnamed: 0,variety,color
0,Alvar Roxo,Orange
1,Ar110,Orange
2,Ar99,Orange
3,Barbarossa,Orange
4,Barbaroux,Orange


In [15]:
df_cleaned2.describe(include= 'all')

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
count,54170,54170,54170,54170.0,54170.0,54170,54170,54170,54170,54170,54170
unique,7,49537,23943,,,63,1004,17,48994,434,8583
top,US,"Stalky aromas suggest hay and green herbs, wit...",Reserve,,,California,Columbia Valley (WA),Roger Voss,Segura Viudas NV Extra Dry Sparkling (Cava),Pinot Noir,Columbia Crest
freq,25888,3,1159,,,13998,2488,9237,8,6736,157
mean,,,,88.96655,39.209341,,,,,,
std,,,,2.955188,36.154076,,,,,,
min,,,,80.0,4.0,,,,,,
25%,,,,87.0,20.0,,,,,,
50%,,,,89.0,30.0,,,,,,
75%,,,,91.0,48.0,,,,,,


In [16]:
df_cleaned3 = df_cleaned2.merge(rwo_df, on='variety', how='left')
df_cleaned3

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,color
0,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,White
1,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Red
2,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,Red
3,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo,Red
4,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Roger Voss,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,Orange
5,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Virginie Boone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,Red
6,Argentina,"Baked plum, molasses, balsamic vinegar and che...",Felix,87,30.0,Other,Cafayate,Michael Schachner,Felix Lavaque 2010 Felix Malbec (Cafayate),Malbec,Felix Lavaque,Red
7,Argentina,Raw black-cherry aromas are direct and simple ...,Winemaker Selection,87,13.0,Mendoza Province,Mendoza,Michael Schachner,Gaucho Andino 2011 Winemaker Selection Malbec ...,Malbec,Gaucho Andino,Red
8,Spain,"Desiccated blackberry, leather, charred wood a...",Vendimia Seleccionada Finca Valdelayegua Singl...,87,28.0,Northern Spain,Ribera del Duero,Michael Schachner,Pradorey 2010 Vendimia Seleccionada Finca Vald...,Tempranillo Blend,Pradorey,Red
9,US,Ripe aromas of dark berries mingle with ample ...,Vin de Maison,87,23.0,Virginia,Virginia,Alexander Peartree,Quiévremont 2012 Vin de Maison Red (Virginia),Red Blend,Quiévremont,Red


In [17]:
df_cleaned3.isna().sum()

country           0
description       0
designation       0
points            0
price             0
province          0
region_1          0
taster_name       0
title             0
variety           0
winery            0
color          7828
dtype: int64

In [18]:
df_cleaned4 = df_cleaned3.dropna()
df_cleaned4.color.describe()

count     46348
unique        3
top         Red
freq      30919
Name: color, dtype: object

In [19]:
df_cleaned4.groupby('color').count()


Unnamed: 0_level_0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Orange,655,655,655,655,655,655,655,655,655,655,655
Red,30919,30919,30919,30919,30919,30919,30919,30919,30919,30919,30919
White,14774,14774,14774,14774,14774,14774,14774,14774,14774,14774,14774


In [20]:
#Dropping orange to do log reg
to_drop = ['Orange']
df_cleaned5 = df_cleaned4[~df_cleaned4['color'].isin(to_drop)]
df_cleaned5.groupby('color').count()

Unnamed: 0_level_0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Red,30919,30919,30919,30919,30919,30919,30919,30919,30919,30919,30919
White,14774,14774,14774,14774,14774,14774,14774,14774,14774,14774,14774


In [21]:
df_cleaned5.describe(include= 'all')

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,color
count,45693,45693,45693,45693.0,45693.0,45693,45693,45693,45693,45693,45693,45693
unique,7,41833,20363,,,62,906,17,41334,263,7725,2
top,US,"Cigar box, café au lait, and dried tobacco aro...",Reserve,,,California,Columbia Valley (WA),Roger Voss,Segura Viudas NV Extra Dry Sparkling (Cava),Pinot Noir,Columbia Crest,Red
freq,23255,3,1095,,,12667,2240,7271,8,6736,146,30919
mean,,,,89.082266,41.385748,,,,,,,
std,,,,2.968545,37.391714,,,,,,,
min,,,,80.0,4.0,,,,,,,
25%,,,,87.0,20.0,,,,,,,
50%,,,,89.0,32.0,,,,,,,
75%,,,,91.0,50.0,,,,,,,


# Potential target variables to Predict based on data above:
- color: there are 3 unique colors
- country: there are 7 unique countries
- points - below mean or above mean?
- price - below mean or above mean?
- taster name - based on the language they used?
- the variety of wine? - pinot noir etc (there are 434 total wine varieties)

## 1. Logistic Regression - Using StatsModel

In [24]:
df_cleaned6= pd.get_dummies(df_cleaned5, columns=['color'],drop_first = True)

In [27]:
from patsy import dmatrices

y, X = dmatrices('color_White ~ country  + points + price + province + taster_name',
                  df_cleaned6, return_type = "dataframe")

In [28]:
#fit model
import statsmodels.api as sm
logit_model = sm.Logit(y, X)
result = logit_model.fit()

         Current function value: 0.476539
         Iterations: 35




In [29]:
result.summary()

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,color_White,No. Observations:,45693.0
Model:,Logit,Df Residuals:,45613.0
Method:,MLE,Df Model:,79.0
Date:,"Sun, 06 Jan 2019",Pseudo R-squ.:,0.2428
Time:,12:28:45,Log-Likelihood:,-21774.0
converged:,False,LL-Null:,-28757.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.0459,9.42e+05,-4.88e-08,1.000,-1.85e+06,1.85e+06
country[T.Australia],-0.6545,3.52e+06,-1.86e-07,1.000,-6.89e+06,6.89e+06
country[T.Canada],0.2787,,,,,
country[T.France],0.9980,9.4e+05,1.06e-06,1.000,-1.84e+06,1.84e+06
country[T.Italy],-0.6054,,,,,
country[T.Spain],-0.2076,,,,,
country[T.US],2.7635,2.3e+06,1.2e-06,1.000,-4.5e+06,4.5e+06
province[T.Andalucia],0.0843,,,,,
province[T.Arizona],-2.2793,2.69e+06,-8.48e-07,1.000,-5.27e+06,5.27e+06


In [30]:
y, X = dmatrices('color_White ~ country  + points + price + taster_name',
                  df_cleaned6, return_type = "dataframe")
logit_model = sm.Logit(y, X)
result = logit_model.fit()
result.summary()

         Current function value: 0.548269
         Iterations: 35




0,1,2,3
Dep. Variable:,color_White,No. Observations:,45693.0
Model:,Logit,Df Residuals:,45668.0
Method:,MLE,Df Model:,24.0
Date:,"Sun, 06 Jan 2019",Pseudo R-squ.:,0.1288
Time:,12:28:46,Log-Likelihood:,-25052.0
converged:,False,LL-Null:,-28757.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.9475,0.537,3.629,0.000,0.896,2.999
country[T.Australia],-1.0408,0.342,-3.040,0.002,-1.712,-0.370
country[T.Canada],0.5174,0.383,1.352,0.176,-0.232,1.267
country[T.France],-0.5923,0.339,-1.749,0.080,-1.256,0.071
country[T.Italy],0.5839,0.302,1.932,0.053,-0.008,1.176
country[T.Spain],0.6920,0.067,10.276,0.000,0.560,0.824
country[T.US],-0.9021,0.346,-2.606,0.009,-1.581,-0.224
taster_name[T.Anna Lee C. Iijima],0.7012,0.145,4.841,0.000,0.417,0.985
taster_name[T.Anne Krebiehl MW],1.8736,0.271,6.905,0.000,1.342,2.405


In [31]:
y, X = dmatrices('color_White ~ country + price + taster_name',
                  df_cleaned6, return_type = "dataframe")
logit_model = sm.Logit(y, X)
result = logit_model.fit()
result.summary()

         Current function value: 0.548296
         Iterations: 35




0,1,2,3
Dep. Variable:,color_White,No. Observations:,45693.0
Model:,Logit,Df Residuals:,45669.0
Method:,MLE,Df Model:,23.0
Date:,"Sun, 06 Jan 2019",Pseudo R-squ.:,0.1288
Time:,12:28:47,Log-Likelihood:,-25053.0
converged:,False,LL-Null:,-28757.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.3360,0.371,3.600,0.000,0.609,2.063
country[T.Australia],-1.0521,0.342,-3.072,0.002,-1.723,-0.381
country[T.Canada],0.5171,0.383,1.351,0.177,-0.233,1.267
country[T.France],-0.5985,0.339,-1.765,0.078,-1.263,0.066
country[T.Italy],0.5785,0.302,1.915,0.056,-0.014,1.171
country[T.Spain],0.6898,0.067,10.248,0.000,0.558,0.822
country[T.US],-0.8994,0.346,-2.597,0.009,-1.578,-0.221
taster_name[T.Anna Lee C. Iijima],0.6856,0.145,4.744,0.000,0.402,0.969
taster_name[T.Anne Krebiehl MW],1.8461,0.271,6.814,0.000,1.315,2.377


## 2. Logistic Regression - Using SciKit Learn

In [32]:
X = df_cleaned6[df_cleaned6.columns[:-1]]
y = df_cleaned6.color_White
X.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery
0,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
1,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
2,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
3,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
5,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Virginie Boone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature


In [37]:
price = X['price']
for row in price:
    normalized_price = (price-price.min())/ (price.max()-price.min())

normalized_price.head()

0    0.004480
1    0.030363
2    0.005475
3    0.005973
5    0.007466
Name: price, dtype: float64

In [41]:
for col in (df_cleaned6[['price','points']]):
    df_cleaned6[col] = (df_cleaned6[col]-min(df_cleaned6[col]))/ (max(df_cleaned6[col]) - min(df_cleaned6[col]))
df_cleaned6.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,title,variety,winery,color_White
0,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,0.35,0.00448,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,1
1,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,0.35,0.030363,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,0
2,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,0.35,0.005475,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,0
3,Italy,"Here's a bright, informal red that opens with ...",Belsito,0.35,0.005973,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo,0
5,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,0.35,0.007466,California,Napa Valley,Virginie Boone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,0


In [51]:
X = df_cleaned6[['price','points']]
y = df_cleaned6.color_White
X.head()

Unnamed: 0,price,points
0,0.00448,0.35
1,0.030363,0.35
2,0.005475,0.35
3,0.005973,0.35
5,0.007466,0.35


In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression(fit_intercept = False, C = 1e12) #Starter code


In [54]:
model_log = logreg.fit(X_train, y_train)
model_log



LogisticRegression(C=1000000000000.0, class_weight=None, dual=False,
          fit_intercept=False, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [55]:
y_hat_test = logreg.predict(X_test)
y_hat_train = logreg.predict(X_train)

In [57]:
residuals = y_train - y_hat_train
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

0    23111
1    11158
Name: color_White, dtype: int64
0    0.6744
1    0.3256
Name: color_White, dtype: float64


### How many times was the classifier correct for the training set?  
23111 correct, 67.44% accuracy

In [58]:
residuals = y_test - y_hat_test
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))
#63 correct, 83% accuracy

0    7808
1    3616
Name: color_White, dtype: int64
0    0.683473
1    0.316527
Name: color_White, dtype: float64


### How many times was the classifier correct for the test set?
7808 correct, 68.3473% accuracy