In [142]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


import matplotlib.pyplot as plt
%matplotlib inline

In [143]:
#Cargar train data
beer = pd.read_csv("./beer_train.csv",
                               index_col = 0)
print(beer.shape)

# Cargar test data
beertest = pd.read_csv("./beer_test.csv",
                               index_col = 0)
print(beertest.shape)

(24066, 16)
(10314, 15)


# Data explanation

#### __Numeric__:
1. __Size(L)__: Amount brewed for recipe listed
2. __OG__ : Specific gravity of wort before fermentation
3. __FG__ : Specific gravity of wort after fermentation
4. __ABV__: Alcohol By Volume
5. __IBU__: International Bittering Units
6. __Color__: Standard Reference Method. Light to dark. Ex. 40 = black
7. __BoilSize__: Fluid at beginning of boil
8. __BoilTime__: Time wort is boiled
9. __BoilGravity__: Specific gravity of wort before the boil
10. __Efficiency__: Beer mash extraction efficiency - extracting sugars from the grain during mash
11. __MashThickness___: Amount of water per pound of grain
12. __PrimaryTemp__: Temperature at the fermenting stage
13. __PitchRate__: Yeast added to the fermentor per gravity unit - M cells/ml/deg P

#### __Categoric__:
1. __SugarScale__: Scale to determine the concentration of dissolved solids in wort
2. __BrewMethod__: Various techniques for brewing
3. __Style__: Type of brew. VARIABLE TO PREDICT

In [144]:
beer.dtypes

Size(L)          float64
OG               float64
FG               float64
ABV              float64
IBU              float64
Color            float64
BoilSize         float64
BoilTime           int64
BoilGravity      float64
Efficiency       float64
MashThickness    float64
SugarScale        object
BrewMethod        object
PitchRate        float64
PrimaryTemp      float64
Style             object
dtype: object

In [145]:
beer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24066 entries, 0 to 24065
Data columns (total 16 columns):
Size(L)          24066 non-null float64
OG               24066 non-null float64
FG               24066 non-null float64
ABV              24066 non-null float64
IBU              24066 non-null float64
Color            24066 non-null float64
BoilSize         24066 non-null float64
BoilTime         24066 non-null int64
BoilGravity      23130 non-null float64
Efficiency       24066 non-null float64
MashThickness    14200 non-null float64
SugarScale       24066 non-null object
BrewMethod       24066 non-null object
PitchRate        10932 non-null float64
PrimaryTemp      16578 non-null float64
Style            24066 non-null object
dtypes: float64(12), int64(1), object(3)
memory usage: 3.1+ MB


In [146]:
# Percentage of missing values on training data
beer.isnull().sum()/len(beer)*100

Size(L)           0.000000
OG                0.000000
FG                0.000000
ABV               0.000000
IBU               0.000000
Color             0.000000
BoilSize          0.000000
BoilTime          0.000000
BoilGravity       3.889304
Efficiency        0.000000
MashThickness    40.995595
SugarScale        0.000000
BrewMethod        0.000000
PitchRate        54.574919
PrimaryTemp      31.114435
Style             0.000000
dtype: float64

In [136]:
# Percentage of missing values on testing data
beertest.isnull().sum()/len(beertest)*100

Size(L)           0.000000
OG                0.000000
FG                0.000000
ABV               0.000000
IBU               0.000000
Color             0.000000
BoilSize          0.000000
BoilTime          0.000000
BoilGravity       3.218926
Efficiency        0.000000
MashThickness    40.391701
SugarScale        0.000000
BrewMethod        0.000000
PitchRate        54.789606
PrimaryTemp      30.744619
dtype: float64

 #  Mods to datasets

 __MF OHE__

In [137]:
beer =  pd.get_dummies(beer,columns=['SugarScale', 'BrewMethod'])
beertest = pd.get_dummies(beertest,columns= ['SugarScale', 'BrewMethod'])


In [138]:
beer.head()

Unnamed: 0_level_0,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,MashThickness,PitchRate,PrimaryTemp,Style,SugarScale_Plato,SugarScale_Specific Gravity,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,22.71,1.071,1.021,6.51,81.14,6.43,28.39,90,1.057,78.0,1.2,,20.0,American IPA,0,1,1,0,0,0
1,7.57,1.062,1.016,6.14,138.44,15.28,7.57,60,1.062,35.0,,0.5,20.0,American IPA,0,1,0,0,0,1
2,12.0,1.051,1.012,5.17,27.57,14.54,14.0,60,1.041,65.0,,0.35,20.0,Irish Red Ale,0,1,0,1,0,0
3,15.0,1.051,1.013,5.01,38.53,26.73,19.0,90,1.04,65.0,3.0,,,American Stout,0,1,1,0,0,0
4,21.77,1.061,1.013,6.25,43.2,4.08,23.66,15,1.045,35.0,,0.75,20.0,American IPA,0,1,0,0,0,1


__Drop columns with too many missing data (+40%)__

In [139]:
beer.drop(columns=['MashThickness','PitchRate','SugarScale_Specific Gravity'], inplace=True)
beertest.drop(columns=['MashThickness','PitchRate','SugarScale_Specific Gravity'], inplace=True)

In [140]:
beer2 = beer.copy()
beertest2 = beertest.copy()

beertest2.head()

Unnamed: 0_level_0,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,PrimaryTemp,SugarScale_Plato,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,20.82,15.6465,4.23184,6.21,55.13,8.2,28.01,90,11.8,70.0,25.56,1,1,0,0,0
1,20.82,1.057,1.013,5.71,51.99,6.35,28.39,60,1.042,70.0,,0,1,0,0,0
2,22.71,1.058,1.014,5.67,54.77,24.75,28.39,60,1.046,75.0,20.0,0,1,0,0,0
3,10.0,1.052,1.01,5.51,93.58,5.46,28.5,60,1.018,75.0,17.0,0,1,0,0,0
4,20.0,12.0478,1.32023,5.7,33.72,3.58,30.0,60,8.1,75.0,27.0,1,1,0,0,0


 __Save target__

In [147]:
beertarget = beer2['Style']
beertarget.head()

Id
0      American IPA
1      American IPA
2     Irish Red Ale
3    American Stout
4      American IPA
Name: Style, dtype: object

__Fill missing data__

In [125]:
#Replace null values in TRAIN BoillGravity with mean per Style group  & drop initial BoilGravity
beer2 = beer.groupby(['Style']).transform(
    lambda x: x.fillna(x.mean()))
beer2.head()

Unnamed: 0_level_0,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,PrimaryTemp,SugarScale_Plato,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,22.71,1.071,1.021,6.51,81.14,6.43,28.39,90,1.057,78.0,20.0,0,1,0,0,0
1,7.57,1.062,1.016,6.14,138.44,15.28,7.57,60,1.062,35.0,20.0,0,0,0,0,1
2,12.0,1.051,1.012,5.17,27.57,14.54,14.0,60,1.041,65.0,20.0,0,0,1,0,0
3,15.0,1.051,1.013,5.01,38.53,26.73,19.0,90,1.04,65.0,19.538883,0,1,0,0,0
4,21.77,1.061,1.013,6.25,43.2,4.08,23.66,15,1.045,35.0,20.0,0,0,0,0,1


In [126]:
beertest2 = beertest2.fillna(beer2.mean())

beertest2.head()


Unnamed: 0_level_0,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,PrimaryTemp,SugarScale_Plato,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,20.82,15.6465,4.23184,6.21,55.13,8.2,28.01,90,11.8,70.0,25.56,1,1,0,0,0
1,20.82,1.057,1.013,5.71,51.99,6.35,28.39,60,1.042,70.0,19.733223,0,1,0,0,0
2,22.71,1.058,1.014,5.67,54.77,24.75,28.39,60,1.046,75.0,20.0,0,1,0,0,0
3,10.0,1.052,1.01,5.51,93.58,5.46,28.5,60,1.018,75.0,17.0,0,1,0,0,0
4,20.0,12.0478,1.32023,5.7,33.72,3.58,30.0,60,8.1,75.0,27.0,1,1,0,0,0


In [127]:
beer2.isnull().sum()/len(beer2)*100
beertest2.isnull().sum()/len(beertest2)*100

Size(L)                    0.0
OG                         0.0
FG                         0.0
ABV                        0.0
IBU                        0.0
Color                      0.0
BoilSize                   0.0
BoilTime                   0.0
BoilGravity                0.0
Efficiency                 0.0
PrimaryTemp                0.0
SugarScale_Plato           0.0
BrewMethod_All Grain       0.0
BrewMethod_BIAB            0.0
BrewMethod_Partial Mash    0.0
BrewMethod_extract         0.0
dtype: float64

 __Create target on train data__

 __Normalization__

In [128]:
scaler = StandardScaler(with_mean=0, with_std=1)
scaler.fit(beer2)
X_train_4 = scaler.transform(beer2)
X_test_4 = scaler.transform(beertest2)




 __Random Forest__

In [129]:
rfc = RandomForestClassifier(n_estimators=1000, random_state=101, n_jobs=-1)
rfc.fit(X_train_4, beertarget)
y_pred_rfc_v7 = rfc.predict(X_test_4)



In [130]:
y_pred_rfc_v7 = pd.DataFrame(data = y_pred_rfc_v7, 
                      index = beertest2.index,
                      columns = ['Style'])

y_pred_rfc_v7.to_csv("Submission_RandomForest_v6_extradrop.csv")

