In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
import seaborn as sn

In [2]:
# Read the training table and the submission (test) table from the
# current working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Keep the test-row identifiers aside; they are paired with the
# predictions when the submission frame is built.
test_id = test_data["id"]

In [3]:
## Data Parsing
# Columns of the input tables that are fed to the model.
features = [
    'totalSulfurDioxide',
    'freeSulfurDioxide',
    'residualSugar',
    'alcohol',
    'volatileAcidity',
    'fixedAcidity',
]
print('Number of features: ', len(features))

# Separate the training table into the input matrix and the target column.
x_train = train_data.loc[:, features]
y_train = train_data['quality']


Number of features:  6


In [4]:
# get actual test data
# Select the model's input columns from the submission (test) table.
actual_x_test = test_data[features]

# Standardize every column: subtract its mean, divide by its standard deviation.
# The statistics are requested per column explicitly (axis=0). `np.mean(df)` and
# `np.std(df)` forward `axis=None` to pandas, which on pandas >= 2.0 reduces the
# whole frame to a single scalar for the mean and is deprecated for the std.
# `ddof=0` keeps the population standard deviation that `np.std` computed.
actual_x_test_mu = actual_x_test.mean(axis=0)
actual_x_test_sigma = actual_x_test.std(axis=0, ddof=0)
actual_x_test_norm = (actual_x_test - actual_x_test_mu) / actual_x_test_sigma

# NOTE(review): the test features are scaled with their own mean/std, whereas the
# model is fitted on features scaled with the training mean/std. Reusing the
# training statistics here would keep both in the same space -- confirm intent.
print(actual_x_test_norm.head(5))

   totalSulfurDioxide  freeSulfurDioxide  residualSugar   alcohol  \
0           -1.123105          -0.213367      -0.783513 -0.282301   
1           -1.123105          -0.213367      -0.783513 -0.282301   
2           -1.238886          -0.812616      -0.868565 -1.242198   
3           -1.624823          -0.649185      -0.655935  1.717484   
4           -1.721308          -1.193957      -0.677198  1.557501   

   volatileAcidity  fixedAcidity  
0         0.510416      0.353763  
1         0.510416      0.353763  
2         0.694575      1.774963  
3         1.584681     -1.168951  
4        -0.103450      1.571935  


In [5]:
# feature normalization
# Standardize every training column: subtract its mean, divide by its standard
# deviation. The reductions are requested per column explicitly (axis=0):
# `np.mean(df)` / `np.std(df)` forward `axis=None` to pandas, which on
# pandas >= 2.0 reduces the whole frame to a single scalar for the mean and is
# deprecated for the std. `ddof=0` keeps the population standard deviation
# that `np.std` computed.
x_train_mu = x_train.mean(axis=0)
x_train_sigma = x_train.std(axis=0, ddof=0)
x_train_norm = (x_train - x_train_mu) / x_train_sigma

In [6]:
# train-test-split
from sklearn.model_selection import train_test_split
# 75% of the normalized training rows are used for fitting; the remainder is
# held out for validation. The split is seeded so it is repeatable.
train_x, val_x, train_y, val_y = train_test_split(
    x_train_norm,
    y_train,
    train_size=0.75,
    random_state=1,
)

from sklearn.ensemble import RandomForestClassifier

# Shared record of validation MAE values; get_mae() appends to it on every call.
error = list()
def get_mae(max_leaf_nodes, train_x, val_x, train_y, val_y, error):
    """Fit a random forest with a leaf-node cap and score it on validation data.

    Parameters
    ----------
    max_leaf_nodes : passed straight to ``RandomForestClassifier``.
    train_x, train_y : features and targets the forest is fitted on.
    val_x, val_y : features and targets used for scoring.
    error : list that receives the computed score (mutated in place).

    Returns
    -------
    The mean absolute error between ``val_y`` and the forest's predictions
    for ``val_x``; the same value is appended to ``error``.
    """
    # The seed is fixed so every candidate setting is compared on equal footing.
    forest = RandomForestClassifier(random_state=5, max_leaf_nodes=max_leaf_nodes)
    forest.fit(train_x, train_y)

    validation_mae = mean_absolute_error(val_y, forest.predict(val_x))
    error.append(validation_mae)
    return validation_mae


In [7]:
# Using In Sample Data
# Fit on the full normalized training set and score on those very same rows,
# so this number reflects how well the forest reproduces data it has seen.
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(x_train_norm, y_train)
predicted = rf_model.predict(x_train_norm)

in_sample_mae = round(mean_absolute_error(y_train, predicted), 10)
print("\nIn sample MAE ", in_sample_mae, sep="")


In sample MAE 0.0


In [8]:
# Using Out Sample Data
# Fit on the training part of the split only and score on the held-out
# validation rows, which the forest has not seen during fitting.
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(train_x, train_y)
val_predictions = rf_model.predict(val_x)

out_sample_mae = round(mean_absolute_error(val_y, val_predictions), 5)
print("Out Sample MAE ", out_sample_mae, sep="")

Out Sample MAE 0.38434


In [9]:
# get node size
# Ask how many candidate settings to try, then read one max_leaf_nodes value
# per attempt.
node_size = int(input("\nEnter attempt size: "))
print("")
# range() simply reads nothing for a zero or negative count; the previous
# `while i != node_size` counter would never terminate on a negative count.
amt_leaf = [int(input("Enter max leaf nodes: ")) for _ in range(node_size)]

## take MAE per leaf node
# Start from an empty record: get_mae() appends to the shared `error` list, and
# later cells rely on `error` lining up one-to-one with `amt_leaf`. Without the
# reset, re-running this cell would stack stale scores in front of the new ones.
error.clear()
for max_leaf_nodes in amt_leaf:
    mae = get_mae(max_leaf_nodes, train_x, val_x, train_y, val_y, error)


Enter attempt size:  5





Enter max leaf nodes:  10
Enter max leaf nodes:  100
Enter max leaf nodes:  1000
Enter max leaf nodes:  10000
Enter max leaf nodes:  20000


In [10]:
# Tabulate each validation MAE against the max_leaf_nodes value that produced it.
# The list is passed as the index directly: wrapping it in another list
# (`index=[amt_leaf]`) makes pandas build a one-level MultiIndex of 1-tuples
# instead of a plain index of the leaf-node values.
error_sum = pd.Series(error, index=amt_leaf)
print("\nError Summary\n",error_sum)


Error Summary
 10       0.551451
100      0.490765
1000     0.390501
10000    0.390501
20000    0.390501
dtype: float64


In [13]:
# take best mae
# Find the smallest recorded validation MAE and the candidate that produced it
# (the first one, if several candidates tie).
lowest_mae = min(error)
loc_best_mae = error.index(lowest_mae)
# Despite its name, `best_depth` holds a max_leaf_nodes value, not a tree depth.
best_depth = amt_leaf[loc_best_mae]
print("\nbest depth: ",best_depth)

# creating and fitting rf model with best parameter
# NOTE(review): max_depth=7 and random_state=30 were not part of the validation
# runs in get_mae() (which used random_state=5 and no depth limit), so this is
# not exactly the configuration that was scored -- confirm this is intended.
rf_model = RandomForestClassifier(
    max_depth=7,
    max_leaf_nodes=best_depth,
    random_state=30,
)
rf_model.fit(x_train_norm, y_train)

# use test data to predict
rf_output = rf_model.predict(actual_x_test_norm)
print(len(rf_output))

# Pair every test id with its predicted quality for the submission table.
submit = pd.DataFrame({'id': test_id, 'quality': rf_output})
print("Array Shape: ", submit.shape)


best depth:  1000
1949
Array Shape:  (1949, 2)
