## Importing the libraries

In [64]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model, ensemble, tree
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV
import matplotlib.pyplot as plt 

## Importing the data

In [65]:
# Load the combined wine-quality training and testing CSVs
# (paths are relative to the notebook's working directory).
training_csv = 'ECEN689-Fall2018/Challenges/4Files/winequality-combined-training.csv'
testing_csv = 'ECEN689-Fall2018/Challenges/4Files/winequality-combined-testing.csv'

winequality_combined_training_df = pd.read_csv(training_csv)
winequality_combined_testing_df = pd.read_csv(testing_csv)

# Report (rows, columns) for each frame: training first, then testing.
for frame in (winequality_combined_training_df, winequality_combined_testing_df):
    print(frame.shape)


(5097, 13)
(1400, 12)


In [66]:
# Short aliases for the two combined frames (same objects, no copy is made).
train, test = winequality_combined_training_df, winequality_combined_testing_df

In [67]:
# Positional layout of the training frame: columns 1..11 are used as features
# and column 12 as the target (presumably the wine 'type' label -- confirm
# against the CSV header).
feature_columns = slice(1, 12)
target_column = 12

x = winequality_combined_training_df.iloc[:, feature_columns]
y = winequality_combined_training_df.iloc[:, target_column]

In [68]:
# Features of the testing frame: the same positional columns (1..11) that were
# selected for the training features.
x_test = winequality_combined_testing_df.iloc[:,1:12]

## Implementing the decision tree classifier

In [69]:
# Baseline: a single decision tree with default hyperparameters.
# The seed is pinned (0, the same value the grid-search cell puts in its
# param_grid) so the fold scores and the fitted tree are reproducible on
# Restart & Run All.
classifier = tree.DecisionTreeClassifier(random_state=0)

# 10-fold cross-validation; train scores are kept so the train/validation gap
# can be reported below.
base_results = cross_validate(classifier, x, y, cv=10, return_train_score=True)

# Fit the estimator itself on the full training set.
classifier.fit(x, y)

# One line per cross-validation fold (labelled "epoch" in the output), numbered from 1.
for epoch, (train_score, test_score) in enumerate(
        zip(base_results['train_score'], base_results['test_score']), start=1):
    print("epoch:", epoch, "train_score:", train_score, "validation_score:", test_score)
print('-'*10)

print('Parameters of the classifier: ', classifier.get_params())
print('-'*10)

# Mean accuracy over the folds, reported as percentages.
train_score_mean = base_results['train_score'].mean()
validation_score_mean = base_results['test_score'].mean()
print("Training score mean: {:.2f}". format(train_score_mean*100))
print("Validation score mean: {:.2f}". format(validation_score_mean*100))

# "Overfitting" here is the gap between mean train and mean validation accuracy.
oft_score = train_score_mean - validation_score_mean
print("Overfitting: {:.2f}". format(oft_score*100))
print('-'*10)

epoch: 1 train_score: 0.999781992587748 validation_score: 0.984313725490196
epoch: 2 train_score: 0.999781992587748 validation_score: 0.9882352941176471
epoch: 3 train_score: 0.999781992587748 validation_score: 0.996078431372549
epoch: 4 train_score: 0.999781992587748 validation_score: 0.9784313725490196
epoch: 5 train_score: 0.999781992587748 validation_score: 0.9803921568627451
epoch: 6 train_score: 1.0 validation_score: 0.9823529411764705
epoch: 7 train_score: 0.999781992587748 validation_score: 0.9941176470588236
epoch: 8 train_score: 0.999781992587748 validation_score: 0.984313725490196
epoch: 9 train_score: 1.0 validation_score: 0.9862475442043221
epoch: 10 train_score: 0.9997820876007845 validation_score: 0.9901574803149606
----------
Parameters of the classifier:  {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'mi

## Implementing the Random forest classifier to reduce the overfitting

In [83]:
# Random forest with default hyperparameters.
# The seed is pinned (0, the same value the grid-search cell puts in its
# param_grid) because this fitted model is reused below to write the
# submission file; without a seed every run yields a different forest.
classifier = ensemble.RandomForestClassifier(random_state=0)

# 10-fold cross-validation; train scores are kept so the train/validation gap
# can be reported below.
base_results = cross_validate(classifier, x, y, cv=10, return_train_score=True)

# Fit the estimator itself on the full training set.
classifier.fit(x, y)

# One line per cross-validation fold (labelled "epoch" in the output), numbered from 1.
for epoch, (train_score, test_score) in enumerate(
        zip(base_results['train_score'], base_results['test_score']), start=1):
    print("epoch:", epoch, "train_score:", train_score, "validation_score:", test_score)
print('-'*10)

print('Parameters of the classifier: ', classifier.get_params())
print('-'*10)

# Mean accuracy over the folds, reported as percentages.
train_score_mean = base_results['train_score'].mean()
validation_score_mean = base_results['test_score'].mean()
print("Training score mean: {:.2f}". format(train_score_mean*100))
print("Validation score mean: {:.2f}". format(validation_score_mean*100))

# "Overfitting" here is the gap between mean train and mean validation accuracy.
oft_score = train_score_mean - validation_score_mean
print("Overfitting: {:.2f}". format(oft_score*100))
print('-'*10)

epoch: 1 train_score: 0.999345977763244 validation_score: 0.9941176470588236
epoch: 2 train_score: 0.9995639851754959 validation_score: 0.9901960784313726
epoch: 3 train_score: 0.999345977763244 validation_score: 1.0
epoch: 4 train_score: 0.9995639851754959 validation_score: 0.984313725490196
epoch: 5 train_score: 0.9991279703509919 validation_score: 0.9980392156862745
epoch: 6 train_score: 0.999345977763244 validation_score: 0.9941176470588236
epoch: 7 train_score: 0.999345977763244 validation_score: 0.9980392156862745
epoch: 8 train_score: 0.9991279703509919 validation_score: 0.9980392156862745
epoch: 9 train_score: 0.999128160418483 validation_score: 0.9921414538310412
epoch: 10 train_score: 0.999128350403138 validation_score: 0.9921259842519685
----------
Parameters of the classifier:  {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_sample

In [84]:
# Predict the label for every row of the combined testing set with the
# currently fitted classifier.
predicted = classifier.predict(x_test)

# Two-column submission frame: the test row Id and the predicted type.
df = pd.DataFrame({'Id': test['Id'], 'type': predicted})

# Write without the pandas index so the file contains only the two columns.
df.to_csv('winequality_combined_solution.csv', index=False)

## Hyperparameter tuning for random forest using grid search 

In [None]:
# --- Untuned baseline -------------------------------------------------------
# Seeded with 0 so it is comparable with the tuned models, which the grid
# below also seeds with 0; an unseeded baseline makes BEFORE/AFTER differences noisy.
classifier = ensemble.RandomForestClassifier(random_state=0)
# cv=None uses scikit-learn's default number of folds (3 in older releases,
# 5 from version 0.22 on).
base_results = cross_validate(classifier, x, y, cv=None, return_train_score=True)
classifier.fit(x, y)

# One line per cross-validation fold (labelled "epoch" in the output), numbered from 1.
for epoch, (train_score, test_score) in enumerate(
        zip(base_results['train_score'], base_results['test_score']), start=1):
    print("epoch:", epoch, "train_score:", train_score, "validation_score:", test_score)
print('-'*10)

print('BEFORE Tuning Parameters: ', classifier.get_params())
print('-'*10)
print("BEFORE Tuning Training score mean: {:.2f}". format(base_results['train_score'].mean()*100))
print("BEFORE Tuning validation score mean: {:.2f}". format(base_results['test_score'].mean()*100))
# "Overfitting" is the gap between mean train and mean validation accuracy.
oft_score = base_results['train_score'].mean() - base_results['test_score'].mean()
print("Overfitting before tuning: {:.2f}". format(oft_score*100))
print('-'*10)

# --- Hyperparameter grid ----------------------------------------------------
param_grid = {'n_estimators': [50, 150, 200],
              'criterion': ['gini','entropy'],  #scoring methodology; two supported formulas for calculating information gain - default is gini
              'max_depth': [2,4,6,None], #max depth tree can grow; default is none
              'min_samples_split': [5,7,10], #minimum subset size BEFORE new split (fraction is % of total); default is 2
              #'min_samples_leaf': [1,3,5], #minimum subset size AFTER new split split (fraction is % of total); default is 1
              # max features to consider when performing a split. 'sqrt' is what the
              # legacy value 'auto' meant for classifiers; 'auto' is rejected by
              # current scikit-learn releases, 'sqrt' is accepted by old and new ones.
              'max_features': [2,3,'sqrt'],
              'random_state': [0] #seed or control random number generator: https://www.quora.com/What-is-seed-in-random-number-generation
             }

# Exhaustive search over the grid, scored by accuracy with the default CV split.
tune_model = GridSearchCV(ensemble.RandomForestClassifier(), param_grid=param_grid, scoring = 'accuracy', cv = None, return_train_score=True)
tune_model.fit(x, y)

# Per-fold scores of the best parameter combination. The number of folds is
# read from the fitted search (n_splits_) instead of being hard-coded, because
# with cv=None it depends on the installed scikit-learn version.
best_index = tune_model.best_index_
for i in range(tune_model.n_splits_):
    print("epoch:",i,"train_score:",tune_model.cv_results_['split'+str(i)+'_train_score'][best_index],
    "test_score:",tune_model.cv_results_['split'+str(i)+'_test_score'][best_index])

print('-'*10)


print('AFTER Tuning Parameters: ', tune_model.best_params_)
print('-'*10)
print("AFTER Tuning Training score mean: {:.2f}". format(tune_model.cv_results_['mean_train_score'][best_index]*100))
print("AFTER Tuning validation score mean: {:.2f}". format(tune_model.cv_results_['mean_test_score'][best_index]*100))
oft_score = tune_model.cv_results_['mean_train_score'][best_index] - tune_model.cv_results_['mean_test_score'][best_index]
print("Overfitting after tuning: {:.2f}". format(oft_score*100))
print('-'*10)

## Now, using this random forest model trained on the combined dataset (which reached high training accuracy), let us predict on the red wine dataset and check the confusion matrix

In [87]:
# Load the red-wine-only training and testing CSVs
# (paths are relative to the notebook's working directory).
red_train = pd.read_csv('ECEN689-Fall2018/Challenges/4Files/winequality-red-training.csv')
print(red_train.shape)

red_test = pd.read_csv('ECEN689-Fall2018/Challenges/4Files/winequality-red-testing.csv')
print(red_test.shape)

# Same positional feature columns (1..11) that were used for the combined data.
red_x_train = red_train.iloc[:,1:12]
# Every row of this file is treated as class 1 (red). The label vector is sized
# from the loaded frame rather than a hard-coded 1199, so it stays aligned with
# red_x_train if the file ever changes.
red_y_train = np.ones(len(red_train))

(1199, 13)
(400, 12)


In [88]:
# Classify every red-wine training row with the model fitted on the combined data.
pred = classifier.predict(red_x_train)

# Rows are true labels, columns are predicted labels. The label order is pinned
# to [0, 1] so the matrix is always 2x2; without it the matrix collapses to 1x1
# whenever every prediction equals 1, and the cell positions change meaning.
# NOTE(review): assumes the combined 'type' target is encoded as 0/1 with
# 1 = red, as the recorded outputs suggest -- confirm against the CSV.
confusion_matrix(red_y_train, pred, labels=[0, 1])

array([[   0,    0],
       [   2, 1197]], dtype=int64)

In [89]:
# Show the predicted labels for the red-wine rows (NumPy abbreviates long arrays with "...").
print(pred)

[1 1 1 ... 1 1 1]


## Here we can observe that, out of 1199 red wine data points, the model trained on the combined dataset predicts 1197 as red wine. So, in this context, the model can be reused, and the higher the model's classification accuracy, the greater the benefit of reusing it.