# 118A Final Project
- Test 3 datasets on 3 classifiers
- perform 3 different types of cross-validation on your datasets
- use three different train test splits
- use the parameters in the paper to achieve similar results (you can use one metric like classification accuracy)
- report cross-validated classification results w/ corresponding learned hyperparameters
- 3 trials (train,validation,test) * 3 classifiers * 3 datasets * 3 partitions (20/80,50/50,80/20)


1. Build model functions
2. process datasets
3. Build visualizations

In [1]:
%matplotlib inline 
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.neural_network import MLPClassifier
from sklearn import metrics, preprocessing, utils, svm
from sklearn.model_selection import GridSearchCV, cross_val_score, validation_curve

In [7]:
def dataset_eval(X, y):
    #RandomForest with 1024 trees, and varied feature set considered at each split
    rnd_clf = RandomForestClassifier(n_estimators=1024,random_state=0) 
    dt_clf = DecisionTreeClassifier()
    gnb_clf = GaussianNB()
    
    dataset_split_list = [0.2,0.5,0.8]
    for split in dataset_split_list:
        X_train_val, X_test, y_train_val, y_test = train_test_split(X,y,test_size=split,random_state=42)                           
        print("Dataset Split Size: ",split,'\n',X_train_val.shape, X_test.shape, y_train_val.shape, y_test.shape)
        for clf in (gnb_clf,rnd_clf,dt_clf):
            clf.fit(X_train_val,y_train_val)
            cv_score = cross_val_score(clf,X_train_val,y_train_val, cv=5)
            y_pred = clf.predict(X_test)
            print(clf.__class__.__name__,'\n', accuracy_score(y_test,y_pred))
            print("Cross Val Error: ",'\n',cv_score)
    return print("Finished Evaluating")

In [8]:
def load_df(df):
    lab_enc = preprocessing.LabelEncoder()
    dataset = adult_df.apply(lab_enc.fit_transform)
    data = `dataset.values
    return data

# Adult Dataset

In [9]:
adult_df = pd.read_csv('dataset/adult.data.csv')
data = load_df(adult_df)
# label data and split them
X = data[:, 0:14]
y = data[:, 14]
dataset_eval(X,y)

Dataset Split Size:  0.2 
 (26048, 14) (6512, 14) (26048,) (6512,)
GaussianNB 
 0.8227886977886978
Cross Val Error:  
 [0.8244099  0.82303263 0.8218468  0.81531964 0.82127088]
RandomForestClassifier 
 0.8558046683046683
Cross Val Error:  
 [0.85933602 0.86065259 0.85601843 0.85429065 0.85928201]
DecisionTreeClassifier 
 0.804514742014742
Cross Val Error:  
 [0.8019574  0.81880998 0.81493569 0.8078326  0.8093684 ]
Dataset Split Size:  0.5 
 (16280, 14) (16280, 14) (16280,) (16280,)
GaussianNB 
 0.8235872235872236
Cross Val Error:  
 [0.81854467 0.82432432 0.81449631 0.81695332 0.82457757]
RandomForestClassifier 
 0.8597051597051597
Cross Val Error:  
 [0.85262512 0.85165848 0.85718673 0.85380835 0.86021505]
DecisionTreeClassifier 
 0.8148034398034398
Cross Val Error:  
 [0.801965   0.81511057 0.81019656 0.80405405 0.81044547]
Dataset Split Size:  0.8 
 (6512, 14) (26048, 14) (6512,) (26048,)
GaussianNB 
 0.819794226044226
Cross Val Error:  
 [0.82118189 0.80276285 0.82949309 0.81490015 

# Wine Dataset

In [10]:
from sklearn.datasets import load_wine
features, targets = load_wine(return_X_y=True)
dataset_eval(features,targets)

Dataset Split Size:  0.2 
 (142, 13) (36, 13) (142,) (36,)
GaussianNB 
 1.0
Cross Val Error:  
 [1.         0.96551724 0.96428571 0.89285714 1.        ]
RandomForestClassifier 
 1.0
Cross Val Error:  
 [1.         0.96551724 0.92857143 0.96428571 1.        ]
DecisionTreeClassifier 
 0.9444444444444444
Cross Val Error:  
 [0.93103448 0.93103448 0.89285714 0.92857143 0.89285714]
Dataset Split Size:  0.5 
 (89, 13) (89, 13) (89,) (89,)
GaussianNB 
 0.9887640449438202
Cross Val Error:  
 [0.9        1.         0.88235294 0.94117647 1.        ]
RandomForestClassifier 
 0.9775280898876404
Cross Val Error:  
 [0.95       1.         0.88235294 1.         1.        ]
DecisionTreeClassifier 
 0.9101123595505618
Cross Val Error:  
 [0.95       0.94444444 0.94117647 1.         0.88235294]
Dataset Split Size:  0.8 
 (35, 13) (143, 13) (35,) (143,)
GaussianNB 
 0.9440559440559441
Cross Val Error:  
 [1.         0.85714286 0.85714286 0.85714286 1.        ]
RandomForestClassifier 
 0.951048951048951
C

# Wine Classification Data Set

In [12]:
# The Wine classification dataset
# 3 classes that correspond to some typical wine varietals
white_wine_df = pd.read_csv('dataset/winequality-white.csv', sep=';',quotechar='"')
red_wine_df = pd.read_csv('dataset/winequality-red.csv', sep=';', quotechar='"')
red_wine_df.shape
red_wine_df.append(white_wine_df)
red_wine_df.shape
data = load_df(red_wine_df)
X = data[:, 0:11]
y= data[:, 11]
dataset_eval(X, y)

Dataset Split Size:  0.2 
 (26048, 11) (6512, 11) (26048,) (6512,)




GaussianNB 
 0.15202702702702703
Cross Val Error:  
 [0.16073129 0.1606049  0.14669739 0.13627602 0.14178095]




RandomForestClassifier 
 0.9508599508599509
Cross Val Error:  
 [0.94477242 0.94946401 0.95142089 0.95508867 0.95711802]




DecisionTreeClassifier 
 0.8948095823095823
Cross Val Error:  
 [0.89259189 0.90026799 0.89458525 0.90246723 0.90361213]
Dataset Split Size:  0.5 
 (16280, 11) (16280, 11) (16280,) (16280,)




GaussianNB 
 0.14975429975429974
Cross Val Error:  
 [0.14732685 0.16009777 0.14281317 0.11828289 0.14776952]




RandomForestClassifier 
 0.9523955773955773
Cross Val Error:  
 [0.94137303 0.94592117 0.95321637 0.95676343 0.9594176 ]




DecisionTreeClassifier 
 0.9017199017199017
Cross Val Error:  
 [0.88851762 0.88573174 0.89843029 0.89376158 0.89931846]
Dataset Split Size:  0.8 
 (6512, 11) (26048, 11) (6512,) (26048,)




GaussianNB 
 0.19037929975429976
Cross Val Error:  
 [0.22230539 0.23165138 0.2147806  0.21550388 0.20172009]




RandomForestClassifier 
 0.9526259213759214
Cross Val Error:  
 [0.92739521 0.94571865 0.95150115 0.96046512 0.96637998]
DecisionTreeClassifier 
 0.8899723587223587
Cross Val Error:  
 [0.88023952 0.88685015 0.89376443 0.9        0.90070367]
Finished Evaluating




# Letter Recognition Data Set

In [15]:
letter_df = pd.read_csv('dataset/letter-recognition.data.csv')
data = load_df(letter_df)
print(data.shape)
X = data[:,1:17]
y = data[:,0]
print(X.shape,y.shape)
dataset_eval(X,y)

(32560, 15)
(32560, 14) (32560,)
Dataset Split Size:  0.2 
 (26048, 14) (6512, 14) (26048,) (6512,)




GaussianNB 
 0.03163390663390663
Cross Val Error:  
 [0.03758108 0.03463452 0.03342297 0.03139445 0.01871864]




RandomForestClassifier 
 0.056511056511056514
Cross Val Error:  
 [0.05455933 0.06027555 0.05839416 0.05874422 0.055577  ]




DecisionTreeClassifier 
 0.048525798525798525
Cross Val Error:  
 [0.04921786 0.05472637 0.05474453 0.05546995 0.05306831]
Dataset Split Size:  0.5 
 (16280, 14) (16280, 14) (16280,) (16280,)




GaussianNB 
 0.03396805896805897
Cross Val Error:  
 [0.03772437 0.03147922 0.0298278  0.02809509 0.02105263]




RandomForestClassifier 
 0.05718673218673219
Cross Val Error:  
 [0.05202312 0.05745721 0.06119311 0.0521766  0.06006192]




DecisionTreeClassifier 
 0.051597051597051594
Cross Val Error:  
 [0.05019775 0.05103912 0.05565806 0.05001544 0.05139319]
Dataset Split Size:  0.8 
 (6512, 14) (26048, 14) (6512,) (26048,)




GaussianNB 
 0.02671990171990172
Cross Val Error:  
 [0.01651652 0.02045455 0.02617398 0.02950311 0.02278083]




RandomForestClassifier 
 0.055781633906633904
Cross Val Error:  
 [0.05855856 0.05454545 0.05003849 0.05900621 0.06048704]




DecisionTreeClassifier 
 0.04990786240786241
Cross Val Error:  
 [0.05405405 0.04090909 0.04849885 0.04658385 0.05891595]
Finished Evaluating


# Forest Covertype Data Set

In [22]:
from collections import Counter
data = np.loadtxt('dataset/covtype.data', delimiter=',')
covar_data.shape
# Find the most common classes
count = Counter(data[:,54])
print(count.most_common(3))
for i in range(len(data[:,54])):
    if data[i,54] == 2:
        data[i,54] = 1
    else:
        data[i,54] = 0
print(data)
print(data.shape)

data_continuous = data[:,0:10]
d_s = StandardScaler()
d_n = d_s.fit_transform(data_continuous)

X_and_Y = np.hstack([d_n, data[:,10:56]])
print(X_and_Y)

np.random.shuffle(X_and_Y)

# Try to predict the last column (the cover type, which is 7 types)
# NOTE: the largest number is "positive", while the other ones count as negative. This way it is a binary classification problem
X = X_and_Y[:5000,0:-1]
y = X_and_Y[:5000,-1]
print(X.shape, y.shape)
dataset_eval(X,y)

[(2.0, 283301), (1.0, 211840), (3.0, 35754)]
[[2.596e+03 5.100e+01 3.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.590e+03 5.600e+01 2.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.804e+03 1.390e+02 9.000e+00 ... 0.000e+00 0.000e+00 1.000e+00]
 ...
 [2.386e+03 1.590e+02 1.700e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.384e+03 1.700e+02 1.500e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.383e+03 1.650e+02 1.300e+01 ... 0.000e+00 0.000e+00 0.000e+00]]
(581012, 55)
[[-1.29780509 -0.93515698 -1.48281978 ...  0.          0.
   0.        ]
 [-1.31923485 -0.89047967 -1.61636259 ...  0.          0.
   0.        ]
 [-0.5549068  -0.14883628 -0.68156292 ...  0.          0.
   1.        ]
 ...
 [-2.04784663  0.02987297  0.38677957 ...  0.          0.
   0.        ]
 [-2.05498988  0.12816306  0.11969395 ...  0.          0.
   0.        ]
 [-2.05856151  0.08348575 -0.14739167 ...  0.          0.
   0.        ]]
(5000, 54) (5000,)
Dataset Split Size:  0.2 
 (4000, 54) (1000, 54) (4000,) (1000,)
Gaussi