In [1]:
# load modules and functions
from scipy.io import arff
import urllib.request
import pandas as pd
import numpy as np
import io
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the FaceAll train/test ARFF files from GitHub

url_train = "https://raw.githubusercontent.com/Carloszone/ALY-6020/master/Week%201/FaceAll/FaceAll_TRAIN.arff"
url_test = "https://raw.githubusercontent.com/Carloszone/ALY-6020/master/Week%201/FaceAll/FaceAll_TEST.arff"

# Each response is opened in a `with` block so the HTTP connection is closed
# as soon as its body has been read and parsed (the original left both open).
# loadarff needs a text stream, hence the decode + StringIO wrapper.
with urllib.request.urlopen(url_train) as arff_train:
    data_train = arff.loadarff(io.StringIO(arff_train.read().decode('utf-8')))
with urllib.request.urlopen(url_test) as arff_test:
    data_test = arff.loadarff(io.StringIO(arff_test.read().decode('utf-8')))


In [3]:
# Element 0 of the loadarff result is the record array; wrap it in a DataFrame
df_train = pd.DataFrame(data=data_train[0])
# Preview the first five rows
df_train.head(n=5)

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att123,att124,att125,att126,att127,att128,att129,att130,att131,target
0,-0.247592,-0.332503,-0.632105,-1.024779,-1.426342,-1.580426,-1.336747,-1.060614,-0.723881,-0.617905,...,-0.273452,-0.077336,-0.119991,-0.350027,-0.490925,-0.142705,0.02771,0.09079,-0.327312,b'1'
1,-0.641577,-0.93942,-1.300174,-1.385949,-1.269212,-1.036537,-0.758694,-0.63115,-0.581992,-0.539002,...,-0.607773,-0.939176,-0.640406,-0.191614,0.062604,0.030992,-0.388804,-0.581837,-0.24863,b'1'
2,-1.672048,-1.881515,-2.090981,-1.697734,-1.29507,-0.822547,-0.347806,0.73743,1.852206,2.448954,...,0.058114,0.370807,0.014227,-0.325895,-0.325895,-0.325895,-0.325895,-0.325895,-0.325895,b'1'
3,-0.463782,-1.111783,-1.747124,-2.112366,-2.409313,-2.011936,-1.557494,-0.735304,0.059425,0.728291,...,-0.061193,-0.06225,-0.062439,-0.062439,-0.370382,-0.72611,-0.760803,-0.76392,-0.76392,b'1'
4,-0.764739,-1.118529,-1.464805,-1.650747,-1.79088,-1.465283,-1.153654,-0.932041,-0.664873,-0.188906,...,-0.372712,-0.489895,-0.414688,-0.297505,-0.178539,-0.059297,-0.265746,-0.50423,-0.50423,b'1'


In [4]:
# Element 0 of the loadarff result is the record array; wrap it in a DataFrame
df_test = pd.DataFrame(data=data_test[0])
# Preview the first five rows
df_test.head(n=5)

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att123,att124,att125,att126,att127,att128,att129,att130,att131,target
0,-0.637379,-1.163642,-1.560159,-1.607592,-1.369525,-1.028345,-0.796911,-0.673459,-0.606461,-0.121185,...,-0.387549,-0.793876,-0.731225,-0.333497,-0.168134,-0.059181,-0.370224,-0.578104,-0.296681,b'1'
1,-0.262558,-0.673262,-1.083966,-1.4343,-1.781666,-1.76029,-1.700768,-1.416769,-1.096037,-0.308092,...,-0.269868,-0.337655,-0.234648,-0.175097,-0.381111,-0.558154,-0.455147,-0.455147,-0.455147,b'1'
2,-0.647722,-0.85273,-1.057737,-1.377993,-1.703917,-1.472404,-1.183225,-0.973524,-0.776829,-0.419171,...,-0.253859,-0.374525,-0.580565,-0.743433,-0.642469,-0.541118,-0.436043,-0.436043,-0.436043,b'1'
3,-0.168912,-0.707683,-1.246455,-1.258758,-1.262834,-1.286919,-1.311639,-1.232654,-1.148651,-0.767647,...,-0.336361,-0.329791,-0.223019,-0.126104,-0.232875,-0.333076,-0.226304,-0.226304,-0.226304,b'1'
4,-0.846935,-1.148083,-1.449231,-1.52779,-1.595401,-1.422643,-1.225018,-1.183937,-1.168472,-0.752264,...,-0.254825,-0.750571,-0.646832,-0.527317,-0.311391,-0.116104,-0.120328,-0.120328,-0.120328,b'1'


In [10]:
# Transform target into a numeric string.
# The labels are bytes objects (b'1', b'2', ...). Calling astype(str) on them
# yields the literal text "b'1'", so decode the bytes instead.
def _label_to_text(value):
    """Return a class label as plain text, decoding bytes labels as UTF-8."""
    # Values that are already text pass through str() unchanged, which keeps
    # this cell safe to re-run.
    return value.decode('utf-8') if isinstance(value, bytes) else str(value)


df_train['target'] = df_train['target'].map(_label_to_text)
df_test['target'] = df_test['target'].map(_label_to_text)

In [18]:
# Features are every column except the last; the last column is the label
X, Y = df_train.iloc[:, :-1], df_train.iloc[:, -1]

# 80/20 split with a fixed seed, then convert each of the four pieces
# (train_x, test_x, train_y, test_y -- in the order train_test_split returns
# them) to a numpy array
train_x, test_x, train_y, test_y = map(
    np.array,
    train_test_split(X, Y, test_size=0.2, random_state=2021),
)

In [23]:
# Hold-out set: all columns but the last are features, the last is the label
X_test, Y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

In [30]:
def rf(train_x, train_y, test_x, test_y, X_test, Y_test, n_nums, leaf_nums, Standardize = False):
    """Fit a random forest for every (tree count, min leaf size) pair and score it.

    Each model is fitted on ``train_x``/``train_y`` and scored on three sets:
    the training split itself, the ``test_x``/``test_y`` split, and the
    separate ``X_test``/``Y_test`` set.

    Parameters
    ----------
    train_x, train_y : array-like
        Features and labels the models are fitted on.
    test_x, test_y : array-like
        Features and labels scored as ``split_test_accuracy``.
    X_test, Y_test : array-like
        Features and labels scored as ``test_accuracy``.
    n_nums : iterable of int
        Values tried for ``n_estimators``.
    leaf_nums : iterable of int
        Values tried for ``min_samples_leaf``.
    Standardize : bool, default False
        When True, a ``StandardScaler`` is fitted on ``train_x`` and applied
        to all three feature sets before training.

    Returns
    -------
    pandas.DataFrame
        One row per (n, leaf) combination with the columns ``tree_nums``,
        ``minimum_leaf_sample``, ``split_train_accuracy``,
        ``split_test_accuracy`` and ``test_accuracy``.
    """
    # Convert everything to plain arrays up front. Callers pass a mix of
    # ndarrays and DataFrames; fitting on an ndarray and then predicting on a
    # DataFrame makes scikit-learn warn about mismatched feature names.
    train_x, test_x, X_test = np.asarray(train_x), np.asarray(test_x), np.asarray(X_test)
    train_y, test_y, Y_test = np.asarray(train_y), np.asarray(test_y), np.asarray(Y_test)

    # Standardize train/test sets using statistics from the training split only
    if Standardize:
        sc = StandardScaler()
        train_x = sc.fit_transform(train_x)
        test_x = sc.transform(test_x)
        X_test = sc.transform(X_test)

    # Materialise the inner grid so a one-shot iterable is not exhausted after
    # the first pass of the outer loop.
    leaf_nums = list(leaf_nums)

    # Store model information, one record per fitted model
    columns = [
        'tree_nums',
        'minimum_leaf_sample',
        'split_train_accuracy',
        'split_test_accuracy',
        'test_accuracy',
    ]
    rows = []

    # Train one model per grid point
    for n in n_nums:
        for leaf in leaf_nums:
            model = RandomForestClassifier(n_estimators = n, min_samples_leaf = leaf, random_state = 2021)
            model.fit(train_x, train_y)

            rows.append({
                'tree_nums': n,
                'minimum_leaf_sample': leaf,
                'split_train_accuracy': accuracy_score(train_y, model.predict(train_x)),
                'split_test_accuracy': accuracy_score(test_y, model.predict(test_x)),
                'test_accuracy': accuracy_score(Y_test, model.predict(X_test)),
            })

    # Passing `columns` keeps the column set and order even for an empty grid
    return pd.DataFrame(rows, columns=columns)

In [31]:
# Hyper-parameter grid: forest sizes and minimum samples per leaf
n_nums = [10, 50, 100, 500]
leaf_nums = [1, 10, 20]
# Run the grid on the raw (unscaled) features
rf(train_x, train_y, test_x, test_y, X_test, Y_test, n_nums=n_nums, leaf_nums=leaf_nums)

Unnamed: 0,tree_nums,minimum_leaf_sample,split_train_accuracy,split_test_accuracy,test_accuracy
0,10,1,0.997768,0.723214,0.606509
1,10,10,0.877232,0.598214,0.546154
2,10,20,0.752232,0.5,0.488166
3,50,1,1.0,0.875,0.743787
4,50,10,0.953125,0.714286,0.695266
5,50,20,0.848214,0.633929,0.601183
6,100,1,1.0,0.875,0.764497
7,100,10,0.959821,0.723214,0.710059
8,100,20,0.859375,0.660714,0.630178
9,500,1,1.0,0.901786,0.784024


In [32]:
# Same grid, this time with the features standardized inside rf()
rf(train_x, train_y, test_x, test_y, X_test, Y_test, n_nums, leaf_nums, Standardize=True)

Unnamed: 0,tree_nums,minimum_leaf_sample,split_train_accuracy,split_test_accuracy,test_accuracy
0,10,1,0.997768,0.723214,0.606509
1,10,10,0.877232,0.598214,0.546154
2,10,20,0.752232,0.5,0.488166
3,50,1,1.0,0.875,0.743787
4,50,10,0.953125,0.714286,0.695266
5,50,20,0.848214,0.633929,0.601183
6,100,1,1.0,0.875,0.764497
7,100,10,0.959821,0.723214,0.710059
8,100,20,0.859375,0.660714,0.630178
9,500,1,1.0,0.901786,0.784024
