In [None]:
#---------------------------------------------------------------------------------------------------//
#---- Date: 26 Oct 2018
#---- By : Ahmed Eissa
#---- Description: Grid Search for Random forest hyper parameters
#---- Dataset: SUSY [ https://archive.ics.uci.edu/ml/datasets/SUSY ]
#---------------------------------------------------------------------------------------------------//

In [1]:
#import needed package
#---------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import datetime
from sklearn import preprocessing
import datetime

In [4]:
#Load Data from CSV
#--------------------------------------------------------
filepath = 'D:\Technology\Auto-ML\Data\SUSY.csv'
rawdata = pd.read_csv( filepath  , sep=',' , low_memory=False)
rawdata.shape

(4999999, 19)

In [None]:
#Check the data
#-------------------------------------------------------
# 1- display statistics
rawdata.describe()

In [None]:
# 2- display datatypes
print (rawdata.dtypes)

In [5]:
# 3- display datatypes
Columns = ['Label' , 'c1' , 'c2' , 'c3' , 'c4' , 'c5' , 'c6' ,\
           'c7' , 'c8' , 'c9' , 'c10' , 'c11' , 'c12' , 'c13' , \
           'c14' , 'c15' , 'c16' , 'c17' , 'c18']
rawdata.columns = Columns
# 4 - check  number of clasess
rawdata.Label.unique()

array([1., 0.])

In [6]:
# 5 - cast Label
rawdata[['Label']] = rawdata[['Label']].astype(int)

In [7]:
# split data into Testing and training
# ---------------------------------------------------------------------------------------------------------------------
np.random.seed(1234)
msk = np.random.rand(len(rawdata)) < 0.8
training_data = rawdata[msk]
Testing_data = rawdata[~msk]

X_train = training_data.loc[:, training_data.columns != 'Label']
Y_train = training_data.loc[:, training_data.columns == 'Label']

X_test = Testing_data.loc[:, Testing_data.columns != 'Label']
Y_test = Testing_data.loc[:, Testing_data.columns == 'Label']

In [None]:
# Save Training and Testing Data to CSV
#-------------------------------------------------------------------------------------------------------------------
training_data.to_csv('C:\\Data\\training_data.csv', sep=',', encoding='utf-8' )
Testing_data.to_csv('C:\\Data\\testing_data.csv', sep=',', encoding='utf-8' )

In [9]:
# Decison Tree
#-------------------------------------------------------------------------------------------------------------------
def DT_Train( maxdepth) :
    t1 = datetime.datetime.now(max_depth)
    clf = DecisionTreeClassifier(criterion='gini', max_depth=maxdepth,
                max_features=None, max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=2, min_samples_split=2,splitter='best')
    clf.fit(X_train, Y_train)
    t2 = datetime.datetime.now()
    y_pred = clf.predict(X_test)    
    result = str(maxdepth) +  "," + str( accuracy_score(Y_test,y_pred)*100) + "," + str((t2 - t1).total_seconds()) + "\n"
    return result

Time:202.101269
Accuracy:78.95200744349845


In [None]:
# Random Forest
#---------------------------------------------------------------------------------------------------------------------
def RF_Train( trees, maxdepth, impurity , features) :
    t1 = datetime.datetime.now()
    clf = RandomForestClassifier(n_estimators =  trees , max_depth= maxdepth , criterion=impurity , max_features=features )
    clf.fit(X_train, Y_train.values.ravel())
    t2 = datetime.datetime.now()

    # Predict & Score
    #----------------------------------------------------------------
    y_pred = clf.predict(X_test)
    result = str(trees) + "," + str(maxdepth) + "," + str(impurity) + "," + str(features) + "," + str( accuracy_score(Y_test,y_pred)*100) + "," + str((t2 - t1).total_seconds()) + "\n"
    return result

In [None]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_Scaled = scaler.transform(X_train)
X_test_Scaled = scaler.transform(X_test)

In [None]:
# MLP
#-------------------------------------------------------------------------------------------------------------------

t1 = datetime.datetime.now()
clf = MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto', 
       hidden_layer_sizes=(20,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, solver='lbfgs', tol=0.0001)
clf.fit(X_train_Scaled, Y_train.values.ravel())
t2 = datetime.datetime.now()
print("Time:" + str((t2 - t1).total_seconds()))

y_pred = clf.predict(X_test_Scaled)
print("Accuracy:" + str(accuracy_score(Y_test,y_pred)*100))

In [None]:
# Logistic Regression
#--------------------------------------------------------------------------------------------------------------------
t1 = datetime.datetime.now()
clf = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', solver='liblinear', tol=0.0001)
clf.fit(X_train_Scaled, Y_train.values.ravel())
t2 = datetime.datetime.now()
print("Time:" + str((t2 - t1).total_seconds()))

y_pred = clf.predict(X_test_Scaled)
print("Accuracy:" + str(accuracy_score(Y_test,y_pred)*100))

In [None]:
res = "" ;
Results = [""] 
trees = [10,50,100]
maxdepth = [5,10,15]
impurity = ['gini', 'entropy']
features = ['all', 'sqrt' , 'log2']


for i in range(len(trees)):
    for j in range(len(maxdepth)):
        for k in range(len(impurity)):
            for l in range(len(features)):
                if (features[l] == 'all'):
                    Results.append(RF_Train(trees[i],maxdepth[j],impurity[k],1.0)) 
                else:
                    #Results.append(RF_Train(trees[i],maxdepth[j],impurity[k],features[l]))
                    res = res +  RF_Train(trees[i],maxdepth[j],impurity[k],features[l])


print(res) 