In [218]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from prettytable import PrettyTable

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score

import warnings 
warnings.filterwarnings('ignore')

In [219]:
read_data = pd.read_csv('/Users/ankusmanish/Desktop/Training/Datasets/Week10/1625Data.txt', sep = ',', header = None)
output_variable = pd.DataFrame(read_data[1])
data = pd.DataFrame([list(read_data[0][i]) for i in range(len(read_data))])
data = pd.concat([data, output_variable], axis = 1)
data.columns = ['col_1','col_2','col_3','col_4','col_5','col_6','col_7','col_8','output']

In [220]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1625 entries, 0 to 1624
Data columns (total 9 columns):
col_1     1625 non-null object
col_2     1625 non-null object
col_3     1625 non-null object
col_4     1625 non-null object
col_5     1625 non-null object
col_6     1625 non-null object
col_7     1625 non-null object
col_8     1625 non-null object
output    1625 non-null int64
dtypes: int64(1), object(8)
memory usage: 114.3+ KB


In [221]:
data.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,output
0,S,L,N,L,R,E,T,N,1
1,A,E,C,F,R,I,F,D,1
2,H,L,V,E,A,L,Y,L,1
3,T,Q,I,M,F,E,T,F,1
4,A,E,E,L,A,E,I,F,1


In [222]:
data.nunique()

col_1     20
col_2     20
col_3     20
col_4     20
col_5     20
col_6     20
col_7     20
col_8     20
output     2
dtype: int64

In [223]:
X = data.iloc[:,:-1].values
X = pd.DataFrame(X)
y = data.iloc[:,-1].values
y = np.reshape(y, (len(y),1))

In [224]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,S,L,N,L,R,E,T,N
1,A,E,C,F,R,I,F,D
2,H,L,V,E,A,L,Y,L
3,T,Q,I,M,F,E,T,F
4,A,E,E,L,A,E,I,F


In [225]:
lb = LabelEncoder()
X_data = X.apply(lb.fit_transform)

In [226]:
ohe = OneHotEncoder(categorical_features = 'all')
X = ohe.fit_transform(X_data).toarray()

In [227]:
X.shape

(1625, 160)

In [228]:
data['output'].value_counts()

-1    1250
 1     375
Name: output, dtype: int64

In [229]:
data.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,output
0,S,L,N,L,R,E,T,N,1
1,A,E,C,F,R,I,F,D,1
2,H,L,V,E,A,L,Y,L,1
3,T,Q,I,M,F,E,T,F,1
4,A,E,E,L,A,E,I,F,1


In [230]:
algos = {'DecisionTreeClassifier':DecisionTreeClassifier(), 'RandomForestClassifier':RandomForestClassifier(), 
         'LogisticRegression':LogisticRegression(),
         'Support Vector Classifier':SVC(), 'KNeighborsClassifier':KNeighborsClassifier()}

In [231]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [232]:
def algorithms(X_train, X_test, y_train, y_test):
    
    t = PrettyTable(['Classifier', 'Accuracy'])
    
    for key, value in algos.items():
        model = value
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        
        print(key.upper(),'\n')
        t.add_row([key, accuracy_score(y_pred, y_test)])
        print('Calssification Report : \n {}'.format(classification_report(y_pred, y_test)))
        print('Confusion Matrix : \n {}'.format(confusion_matrix(y_pred, y_test)))
        print('Accuracy Score : \n {}'.format(accuracy_score(y_pred, y_test)))
        print('\n')
        print('-' * 100)
    print(t)

In [217]:
algorithms(X_train, X_test, y_train, y_test)

DECISIONTREECLASSIFIER 

Calssification Report : 
               precision    recall  f1-score   support

          -1       0.92      0.96      0.94       387
           1       0.87      0.77      0.82       150

    accuracy                           0.91       537
   macro avg       0.89      0.86      0.88       537
weighted avg       0.90      0.91      0.90       537

Confusion Matrix : 
 [[370  17]
 [ 34 116]]
Accuracy Score : 
 0.9050279329608939


----------------------------------------------------------------------------------------------------
RANDOMFORESTCLASSIFIER 

Calssification Report : 
               precision    recall  f1-score   support

          -1       0.98      0.95      0.96       417
           1       0.83      0.93      0.88       120

    accuracy                           0.94       537
   macro avg       0.91      0.94      0.92       537
weighted avg       0.95      0.94      0.94       537

Confusion Matrix : 
 [[395  22]
 [  9 111]]
Accuracy Score 