In [3]:
# Required Python Packages
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
 
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



In [4]:
dataset_clean_dir = "dataset_clean.csv"
dataset_clean = pd.read_csv(dataset_clean_dir,sep=',')

In [5]:
dataset_clean

Unnamed: 0,BIB,COD,ERG,FAN,GJAH,LUK,MYR,NUS,PKD,SIS,TOK,UIN,VOL,WET,KAT,XIN,Class
0,160,iii,www,80.000000,iii,5.0,eee,8.000000e+05,xxx,1.750,t,17.920000,f,1,ccc,t,n
1,153,uuu,aaa,200.000000,rrr,0.0,mmm,2.000000e+06,xxx,0.290,f,16.920000,f,0,ddd,f,n
2,5,iii,www,96.000000,iii,19.0,hh,9.600000e+05,hh,0.000,f,31.250000,f,1,ddd,t,n
3,9,iii,www,0.000000,iii,120.0,kkk,0.000000e+00,qqq,0.335,f,48.170000,f,0,ccc,f,n
4,40,iii,www,232.000000,iii,0.0,mmm,2.320000e+06,xxx,0.500,t,32.330000,f,0,ddd,f,n
5,8,iii,aaa,160.000000,rrr,0.0,kkk,1.600000e+06,jjj,0.500,t,34.830000,f,0,ccc,f,n
6,152,iii,www,276.000000,iii,1.0,lll,2.760000e+06,lll,0.000,t,26.170000,f,0,ccc,f,n
7,176,iii,aaa,280.000000,rrr,204.0,eee,2.800000e+06,jjj,0.250,f,21.170000,f,0,ddd,f,n
8,154,iii,www,220.000000,iii,140.0,eee,2.200000e+06,xxx,0.290,f,28.920000,f,0,ddd,f,n
9,95,iii,www,320.000000,iii,13.0,eee,3.200000e+06,jjj,1.085,f,18.170000,f,0,ddd,f,n


## Processing Numerical Attributes


Since all the numerical attributes have different scales, this would add a negative impact on the classifier. There are several ways one could normalize these attributes, but since there are no negative values, Ill chose a MinMax normalization in a range of 0-1.

In [6]:
from sklearn import preprocessing
import pandas
#x = dataset_clean['BIB','FAN','LUK','NUS','SIS','UIN'].values #returns a numpy array\
for var in ['BIB','FAN','LUK','NUS','SIS','UIN','WET']:
    x = dataset_clean[var].values #returns a numpy array
    x =x.reshape(-1,1)
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    dataset_clean[var] = pandas.DataFrame(x_scaled)


Data with input dtype int64 was converted to float64 by MinMaxScaler.



## Replacing Class target (y/n) by (1/0) 

In [7]:
dataset_clean.replace(('y', 'n'), (1, 0), inplace=True)

In [8]:
dataset_clean

Unnamed: 0,BIB,COD,ERG,FAN,GJAH,LUK,MYR,NUS,PKD,SIS,TOK,UIN,VOL,WET,KAT,XIN,Class
0,0.892655,iii,www,0.068966,iii,0.00005,eee,0.068966,xxx,0.061404,t,0.069884,f,0.014925,ccc,t,0
1,0.853107,uuu,aaa,0.172414,rrr,0.00000,mmm,0.172414,xxx,0.010175,f,0.053126,f,0.000000,ddd,f,0
2,0.016949,iii,www,0.082759,iii,0.00019,hh,0.082759,hh,0.000000,f,0.293280,f,0.014925,ddd,t,0
3,0.039548,iii,www,0.000000,iii,0.00120,kkk,0.000000,qqq,0.011754,f,0.576839,f,0.000000,ccc,f,0
4,0.214689,iii,www,0.200000,iii,0.00000,mmm,0.200000,xxx,0.017544,t,0.311379,f,0.000000,ddd,f,0
5,0.033898,iii,aaa,0.137931,rrr,0.00000,kkk,0.137931,jjj,0.017544,t,0.353276,f,0.000000,ccc,f,0
6,0.847458,iii,www,0.237931,iii,0.00001,lll,0.237931,lll,0.000000,t,0.208145,f,0.000000,ccc,f,0
7,0.983051,iii,aaa,0.241379,rrr,0.00204,eee,0.241379,jjj,0.008772,f,0.124351,f,0.000000,ddd,f,0
8,0.858757,iii,www,0.189655,iii,0.00140,eee,0.189655,xxx,0.010175,f,0.254232,f,0.000000,ddd,f,0
9,0.525424,iii,www,0.275862,iii,0.00013,eee,0.275862,jjj,0.038070,f,0.074074,f,0.000000,ddd,f,0


## Handling categorical variables

There are many ways to deal with categorical attributes. In this case we will start by one-hot encoding the categorical attributes. One could perform a custom encoding or dimensionality reduction on the attribute MYR for example, since it has many different categorical values. If we achieve poor accuracy, we will backtrack and handle the categorical variables in a different manner.

In [9]:
dataset_encoded = pandas.get_dummies(dataset_clean, columns=["COD", "ERG","GJAH","MYR","PKD","TOK","VOL","KAT","XIN"], prefix=["COD", "ERG","GJAH","MYR","PKD","TOK","VOL","KAT","XIN"])

In [10]:
dataset_encoded

Unnamed: 0,BIB,FAN,LUK,NUS,SIS,UIN,WET,Class,COD_iii,COD_rrr,...,PKD_qqq,PKD_xxx,TOK_f,TOK_t,VOL_f,VOL_t,KAT_ccc,KAT_ddd,XIN_f,XIN_t
0,0.892655,0.068966,0.00005,0.068966,0.061404,0.069884,0.014925,0,1,0,...,0,1,0,1,1,0,1,0,0,1
1,0.853107,0.172414,0.00000,0.172414,0.010175,0.053126,0.000000,0,0,0,...,0,1,1,0,1,0,0,1,1,0
2,0.016949,0.082759,0.00019,0.082759,0.000000,0.293280,0.014925,0,1,0,...,0,0,1,0,1,0,0,1,0,1
3,0.039548,0.000000,0.00120,0.000000,0.011754,0.576839,0.000000,0,1,0,...,1,0,1,0,1,0,1,0,1,0
4,0.214689,0.200000,0.00000,0.200000,0.017544,0.311379,0.000000,0,1,0,...,0,1,0,1,1,0,0,1,1,0
5,0.033898,0.137931,0.00000,0.137931,0.017544,0.353276,0.000000,0,1,0,...,0,0,0,1,1,0,1,0,1,0
6,0.847458,0.237931,0.00001,0.237931,0.000000,0.208145,0.000000,0,1,0,...,0,0,0,1,1,0,1,0,1,0
7,0.983051,0.241379,0.00204,0.241379,0.008772,0.124351,0.000000,0,1,0,...,0,0,1,0,1,0,0,1,1,0
8,0.858757,0.189655,0.00140,0.189655,0.010175,0.254232,0.000000,0,1,0,...,0,1,1,0,1,0,0,1,1,0
9,0.525424,0.275862,0.00013,0.275862,0.038070,0.074074,0.000000,0,1,0,...,0,0,1,0,1,0,0,1,1,0


# Classification
With the encoded dataset we can now build a classifier. We will define the features and labels (in this case only a binary target variable "Class"). Furthermore many algorithms could be chosen to evaluate the problem, depending on the amount of data, features, linearly/non-linearly seperable. 

In [11]:
features = list(dataset_encoded)
features.remove('Class')
label = 'Class'

In [12]:
data_features = dataset_encoded[features]
data_label = dataset_encoded[label]
assert len(data_features) == len(data_label)

To test the accuracy of our classifier and avoid overfitting the dataset, we will split the data into a training/test set. The test set we choose will be about 20% of the original set and we will train with 80% of the data.

In [13]:
from sklearn.model_selection import train_test_split

# Split our data
train, test, train_labels, test_labels = train_test_split(data_features,
                                                          data_label,
                                                          test_size=0.20)

Since in the task 2 logistic regression is mentioned, we will build, train and evaluate the accuracy of the predicitons made by the trained model

## Logistic Regression Model

In [14]:
from sklearn.linear_model import LogisticRegressionCV
# Initialize our classifier
logreg = LogisticRegressionCV()
# Train our classifier
model = logreg.fit(train, train_labels)
# Make predictions
preds = model.predict(test)
print('Predictions\n',preds)
from sklearn.metrics import accuracy_score
# Evaluate accuracy
print('Accuracy Score',accuracy_score(test_labels, preds))

Predictions
 [0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1

Testing the same training/test sets on a different algorithm

## Support Vector Machine with a linear kernel model

In [15]:
from sklearn.svm import SVC
# Initialize our classifier
svc = SVC(kernel='linear', decision_function_shape='ovr')
# Train our classifier
model = svc.fit(train, train_labels)
# Make predictions
preds = model.predict(test)
print('Predictions\n',preds)
from sklearn.metrics import accuracy_score
# Evaluate accuracy
print('Accuracy Score',accuracy_score(test_labels, preds))

Predictions
 [0 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1
 1 0 1 1 1 1

In [16]:
!jupyter nbconvert Task\ 1\ Binary\ Classification\ -\ Classifier.ipynb --to html

[NbConvertApp] Converting notebook Task 1 Binary Classification - Classifier.ipynb to html
[NbConvertApp] Writing 357225 bytes to Task 1 Binary Classification - Classifier.html
