# Input Preprocessing 

In [20]:
import pandas as pd

train = pd.read_csv("training.csv", sep =";", thousands=r',')
valid = pd.read_csv("validation.csv", sep =";", thousands=r',')
print(train.shape)
print(train.columns)
print()
print(valid.shape)
print(valid.columns)

(3700, 19)
Index(['variable1', 'variable2', 'variable3', 'variable4', 'variable5',
       'variable6', 'variable7', 'variable8', 'variable9', 'variable10',
       'variable11', 'variable12', 'variable13', 'variable14', 'variable15',
       'variable17', 'variable18', 'variable19', 'classLabel'],
      dtype='object')

(200, 19)
Index(['variable1', 'variable2', 'variable3', 'variable4', 'variable5',
       'variable6', 'variable7', 'variable8', 'variable9', 'variable10',
       'variable11', 'variable12', 'variable13', 'variable14', 'variable15',
       'variable17', 'variable18', 'variable19', 'classLabel'],
      dtype='object')


In [21]:
#check for missing values and treat that
#but first merge both datasets to apply same preprocessing
data = train.append(valid)
print(data.head(10))

  variable1  variable2   variable3 variable4 variable5 variable6 variable7  \
0         a     1792.0     0.00054         u         g         c         v   
1         b     1692.0     0.00335         y         p         k         v   
2         b     3125.0  1125.00000         u         g        ff        ff   
3         a     4817.0  1335.00000         u         g         i         o   
4         b     3233.0    35.00000         u         g         k         v   
5         a     3483.0   125.00000         y         p         i         h   
6         a     2617.0     0.00020         u         g         j         j   
7         b     2117.0     0.00875         y         p         c         h   
8         b     2892.0     0.00375         u         g         c         v   
9         b     1817.0  1025.00000         u         g         c         h   

   variable8 variable9 variable10  variable11 variable12 variable13  \
0        175         f          t           1          t          g   

In [22]:
print(data.isnull().sum())

variable1       42
variable2       42
variable3        0
variable4       66
variable5       66
variable6       69
variable7       69
variable8        0
variable9        0
variable10       0
variable11       0
variable12       0
variable13       0
variable14     103
variable15       0
variable17     103
variable18    2256
variable19       0
classLabel       0
dtype: int64


In [23]:
#Drop variable18 as most are null, filling it will add unrealistic data to the model
data = data.drop(columns='variable18')
print(data.columns)

Index(['variable1', 'variable2', 'variable3', 'variable4', 'variable5',
       'variable6', 'variable7', 'variable8', 'variable9', 'variable10',
       'variable11', 'variable12', 'variable13', 'variable14', 'variable15',
       'variable17', 'variable19', 'classLabel'],
      dtype='object')


In [24]:
#Fill numerical Nans with median of column and discrete ones with most occuring entry

import numpy as np

data['variable1'] = data['variable1'].fillna(data['variable1'].value_counts().index[0])
data['variable2'] = data['variable2'].replace(np.nan, data['variable2'].median())
data['variable3'] = data['variable3'].replace(np.nan, data['variable3'].median())
data['variable4'] = data['variable4'].fillna(data['variable4'].value_counts().index[0])
data['variable5'] = data['variable5'].fillna(data['variable5'].value_counts().index[0])
data['variable6'] = data['variable6'].fillna(data['variable6'].value_counts().index[0])
data['variable7'] = data['variable7'].fillna(data['variable7'].value_counts().index[0])
data['variable8'] = data['variable8'].replace(np.nan, data['variable8'].median())
data['variable11'] = data['variable11'].replace(np.nan, data['variable11'].median())
data['variable14'] = data['variable14'].replace(np.nan, data['variable14'].median())
data['variable15'] = data['variable15'].replace(np.nan, data['variable15'].median())
data['variable17'] = data['variable17'].replace(np.nan, data['variable17'].median())
data['variable19'] = data['variable19'].replace(np.nan, data['variable19'].median())

In [25]:
#Re-check
print(data.isnull().sum())

variable1     0
variable2     0
variable3     0
variable4     0
variable5     0
variable6     0
variable7     0
variable8     0
variable9     0
variable10    0
variable11    0
variable12    0
variable13    0
variable14    0
variable15    0
variable17    0
variable19    0
classLabel    0
dtype: int64


# Label encoding for categorical values

In [26]:
#Extract labels
labels = data.iloc[:, -1]
features = data.iloc[:, :-1]

features = pd.get_dummies(features, columns=['variable1','variable4','variable5','variable6','variable7','variable9','variable10','variable12','variable13'])
print(features.head())

   variable2   variable3  variable8  variable11  variable14  variable15  \
0     1792.0     0.00054        175           1        80.0           5   
1     1692.0     0.00335         29           0       200.0           0   
2     3125.0  1125.00000          0           1        96.0          19   
3     4817.0  1335.00000        335           0         0.0         120   
4     3233.0    35.00000          5           0       232.0           0   

   variable17  variable19  variable1_a  variable1_b  ...  variable7_z  \
0    800000.0           0            1            0  ...            0   
1   2000000.0           0            0            1  ...            0   
2    960000.0           0            0            1  ...            0   
3         0.0           0            1            0  ...            0   
4   2320000.0           0            0            1  ...            0   

   variable9_f  variable9_t  variable10_f  variable10_t  variable12_f  \
0            1            0          

In [27]:
from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()
labelList = list(labels)
print(labelList)
labelList = encoder.fit_transform(labelList)
print(labelList)
labels = pd.DataFrame(labelList, columns=['classLabel'])

['no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'no.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.', 'yes.

In [28]:
#Split data back
X_train = features.iloc[:-200, :]
X_valid = features.iloc[3700:3900, :]
Y_train = labels.iloc[:-200]
Y_valid = labels.iloc[3700:3900]

print(X_train.shape)
print(X_valid.shape)
print(Y_train.shape)
print(Y_valid.shape)

(3700, 48)
(200, 48)
(3700, 1)
(200, 1)


In [29]:
#Feature scaling
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_valid = sc_X.transform(X_valid) 

# Binary Classification

In [31]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

Y_pred  = classifier.predict(X_valid)
print(confusion_matrix(Y_valid , Y_pred))
print(classification_report(Y_valid, Y_pred))

[[53 54]
 [47 46]]
              precision    recall  f1-score   support

           0       0.53      0.50      0.51       107
           1       0.46      0.49      0.48        93

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.50      0.49      0.50       200



  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.svm import SVC

classifier = SVC(kernel = 'poly', random_state = 0)
classifier.fit(X_train, Y_train)
y_pred = classifier.predict(X_valid)
cm_SVC = confusion_matrix(Y_valid, y_pred)
acc_SVC = ((cm_SVC[0,0] + cm_SVC[1,1])/200) * 100
print("Accuracy with Support Vector Classifier (SVC) is " , acc_SVC , '%')

In [32]:
#K nearest neighbour
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors = 7)
classifier.fit(X_train, Y_train)
y_pred = classifier.predict(X_valid)
print(confusion_matrix(Y_valid , Y_pred))
print(classification_report(Y_valid, Y_pred))

[[53 54]
 [47 46]]
              precision    recall  f1-score   support

           0       0.53      0.50      0.51       107
           1       0.46      0.49      0.48        93

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.50      0.49      0.50       200



  """
