### Part A

In [11]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  accuracy_score

Loading Dataset and filling in null values

In [24]:
df = pd.read_csv("CE802_P2_Data.csv")
df['F21'] = df['F21'].fillna(df['F21'].mode()[0])
x_value = df.iloc[:, :-1].values
y_value = df.iloc[:, -1].values
x_train, x_test, y_train, y_test = train_test_split(x_value, y_value, test_size=.3, random_state=42)

**Decision Tree Classifier**

Grid Seacrh for cross Validation

In [99]:
params = {
    'criterion':  ['gini', 'entropy'],
    'max_depth':  [None, 2, 4, 6, 8, 10],
    'max_features': [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8],
    'splitter': ['best', 'random']
}

In [100]:
model = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=params, cv=10, verbose=1, n_jobs = 5)
model.fit(x_train, y_train)
print(model.best_estimator_)

Fitting 10 folds for each of 168 candidates, totalling 1680 fits
DecisionTreeClassifier(criterion='entropy', max_depth=6)


Apply Decision Tree Classifier

In [102]:
DST_clf = DecisionTreeClassifier(criterion='entropy', max_depth=6, max_features=0.8)
DST_clf.fit(x_train, y_train)
DST_clf_pred = DST_clf.predict(x_test)

Accuracy of Decision Tree Classifier

In [103]:
DST_clf_accuracy = accuracy_score(y_test, DST_clf_pred)
print('Accuracy: ', DST_clf_accuracy)

Accuracy:  0.8566666666666667


**Random Forest Classifier**

Grid Seacrh for cross Validation

In [121]:
parameters = { 
    'n_estimators': [1, 1500],
    'max_depth' : [4,5,6,7,8]
}

In [122]:
model = GridSearchCV(estimator= RandomForestClassifier(), param_grid= parameters,cv=10)
model.fit(x_train, y_train)
print(model.best_estimator_)

RandomForestClassifier(max_depth=8, n_estimators=1500)


Applying Random Forest Classifier

In [105]:
RF_clf  = RandomForestClassifier(max_depth=8, n_estimators = 1500)
RF_clf.fit(x_train, y_train)
RF_clf_pred = RF_clf.predict(x_test)

Accuracy of Random Forest Classifier

In [106]:
RF_clf_accuracy = accuracy_score(y_test, RF_clf_pred)
print('Accuracy: ', RF_clf_accuracy)

Accuracy:  0.83


**KNN Classifier**

Grid Search for cross Validation

In [107]:
parameters = dict(n_neighbors = list(range(1, 20)))

In [108]:
model= GridSearchCV(KNeighborsClassifier(),param_grid= parameters, cv = 10)
model.fit(x_train, y_train)
print(model.best_estimator_)

KNeighborsClassifier(n_neighbors=13)


Applying KNN Classifier

In [110]:
KNN_clf = KNeighborsClassifier(n_neighbors=12)
KNN_clf.fit(x_train, y_train)
KNN_clf_pred = KNN_clf.predict(x_test)

Accuracy KNN Classifier

In [111]:
KNN_clf_accuracy = accuracy_score(y_test, KNN_clf_pred)
print('Accuracy: ', KNN_clf_accuracy)

Accuracy:  0.6466666666666666


**Support Vector Classifier**

Grid Search for cross Validation

In [112]:
parameters = param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

In [113]:
model =  GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
model.fit(x_train, y_train)
print(model.best_estimator_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.514 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.514 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.514 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.507 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.511 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.514 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.514 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.514 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.507 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.511 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.514 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

Applying Support Vector Classifier

In [7]:
SV_clf  = SVC(C=10, gamma=0.0001)
SV_clf.fit(x_train, y_train)
SV_clf_pred = SV_clf.predict(x_test)

Accuracy SVC

In [8]:
SV_clf_accuracy = accuracy_score(y_test, SV_clf_pred)
print('Accuracy: ', SV_clf_accuracy)

Accuracy:  0.5


**Logestic Regression Classifier**

Applying Grid Search For Cross Validation

In [2]:
parameters = {'max_iter': [20, 50, 100, 200, 500, 1000],'solver': ['newton-cg', 'saga'],'class_weight': ['balanced']}

In [5]:
model_grid = GridSearchCV(estimator= LogisticRegression(random_state=42), param_grid=parameters, verbose=1, cv=10, n_jobs=-1)
model_grid.fit(x_train, y_train)
print(model_grid.best_estimator_)

Fitting 10 folds for each of 12 candidates, totalling 120 fits




LogisticRegression(class_weight='balanced', max_iter=200, random_state=42,
                   solver='newton-cg')


Applying Logistic Regression Classifier

In [9]:
LR_clf  = LogisticRegression(class_weight='balanced', max_iter=200, random_state=42,
                   solver='newton-cg')
LR_clf.fit(x_train, y_train)
LR_clf_pred = LR_clf.predict(x_test)



Accuracy Logistic Regression Classifier

In [10]:
LR_clf_accuracy = accuracy_score(y_test, LR_clf_pred)
print('Accuracy: ', LR_clf_accuracy)

Accuracy:  0.75


**Gaussian Naive Bayes Classifier**

In [12]:
GNB_clf  = GaussianNB()
GNB_clf.fit(x_train, y_train)
GNB_clf_pred = GNB_clf.predict(x_test)

In [13]:
GNB_clf_accuracy = accuracy_score(y_test, GNB_clf_pred)
print('Accuracy: ', GNB_clf_accuracy)

Accuracy:  0.64


Accuracy 

### Part B

In [119]:
test_df = pd.read_csv('CE802_P2_Test.csv')

#Loading the dataset Test
test_data = test_df.iloc[:,:-1].copy()
test_data['F21'] = test_data['F21'].fillna(test_data['F21'].mode()[0])
pred = DST_clf.predict(test_data)

predicted = pred

#Changing last column in dataset
test_df.iloc[:,-1] = predicted

#saving the file as new one
test_df.to_csv('CE802_P2_Test_Predictions.csv', index=False, float_format='%.8g')

# IMPORTANT!! Make sure only the last column has changed
assert pd.read_csv('CE802_P2_Test.csv').iloc[:,:-1].equals(pd.read_csv('CE802_P2_Test_Predictions.csv').iloc[:,:-1])

