In [1]:
#Pandas and numpy are used for data manipulation
import pandas as pd
import numpy as np

#matplotlib.pyplot is used to create plot
import matplotlib.pyplot as plt
%matplotlib inline

#GridSearchCV is a classifier which have important use for the project and also have ability to do cross-validation
from sklearn.model_selection import GridSearchCV

#Decision tree
from sklearn import tree
#LogisticRegression
from sklearn.linear_model import LogisticRegression
#Support Vector Machine
from sklearn import svm

#This shows results of the machine learning algorithm's prediction
from sklearn.metrics import classification_report,confusion_matrix

#train_test_split splits dataset to training and test data and able to randomize the data
from sklearn.model_selection import train_test_split

#Pipeline is an object able developer to do preprocessing and then use the machine learning algorithm
from sklearn.pipeline import Pipeline
#Scaledown the values in the data
from sklearn.preprocessing import StandardScaler

#SimpleImputer is a class which have functions able us to fill missing values in the dataset 
from sklearn.impute import SimpleImputer

In [2]:
#Get the training data 
TrainingData = pd.read_csv("CE802_P2_Data.csv")

##X will contain features in the TrainingData
X = TrainingData.drop('Class',axis=1)
##variable y will contain classes 
y = TrainingData['Class']


In [3]:
#Split the data
#X will be splitted to X_train and X_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=101)
#Initiate the imputer to deal with missing value within the dataset
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
#initiate StandardScaler
sc=StandardScaler()
#initiate Decision Tree Classifier
dtree = tree.DecisionTreeClassifier()
#Initiate Pipeline. steps is a parameter indicate
#how preprocessising will work 1) Imputer deals with the missing values 2) Standardscaler 3) use the classifier  
pipe = Pipeline(steps=[('imputer',imp),('scaler',sc), ('classifier', dtree)])
#Paramters which will be used. 
#classifier__criterion will decide either gini criterion ('impurity in the node') or entropy (information gain)
param_grid = {'classifier__criterion':['gini','entropy']}
#Pipe will be used within the GridSearchCV. Fit and prediction will be done according to dtree
dtree_gscv = GridSearchCV(pipe, param_grid, cv=10)
##Fit the model with decision tree
dtree_gscv.fit(X_train,y_train) 
#Make predictions
tree_predictions = dtree_gscv.predict(X_test)
#Print the classification report which shows precision, recall, f1-score for both classes which are false and true
print(classification_report(y_test,tree_predictions))

              precision    recall  f1-score   support

       False       0.82      0.78      0.80       244
        True       0.76      0.80      0.78       206

    accuracy                           0.79       450
   macro avg       0.79      0.79      0.79       450
weighted avg       0.79      0.79      0.79       450



In [4]:
dtree_gscv.best_params_

{'classifier__criterion': 'entropy'}

In [5]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=101)
#Model being used is Support vector classifier which is Suppor Vector Machine for classifiers
model_SVC = svm.SVC()
#initiate StandardScaler
sc = StandardScaler()
#Initiate the imputer to deal with missing value within the dataset
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
#Initiate Pipeline. steps is a parameter indicate
#how preprocessising will work 1) Imputer deals with the missing values 2) Standardscaler 3) use the classifier  
pipe = Pipeline(steps=[('imputer',imp),('scaler',sc),('classifier', model_SVC)])
#param_grid empty means that Support vector classifier will be work with default hyperparameters
param_grid = {} 
grid = GridSearchCV(pipe,param_grid, cv=10)
#Fit the model
grid.fit(X_train,y_train)
#Make the predictions
predictions = grid.predict(X_test)
#show results
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

       False       0.82      0.83      0.82       244
        True       0.79      0.78      0.79       206

    accuracy                           0.81       450
   macro avg       0.81      0.80      0.81       450
weighted avg       0.81      0.81      0.81       450



In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=101)
#Normalize the values in features with StandardScaler
sc = StandardScaler()


#Get the support vector classifier
model_SVC = svm.SVC()
#Initiate the imputer to deal with missing value within the dataset
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
#Initiate Pipeline. steps is a parameter indicate
#how preprocessising will work 1) Imputer deals with the missing values 2) Standardscaler 3) use the classifier  
pipe = Pipeline(steps=[('imputer',imp),('scaler',sc), ('classifier', model_SVC)])
#C is the value that tunes the underfitting vs overfitting trade-off. Gamma is used in gamma radial basis function 
#which is also tunes underfitting vs overfitting trade-off, kernel is the kernel which will be selected
param_grid = {'classifier__C': [0.1,1, 10, 100, 1000], 'classifier__gamma': [1,0.1,0.01,0.001,0.0001], 'classifier__kernel': ['rbf']} 
#Use GridSearchCV: paramter model_SVC is support vector classifier, param_grid are parameters
grid_SVC = GridSearchCV(pipe,param_grid,cv=10)
#Fit the model
grid_SVC.fit(X_train,y_train)
#Make the predictions
grid_predictions = grid_SVC.predict(X_test)
#show results
print(classification_report(y_test,grid_predictions))



              precision    recall  f1-score   support

       False       0.90      0.89      0.90       244
        True       0.88      0.89      0.88       206

    accuracy                           0.89       450
   macro avg       0.89      0.89      0.89       450
weighted avg       0.89      0.89      0.89       450



In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35,random_state=86)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
pipe = Pipeline(steps=[('imputer',imp), ('scaler',sc),('classifier', grid_SVC)])

_search_svc = GridSearchCV(pipe, param_grid,cv = 10)

#fit the training data
_search_svc.fit(X_train, y_train)
#make predictions
_search_svc_predictions = _search_svc.predict(X_test)
#show results
print(classification_report(y_test,_search_svc_predictions))

ValueError: Invalid parameter C for estimator GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('imputer',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
                                                      strategy='mean',
                                                      verbose=0)),
                                       ('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('classifier',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_...
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'classifier__C': [0.1, 1, 10, 100, 1000],
                         'classifier__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'classifier__kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
grid_SVC.best_params_

In [8]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=101)

sc = StandardScaler()
#Logistic regerssion without grid search
logmodel = LogisticRegression()

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
pipe = Pipeline(steps=[('imputer',imp), ('scaler',sc),('classifier', logmodel)])

param_grid = {} 

search_log = GridSearchCV(pipe, param_grid,cv = 10)
#fit the training data
search_log.fit(X_train, y_train)
#make predictions
grid_predictions_log = search_log.predict(X_test)
#show results
print(classification_report(y_test,grid_predictions_log))


              precision    recall  f1-score   support

       False       0.82      0.86      0.84       244
        True       0.82      0.78      0.80       206

    accuracy                           0.82       450
   macro avg       0.82      0.82      0.82       450
weighted avg       0.82      0.82      0.82       450



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=101)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
pipe = Pipeline(steps=[('imputer',imp), ('scaler',sc),('classifier', search_log)])

_search_log = GridSearchCV(pipe, param_grid,cv = 10)

#fit the training data
_search_log.fit(X_train, y_train)
#make predictions
grid_predictions_log = _search_log.predict(X_test)
#show results
print(classification_report(y_test,grid_predictions_log))

In [None]:
#Get the testing data
TestingData = pd.read_csv("CE802_P2_Test.csv")

In [None]:
#Create variable which is copy of TestingData
TestingData_original = TestingData.copy()

In [None]:
#Get the test data's features
X_test = TestingData_original.drop('Class',axis=1)
#Get the testing data's "Class" values
y_test = TestingData_original['Class']

In [None]:
#Change values in Target columns in TestingData_orig
TestingData_original.loc[:,"Class"] =  grid_SVC.predict(X_test)

In [None]:
TestingData_original

In [None]:
#Save TestingData_orig dataset to computer
TestingData_original.to_csv('output/CE802_P2_Test.csv')