In [36]:
import pandas as pd
import numpy as np

# Our Business Use case is a Classification Problem Statement
# Importing necessary evaluation metric modules
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

In [37]:
# Loading the clean and scaled data into our dataframes
X_train = pd.read_csv("xtrain.csv")
X_test = pd.read_csv("xtest.csv")
y_train = pd.read_csv("ytrain.csv")
y_test = pd.read_csv("ytest.csv")

## Model Building

### Logistic Regression

In [38]:
# Importing Logistic Regression package
from sklearn.linear_model import LogisticRegression

# Creating LogisticRegression Class Object and fitting our Training Instances
log_clf = LogisticRegression(penalty= 'l2') # Using Ridge Regularization to penalize colinear and high parameter features
log_clf.fit(X_train, y_train)

# Using trained model to predict our Training Data
y_pred_log = log_clf.predict(X_test)

# Evaluating the performance of our prediction using Logistic Regression
log_clf_acc = accuracy_score(y_test, y_pred_log)
log_clf_precision = precision_score(y_test, y_pred_log)
log_clf_recall = recall_score(y_test, y_pred_log)
log_clf_f1 = f1_score(y_test, y_pred_log)

# Since we will using multiple models, we will create a Dataframe to store each models performance

performance = pd.DataFrame(data = [['Lasso Model', log_clf_acc, log_clf_precision, log_clf_recall, log_clf_f1]], columns= ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
performance

  return f(*args, **kwargs)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Lasso Model,0.570631,0.58312,0.709544,0.64015


### Support Vector Classifier

In [39]:
# Importing Support Vector Classfier package
from sklearn.svm import SVC

# Creating Support Vector Class Object and fitting our Training Instances
svc_clf = SVC(kernel= 'linear') # Using linear Kernel to project data into higher dimension to create best fitted hyperplane
svc_clf.fit(X_train, y_train)

# Using trained model to predict our Training Data
y_pred_svc = svc_clf.predict(X_test)

# Evaluating the performance of our prediction using Logistic Regression
svc_clf_acc = accuracy_score(y_test, y_pred_svc)
svc_clf_precision = precision_score(y_test, y_pred_svc)
svc_clf_recall = recall_score(y_test, y_pred_svc)
svc_clf_f1 = f1_score(y_test, y_pred_svc)

# Since we will using multiple models, we will create a Dataframe to store each models performance

performance = performance.append(pd.DataFrame(data = [['Support Vector Classifier (Linear)', svc_clf_acc, svc_clf_precision, svc_clf_recall, svc_clf_f1]], columns= ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']))
performance

  return f(*args, **kwargs)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Lasso Model,0.570631,0.58312,0.709544,0.64015
0,Support Vector Classifier (Linear),0.573143,0.583299,0.724585,0.64631


#### Using SVC rbf kernel to evalute model performance

In [40]:
# Creating LogisticRegression Class Object and fitting our Training Instances
svc_rbf_clf = SVC(kernel= 'rbf') # Using linear Kernel to project data into higher dimension to create best fitted hyperplane
svc_rbf_clf.fit(X_train, y_train)

# Using trained model to predict our Training Data
y_pred_svc_rbf = svc_rbf_clf.predict(X_test)

# Evaluating the performance of our prediction using Logistic Regression
svc_rbf_clf_acc = accuracy_score(y_test, y_pred_svc_rbf)
svc_rbf_clf_precision = precision_score(y_test, y_pred_svc_rbf)
svc_rbf_clf_recall = recall_score(y_test, y_pred_svc_rbf)
svc_rbf_clf_f1 = f1_score(y_test, y_pred_svc_rbf)

# Since we will using multiple models, we will create a Dataframe to store each models performance

performance = performance.append(pd.DataFrame(data = [['Support Vector Classifier (RBF)', svc_rbf_clf_acc, svc_rbf_clf_precision, svc_rbf_clf_recall, svc_rbf_clf_f1]], columns= ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']))
performance

  return f(*args, **kwargs)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Lasso Model,0.570631,0.58312,0.709544,0.64015
0,Support Vector Classifier (Linear),0.573143,0.583299,0.724585,0.64631
0,Support Vector Classifier (RBF),0.608878,0.621485,0.69917,0.658042


### Random Forest Classifier

In [41]:
# Importing Random Forest Classfier package
from sklearn.ensemble import RandomForestClassifier

# Creating Random Forest Class Object and fitting our Training Instances
# We will be creating 100 decision trees and criterion as Entropy for Random Forest
rand_clf = RandomForestClassifier(n_estimators= 100, criterion= 'entropy')
rand_clf.fit(X_train, y_train)

# Using trained model to predict our Training Data
y_pred_rand = rand_clf.predict(X_test)

# Evaluating the performance of our prediction using Logistic Regression
rand_clf_acc = accuracy_score(y_test, y_pred_rand)
rand_clf_precision = precision_score(y_test, y_pred_rand)
rand_clf_recall = recall_score(y_test, y_pred_rand)
rand_clf_f1 = f1_score(y_test, y_pred_rand)

# Since we will using multiple models, we will create a Dataframe to store each models performance

performance = performance.append(pd.DataFrame(data = [['Random Forest Classifier (n = 100)', rand_clf_acc, rand_clf_precision, rand_clf_recall, rand_clf_f1]], columns= ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']))
performance

  import sys


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Lasso Model,0.570631,0.58312,0.709544,0.64015
0,Support Vector Classifier (Linear),0.573143,0.583299,0.724585,0.64631
0,Support Vector Classifier (RBF),0.608878,0.621485,0.69917,0.658042
0,Random Forest Classifier (n = 100),0.620882,0.639432,0.677905,0.658107


#### From the above comparison we can see Random Forest is performing better than the other classifier models
#### We will use K Folder Cross Validation on Random Forest for further Model Building

In [43]:
# Importing K Fold Cross Validation
from sklearn.model_selection import cross_val_score

# Creating Object with 10 K folder iteration
accuracies = cross_val_score(estimator= rand_clf, X = X_train, y = y_train, cv = 10)

# Printing the average score and Standard deviation for each cross validation performed
print(f"Mean Accuracy : {accuracies.mean()} with Standard deviation of (+/-){accuracies.std()*2}")

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Mean Accuracy : 0.6297668777070411 with Standard deviation of (+/-)0.02381602217080276


## From the Cross Validation we can see Random Forest performs best amoung the selected Other Classifier Algorithms and hence we will use Random Forest for our model building.