In [None]:
# Importing requisite libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
import os as os
from skimpy import skim

In [None]:
# Setting the working directory
#
# The project folder can be overridden with the WIN_PREDICTION_DIR environment variable so the
# notebook can run on machines other than the author's; when the variable is not set, the
# original location is used unchanged.

PROJECT_DIR = os.environ.get(
    "WIN_PREDICTION_DIR",
    "E:\\Final Project\\Other final year project files\\WinPrediction (2)\\Win Prediction",
)
os.chdir(PROJECT_DIR)

In [None]:
# List of files

os.listdir()

In [None]:
# Read the win-prediction workbook into a DataFrame, then display it

win_pred = pd.read_excel("Win_Prediction_Data.xlsx", engine="openpyxl")
win_pred

In [None]:
# columns and their respective data types

# There are missing values in the column "client category"

win_pred.info()

In [None]:
# skim gives a more comprehensive overview of the dataset, including the percentile, data types, missing values, and their distribution

skim(win_pred)

In [None]:
# Column names

win_pred.columns

In [None]:
# Description of categorical variables

# Since "client category" has missing values, and it is a categorical variable, we'll impute the missing values using mode

win_pred.describe(include="object")

In [None]:
# We're calculating the percentage of missing values in "client category"; the value is 0.78%, which is minute, and values can be easily imputed

(win_pred["Client Category"].isnull().sum()/len(win_pred["Client Category"]))*100

In [None]:
# Count of deal status code

# Around 64% of the deals are shown to be in "lost"; the outcome is slightly imbalanced (or) skewed towards "lost"

win_pred["Deal Status Code"].value_counts()

In [None]:
# Plot for above code

sns.countplot(x="Deal Status Code", data=win_pred)
plt.show()

In [None]:
# Heatmap to display missing values in the dataset

sns.heatmap(win_pred.isnull(), yticklabels = False,cbar = True)

In [None]:
# Looking at null values in the dataset

win_pred.isnull().sum()

In [None]:
# Imputation of Client Category with mode "Others"
#
# The filled column is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form works on an intermediate
# object and is not guaranteed to update win_pred (pandas warns about this chained assignment,
# but warnings are silenced at the top of the notebook).

win_pred["Client Category"] = win_pred["Client Category"].fillna("Others")
win_pred.isnull().sum()

In [None]:
# Saving the cleaned data to another excel file for EDA

win_pred.to_excel("cleaned_win_pred.xlsx")

In [None]:
# Cross-Tabulation to show how many deals are won (or) lost in each client category

pd.crosstab(win_pred["Client Category"],win_pred["Deal Status Code"])

In [None]:
# Dropping columns "solution type", "solution date", "sector"
#
# NOTE(review): the columns are selected by position (2, 3, 4), not by name, and the frame is
# modified in place, so this cell is not idempotent - running it a second time drops three
# more columns. Confirm that positions 2-4 really are the columns named above; dropping by
# name would make the intent explicit.

win_pred.drop(win_pred.columns[[2,3,4]], axis=1, inplace=True)

In [None]:
# Creating dummy variables
#
# Categorical columns are one-hot encoded (dropping the first level of each) so that they can
# be fed to the models as numbers; every resulting column is then cast to int.

win_pred_dummy = pd.get_dummies(win_pred, drop_first=True).astype(int)
win_pred_dummy

# Support Vector Machine (SVM)

In [None]:
# Separate the encoded dataset into predictors and target
#
# "x" holds every column except the target; "y" holds the target column "Deal Status Code_Won"

x = win_pred_dummy.drop(columns=["Deal Status Code_Won"])

y = win_pred_dummy["Deal Status Code_Won"]

In [None]:
#Import train and test split module from sklearn

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state = 42)

In [None]:
# Creating our support vector machine model (SVC) with hyperparameters

from sklearn.svm import SVC

sig_svc = SVC(kernel = "sigmoid", C = 1.0, random_state= 42, probability=True)

In [None]:
# Training our model

sig_svc.fit(x_train, y_train)

In [None]:
# Support vector of our trained model

sig_svc.support_vectors_

In [None]:
# Length of the support vectors

len(sig_svc.support_vectors_[0])

In [None]:
# Predicting on the test data with our trained model

sig_svc_pred_test = sig_svc.predict(x_test)

In [None]:
# Output

sig_svc_pred_test

In [None]:
# Counting the total of True and False outputs in our test model

np.unique(sig_svc_pred_test, return_counts= True)

In [None]:
# Count of each class in the actual test labels ("y_test")

y_test[0:].value_counts()

In [None]:
# Confusion Matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y_test, sig_svc_pred_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()
plt.show()

# The model predicts more losses than wins, as expected, since the dataset is slightly skewed towards "Lost" (see the "Deal Status Code" counts above)

In [None]:
# Classification report

from sklearn.metrics import classification_report
print(classification_report(y_test, sig_svc_pred_test))

In [None]:
# ROC Curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Continuous decision scores for the test data. roc_curve needs one score per sample (not the
# predicted 0/1 labels) so it can sweep over thresholds; with hard labels the "curve" collapses
# to a single operating point and the area under it is not a meaningful AUC.

sig_svc_scores_test = sig_svc.decision_function(x_test)

# False Positive Rate and True Positive Rate at every threshold, and the area under the curve

false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, sig_svc_scores_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.figure()

#Plot false positive and true positive rate and area under the curve value

plt.plot(false_positive_rate, true_positive_rate, label = "Support Vector (area = %0.2f)" % roc_auc)

#single dotted red line (performance of a random classifier)

plt.plot([0,1], [0,1], "r--")

plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc = "lower right")
plt.show()

In [None]:
# Accuracy of our test model

from sklearn.metrics import accuracy_score
accuracy_score(y_test, sig_svc_pred_test)

In [None]:
# Radial basis kernel function

rbf_svc = SVC(kernel = "rbf", gamma = 0.7, C=1.0)

In [None]:
# Fitting the model for training data

rbf_svc.fit(x_train,y_train)
rbf_svc.support_vectors_

In [None]:
# Making predictions

rbf_svc_pred_test = rbf_svc.predict(x_test)
rbf_svc_pred_test

In [None]:
# Shows the counts of predicted output

np.unique(rbf_svc_pred_test, return_counts= True)

In [None]:
y_test[0:].value_counts()

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat_rbf = metrics.confusion_matrix(y_test, rbf_svc_pred_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat_rbf, display_labels = ["0","1"])
cm_display.plot()
plt.show()

In [None]:
# Classification report

from sklearn.metrics import classification_report
print(classification_report(y_test, rbf_svc_pred_test))

In [None]:
# ROC Curve

# Continuous decision scores for the test data. roc_curve needs one score per sample (not the
# predicted 0/1 labels) so it can sweep over thresholds; with hard labels the "curve" collapses
# to a single operating point and the area under it is not a meaningful AUC.

rbf_svc_scores_test = rbf_svc.decision_function(x_test)

# False Positive Rate and True Positive Rate at every threshold, and the area under the curve

false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, rbf_svc_scores_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.figure()

#Plot false positive and true positive rate and area under the curve value

plt.plot(false_positive_rate, true_positive_rate, label = "Support Vector (area = %0.2f)" % roc_auc)

#single dotted red line (performance of a random classifier)

plt.plot([0,1], [0,1], "r--")
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc = "lower right")
plt.show()

In [None]:
# Accuracy of our test model

from sklearn.metrics import accuracy_score
accuracy_score(y_test, rbf_svc_pred_test)

# The accuracy of rbf model is significantly better than sigmoid model

In [None]:
# Creating a copy of the predictor test dataset and creating a new column "Prediction" by fitting the model to the dataset

df = x_test.copy()
df["Prediction"] = rbf_svc_pred_test; df

In [None]:
# Creating a new dataset by joining our original dataset with the prediction column

win = win_pred.join(df.Prediction)

In [None]:
win

In [None]:
win["Prediction"].value_counts()

In [None]:
# Wins

win.loc[win["Prediction"]==1, "Deal Cost"].sum()

In [None]:
# Losses

win.loc[win["Prediction"]==0, "Deal Cost"].sum()

In [None]:
# Subsetting only those deals which are won

deals_won = win.loc[win["Prediction"]==1]

In [None]:
deals_won

In [None]:
# Grouping to see the combo which has done the most successful deals

deals_won.groupby("Prediction")[["VP Name","Manager Name"]].value_counts()

# Oversampling (SVM)

In [None]:
# We can clearly see that around 64% of the deals are "Lost", and hence a model trained on this data can be slightly biased towards predicting "Lost"

win_pred["Deal Status Code"].value_counts()

In [None]:
# Creating dummy variables again

win_pred_dummy_2 = pd.get_dummies(win_pred, drop_first=True); win_pred_dummy_2

In [None]:
# Splitting the dataset

win_pred_loss_0 = win_pred_dummy_2[win_pred_dummy_2["Deal Status Code_Won"]==0]
win_pred_wins_1 = win_pred_dummy_2[win_pred_dummy_2["Deal Status Code_Won"]==1]

In [None]:
# Oversampling the minority class ("Won") by random sampling with replacement
#
# No synthetic rows are generated: existing "Won" rows are duplicated until the class is as
# large as the "Lost" class. The target size is taken from the majority class instead of being
# hard-coded, and random_state makes the draw reproducible across runs.
#
# NOTE(review): the duplication happens before the train/test split further down, so copies of
# the same deal can land in both the training and the test set, which tends to inflate the test
# scores of the oversampled models. Oversampling only the training split would avoid this.

win_pred_wins_1_oversampling = win_pred_wins_1.sample(len(win_pred_loss_0), replace = True, random_state = 42)

In [None]:
# Concatenating

win_pred_over_sampled = pd.concat([win_pred_loss_0, win_pred_wins_1_oversampling],axis = 0)

In [None]:
# We can see that the target variable count for both possibilities are equal

win_pred_over_sampled["Deal Status Code_Won"].value_counts()

In [None]:
# Splitting the oversampled data set into input and output

x1 = win_pred_over_sampled.drop("Deal Status Code_Won", axis = 1)
y1 = win_pred_over_sampled.loc[:, "Deal Status Code_Won"]

In [None]:
#Import train and test split from sklearn

from sklearn.model_selection import train_test_split
x1_train, x1_test, y1_train, y1_test = train_test_split(x1,y1,test_size = 0.3, random_state = 42, stratify=y1)

In [None]:
# Creating our model

rbf_svc_2 = SVC(kernel = "rbf", C=1.0, gamma = 0.7)

In [None]:
# Training the model with training data

rbf_svc_2.fit(x1_train, y1_train)

In [None]:
# Predicting on the test data with the trained model

smote_pred_test = rbf_svc_2.predict(x1_test)

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y1_test, smote_pred_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
# Classification report

from sklearn.metrics import classification_report
print(classification_report(y1_test, smote_pred_test))

In [None]:
# ROC curve for the RBF model trained on the oversampled data
#
# Continuous decision scores for the test data. roc_curve needs one score per sample (not the
# predicted 0/1 labels) so it can sweep over thresholds; with hard labels the "curve" collapses
# to a single operating point and the area under it is not a meaningful AUC.

rbf_svc_2_scores_test = rbf_svc_2.decision_function(x1_test)

# False Positive Rate and True Positive Rate at every threshold, and the area under the curve

false_positive_rate, true_positive_rate, threshold = roc_curve(y1_test, rbf_svc_2_scores_test)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.figure()

# Plot false positive and true positive rate and area under the curve value

plt.plot(false_positive_rate, true_positive_rate, label = "Support Vector (area = %0.2f)" % roc_auc)

# Single dotted red line (performance of a random classifier)

plt.plot([0,1], [0,1], "r--")
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc = "lower right")
plt.show()

In [None]:
# Accuracy of our test model

from sklearn.metrics import accuracy_score
accuracy_score(y1_test, smote_pred_test)

# The accuracy has improved from previous model 

In [None]:
# Creating a copy of the predictor test dataset and creating a new column "Prediction" by fitting the model to the dataset

df_1 = x1_test.copy()
df_1["Prediction"] = rbf_svc_2.predict(x1_test)
df_1

In [None]:
# Creating a new dataset by joining the prediction column with our original dataset
#
# The oversampled test set contains repeated copies of the same original rows, so its index has
# duplicate labels. Joining on a duplicated index would repeat the matching rows of win_pred
# and over-count the prediction counts and "Deal Cost" totals computed below, so only one
# prediction is kept per original row (copies of a row have identical features and therefore
# receive the same prediction).

win_1 = win_pred.join(df_1.loc[~df_1.index.duplicated(), "Prediction"])

In [None]:
win_1["Prediction"].value_counts()

In [None]:
# Winnings total

win_1.loc[win_1["Prediction"]==1, "Deal Cost"].sum()

In [None]:
# Total loss

win_1.loc[win_1["Prediction"]==0, "Deal Cost"].sum()

In [None]:
# Subsetting only those deals which are won

deals_won_1 = win_1.loc[win_1["Prediction"]==1]; deals_won_1

In [None]:
# Grouping to see the combo which has done the most successful deals

deals_won_1.groupby("Prediction")[["VP Name","Manager Name"]].value_counts()

# Logistic Regression

In [None]:
# Our original dataset with dummy variables coding

win_pred_dummy

In [None]:
# Splitting the dataset into input and output

# In "x", we're storing only predictor variables; in "y" we're storing target variable

x2= win_pred_dummy.drop("Deal Status Code_Won", axis=1)

y2 = win_pred_dummy.loc[:,"Deal Status Code_Won"]

In [None]:
# Splitting the datasets into training and testing input and output variables

from sklearn.model_selection import train_test_split
x2_train,x2_test,y2_train,y2_test=train_test_split(x2,y2,test_size=0.3, random_state=42)

In [None]:
x2_train.shape,x2_test.shape,y2_train.shape,y2_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression()
log_reg.fit(x2_train,y2_train)

In [None]:
log_reg.intercept_

In [None]:
log_reg.coef_

In [None]:
prediction=log_reg.predict(x2_test)
prediction

In [None]:
log_reg.predict_proba(x2_test)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Predicted probability of the positive class for each test row. The same scores are used for
# the AUC and for the curve so that the area shown in the legend matches the plotted curve
# (computing the AUC from the hard 0/1 predictions gives a different, lower-resolution value).

log_reg_proba_test = log_reg.predict_proba(x2_test)[:,1]
logit_roc_auc = roc_auc_score(y2_test, log_reg_proba_test)
fpr, tpr, thresholds = roc_curve(y2_test, log_reg_proba_test)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)

# Dashed red diagonal: performance of a random classifier

plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
#model evaluation
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y2_test,prediction))

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y2_test, prediction)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y2_test, prediction)

# Oversampling (Logistic Regression)

In [None]:
# This is the oversampled dataset that we've already done for SVM

win_pred_over_sampled.shape

In [None]:
# Splitting the oversampled data set into input and output

x3 = win_pred_over_sampled.drop("Deal Status Code_Won", axis = 1)
y3 = win_pred_over_sampled.loc[:, "Deal Status Code_Won"]

In [None]:
# Splitting the datasets into training and testing input and output variables

from sklearn.model_selection import train_test_split
x3_train,x3_test,y3_train,y3_test=train_test_split(x3,y3,test_size=0.3, random_state=42)

In [None]:
x3_train.shape,x3_test.shape,y3_train.shape,y3_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression()
log_reg.fit(x3_train,y3_train)

In [None]:
log_reg.intercept_

In [None]:
log_reg.coef_

In [None]:
prediction_2=log_reg.predict(x3_test)
prediction_2

In [None]:
log_reg.predict_proba(x3_test)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Predicted probability of the positive class for each test row. The same scores are used for
# the AUC and for the curve so that the area shown in the legend matches the plotted curve
# (computing the AUC from the hard 0/1 predictions gives a different, lower-resolution value).

log_reg_proba_test_oversampled = log_reg.predict_proba(x3_test)[:,1]
logit_roc_auc = roc_auc_score(y3_test, log_reg_proba_test_oversampled)
fpr, tpr, thresholds = roc_curve(y3_test, log_reg_proba_test_oversampled)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)

# Dashed red diagonal: performance of a random classifier

plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

# Saved under its own name so it does not overwrite the ROC figure of the non-oversampled model

plt.savefig('Log_ROC_oversampled')
plt.show()

In [None]:
#model evaluation
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y3_test,prediction_2))

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y3_test, prediction_2)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
# Accuracy of our predictions

from sklearn.metrics import accuracy_score
accuracy_score(y3_test,prediction_2)

# XGBoost Algorithm

In [None]:
# Our original dataset with dummy variables coding

win_pred_dummy

In [None]:
# Splitting the dataset into input and output

# In "x", we're storing only predictor variables; in "y" we're storing target variable

x4= win_pred_dummy.drop("Deal Status Code_Won", axis=1)

y4 = win_pred_dummy.loc[:,"Deal Status Code_Won"]

In [None]:
# Splitting the datasets into training and testing input and output variables

from sklearn.model_selection import train_test_split
x4_train,x4_test,y4_train,y4_test=train_test_split(x4,y4,test_size=0.3, random_state=42, stratify=y4)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Creating model with parameters

clf=XGBClassifier(max_depth=7, learning_rate=0.1, n_estimators=1000, objective='binary:logistic', booster='gbtree', subsample=0.8, min_child_weight=1, colsample_bytree=0.8)

In [None]:
print(clf)

In [None]:
# Creating the model on training data

XGB=clf.fit(x4_train,y4_train)
prediction_4=XGB.predict(x4_test)

In [None]:
# Measuring accuracy on Testing Data

from sklearn import metrics
print(metrics.classification_report(y4_test, prediction_4))

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y4_test, prediction_4)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
# Accuracy of our predictions

from sklearn.metrics import accuracy_score
accuracy_score(y4_test, prediction_4)

# XGBoost Oversampling

In [None]:
# Oversampled dataset

win_pred_over_sampled.shape

In [None]:
# Splitting the oversampled data set into input and output

x5 = win_pred_over_sampled.drop("Deal Status Code_Won", axis = 1)
y5 = win_pred_over_sampled.loc[:, "Deal Status Code_Won"]

In [None]:
# Splitting the datasets into training and testing input and output variables

from sklearn.model_selection import train_test_split
x5_train,x5_test,y5_train,y5_test=train_test_split(x5,y5,test_size=0.3, random_state=42, stratify=y5)

In [None]:
# Creating model with parameters

clf=XGBClassifier(max_depth=7, learning_rate=0.1, n_estimators=1000, objective='binary:logistic', booster='gbtree', subsample=0.8, min_child_weight=1, colsample_bytree=0.8)

In [None]:
# Creating the model on training data

XGB=clf.fit(x5_train,y5_train)
prediction_5=XGB.predict(x5_test)

In [None]:
#Measuring accuracy on Testing Data

from sklearn import metrics
print(metrics.classification_report(y5_test, prediction_5))

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y5_test, prediction_5)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
# Accuracy of our prediction 

from sklearn.metrics import accuracy_score
accuracy_score(y5_test, prediction_5)

# XGBoost K-Cross Validation (original)

In [None]:
# Our original dataset with dummy variables coding

win_pred_dummy

In [None]:
# Splitting the dataset into input and output

# In "x", we're storing only predictor variables; in "y" we're storing target variable

x6= win_pred_dummy.drop("Deal Status Code_Won", axis=1)

y6 = win_pred_dummy.loc[:,"Deal Status Code_Won"]

In [None]:
import xgboost as xgb

In [None]:
# Creating a data matrix "DMatrix"

data_dmatrix = xgb.DMatrix(data=x6, label=y6)

In [None]:
# Splitting the datasets into training and testing input and output variables

from sklearn.model_selection import train_test_split
x6_train,x6_test,y6_train,y6_test=train_test_split(x6,y6,test_size=0.3, random_state=42)

In [None]:
from xgboost import XGBClassifier

params={"objective":'binary:logistic',"alpha": 10,"learning_rate": 1.0,"n_estimators":1000, "max_depth":7}         

In [None]:
# Instantiate the classifier

clf_1 = XGBClassifier(**params)

In [None]:
# Fit the training model

clf_1.fit(x6_train, y6_train)

In [None]:
y_pred = clf_1.predict(x6_test)

In [None]:
# Measuring accuracy on Testing Data

from sklearn import metrics
print(metrics.classification_report(y6_test, y_pred))

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y6_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
# Accuracy of our prediction 

from sklearn.metrics import accuracy_score
accuracy_score(y6_test, y_pred)

In [None]:
from xgboost import cv

# 6-fold cross-validation on the DMatrix built from the full original (non-oversampled) data,
# evaluated with AUC and returned as a pandas DataFrame.
# NOTE(review): params also carries "n_estimators", which looks like a scikit-learn wrapper
# argument; the number of boosting rounds here is presumably governed by num_boost_round
# instead - confirm against the xgboost documentation.

xgb_cv = cv(dtrain=data_dmatrix, params = params, nfold=6, num_boost_round=1000, early_stopping_rounds=500, metrics="auc", as_pandas=True, seed=123)

In [None]:
xgb_cv

# XGBoost K-Cross Validation (Oversampling)

In [None]:
# Splitting the oversampled data set into input and output

x7 = win_pred_over_sampled.drop("Deal Status Code_Won", axis = 1)
y7 = win_pred_over_sampled.loc[:, "Deal Status Code_Won"]

In [None]:
# Splitting the datasets into training and testing input and output variables

from sklearn.model_selection import train_test_split
x7_train,x7_test,y7_train,y7_test=train_test_split(x7,y7,test_size=0.3, random_state=42, stratify=y7)

In [None]:
# Creating a data matrix "DMatrix"

data_dmatrix_2 = xgb.DMatrix(data=x7, label=y7)

In [None]:
from xgboost import XGBClassifier

params_2={"objective":'binary:logistic',"alpha": 10,"learning_rate": 1.0,"n_estimators":1000, "max_depth":7}         

In [None]:
# Instantiate the classifier

clf_2 = XGBClassifier(**params_2)

In [None]:
# Fit the training model

clf_2.fit(x7_train, y7_train)

In [None]:
y_pred_2 = clf_2.predict(x7_test)

In [None]:
# Measuring accuracy on Testing Data

from sklearn import metrics
print(metrics.classification_report(y7_test, y_pred_2))

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y7_test, y_pred_2)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
# Accuracy of our prediction 

from sklearn.metrics import accuracy_score
accuracy_score(y7_test, y_pred_2)

In [None]:
from xgboost import cv

# Cross-validation on the DMatrix built from the oversampled data, evaluated with AUC.
# This section's own parameter dictionary (params_2) is passed, rather than the dictionary
# defined for the original-data run, so the cell does not depend on the previous section.

xgb_cv_2 = cv(dtrain=data_dmatrix_2, params = params_2, nfold=7, num_boost_round=1000, early_stopping_rounds=500, metrics="auc", as_pandas=True, seed=123)

In [None]:
xgb_cv_2

# Random Forest

In [None]:
# Our original dataset with dummy variables coding

win_pred_dummy

In [None]:
# Splitting the dataset into input and output

# In "x", we're storing only predictor variables; in "y" we're storing target variable

x8= win_pred_dummy.drop("Deal Status Code_Won", axis=1)

y8 = win_pred_dummy.loc[:,"Deal Status Code_Won"]

In [None]:
# Splitting the datasets into training and testing input and output variables

from sklearn.model_selection import train_test_split
x8_train,x8_test,y8_train,y8_test=train_test_split(x8,y8,test_size=0.3, random_state=42)

In [None]:
# Importing random forest classifier to build our model

from sklearn.ensemble import RandomForestClassifier

In [None]:
clf_3 = RandomForestClassifier(n_estimators = 100, random_state=42)

In [None]:
# Fit the model to our training data

clf_3.fit(x8_train, y8_train)

In [None]:
# Make prediction for our test data

pred_3 = clf_3.predict(x8_test)

In [None]:
# Measuring accuracy on Testing Data

from sklearn import metrics
print(metrics.classification_report(y8_test, pred_3))

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y8_test, pred_3)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
# Accuracy of our prediction 

from sklearn.metrics import accuracy_score
accuracy_score(y8_test, pred_3)

# Random Forest (Oversampling)

In [None]:
# Splitting the oversampled data set into input and output

x9 = win_pred_over_sampled.drop("Deal Status Code_Won", axis = 1)
y9 = win_pred_over_sampled.loc[:, "Deal Status Code_Won"]

In [None]:
# Splitting the datasets into training and testing input and output variables

from sklearn.model_selection import train_test_split
x9_train,x9_test,y9_train,y9_test=train_test_split(x9,y9,test_size=0.3, random_state=42)

In [None]:
clf_4 = RandomForestClassifier(n_estimators = 100, random_state=42)

In [None]:
# Fit the model to our training data

clf_4.fit(x9_train, y9_train)

In [None]:
# Make prediction for our test data

pred_4 = clf_4.predict(x9_test)

In [None]:
# Measuring accuracy on Testing Data

from sklearn import metrics
print(metrics.classification_report(y9_test, pred_4))

In [None]:
# Confusion matrix

from sklearn import metrics
conf_mat = metrics.confusion_matrix(y9_test, pred_4)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = conf_mat, display_labels = ["0","1"])
cm_display.plot()

In [None]:
# Accuracy of our prediction 

from sklearn.metrics import accuracy_score
accuracy_score(y9_test, pred_4)

# Conclusions

# 1. The SVC with the radial (RBF) kernel trained on the oversampled data has the highest accuracy (~90%), followed by the random forest trained on the oversampled data (~89%)

# 2. Logistic Regression has performed very poorly

# 3. XGBoost has performed decently, but hyperparameter tuning will have to be done on all ML algorithms to check if the accuracy can be improved

# 4. Another consideration/iteration of the program that can be done is to see if the dropped features can have any impact/improve the accuracy. 