## 1. Importing Libraries and TVS dataset

In [None]:
import pandas as pd              # Importing the pandas library for data manipulation and analysis
import numpy as np               # Importing the numpy library for numerical computations
import matplotlib.pyplot as plt  # Importing the pyplot module from matplotlib for data visualization
import seaborn as sns            # Importing the seaborn library for statistical data visualization


In [None]:
# Load the TVS loan dataset into the DataFrame 'data'.
# The path uses forward slashes throughout: the previous "Desktop\DISSERTATION"
# contained "\D", an invalid escape sequence that recent Python versions warn
# about. Windows accepts "/" as a path separator, so the file located is the same.
data = pd.read_csv("C:/Users/MY PC/Desktop/DISSERTATION/TVS.csv")

# Preview the first rows of the loaded dataset
data.head()


## 2. Data Exploration

### 2a. Data Manipulation

In [None]:
# Generating descriptive statistics for the loaded TVS dataset using the 'describe()' function
data.describe()

In [None]:
# Displaying a concise summary of the loaded dataset using the 'info()' function
data.info()

In [None]:
# Retrieving the shape of the TVS dataset using the 'shape' attribute
data.shape

In [None]:
# Counting the occurrences of each unique value in the 'V32' column of the dataset
data.V32.value_counts()

In [None]:
# Calculating the percentage of missing values in each column of the TVS dataset
missing_percent = data.isnull().sum() / len(data) * 100

In [None]:
# Selecting columns with missing value percentage greater than or equal to 30%
missing_columns = missing_percent[missing_percent >= 30]

In [None]:
print(missing_columns)

In [None]:
# Creating a list of column names to be dropped from the TVS dataset
col_to_drop = ['V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'Customer_ID', 'DOB', 'Gender']

# Dropping the specified columns from the 'data' DataFrame
data = data.drop(col_to_drop, axis=1)

# Displaying the remaining column names in the modified TVS dataset
data.columns


In [None]:
# Confirming the selected columns have been dropped
data.shape

In [None]:
# Counting the number of missing values in each column of the TVS dataset
data.isnull().sum()

### 2b. Data Visualization

#### 2b. (i). Univariate Visualization

In [None]:

# One 15x9 inch figure holding a 2x2 grid of bar charts
plt.figure(figsize=(15,9))

# Each categorical column gets its own panel showing the share of every category
for panel, col in enumerate(['Wheeler_Code', 'Employment_Type', 'Customer_Property_Type', 'V31'], start=1):
    plt.subplot(2, 2, panel)
    data[col].value_counts(normalize=True).plot.bar(title=col)

# Tighten the spacing so titles and tick labels do not overlap, then render
plt.tight_layout()
plt.show()


#### 2b. (ii). Outlier identification in Loan Amount using Box plot and Distribution Plot

In [None]:

# 15x7 inch figure with two side-by-side panels
plt.figure(figsize=(15, 7))

# Left panel: histogram with a KDE overlay for 'Loan_Amount'.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" draws the same kind of density-scaled histogram plus KDE curve.
plt.subplot(121)
sns.histplot(data['Loan_Amount'], kde=True, stat="density")

# Right panel: box plot of the same column to expose outliers
plt.subplot(122)
data['Loan_Amount'].plot.box()

plt.show()


#### 2b. (iii). Multivariate visualization involving V32

In [None]:

# Creating a figure with a size of 12x6 inches
plt.figure(figsize=(12, 6))

# Plotting a count plot of 'Customer_Property_Type' with hue based on 'V32' of the TVS dataset
sns.countplot(x="Customer_Property_Type", hue="V32", data=data)

# Setting the title of the plot
plt.title("Plot of Customer_Property_Type Against Target Variable")

# Displaying the plot
plt.show()


In [None]:
# Creating a figure with a size of 12x6 inches
plt.figure(figsize=(12, 6))

# Plotting a count plot of 'Employment_Type' with hue based on 'V32'
sns.countplot(x="Employment_Type", hue="V32", data=data)

# Setting the title of the plot
plt.title("Plot of Employment_Type Against Target Variable")

# Displaying the plot
plt.show()


#### 2b. (iv). Correlation Heatmap plot of the available features in the TVS dataset

In [None]:

# 20x15 inch canvas so the annotated cells stay readable
plt.figure(figsize=(20, 15))

# Correlation is only defined for numeric columns. At this point the categorical
# columns (e.g. 'Wheeler_Code', 'Employment_Type') still hold strings, and
# DataFrame.corr() raises on them in pandas 2.x, so they are excluded explicitly.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')

plt.title('Heatmap to show correlation in TVS dataset')

plt.show()

### 2c. Handling Missing Values

#### 2c. (i). mapping categorical values from object to numerical datatypes

In [None]:
# Counting the number of missing values in each column of the TVS dataset
data.isnull().sum()

In [None]:

# Mapping dictionary for converting values in the 'Wheeler_Code' column
conv1 = {"SC": 1, "MO": 2, "MC": 3, "TL": 4, "RETOP": 5}

# Mapping the values in the 'Wheeler_Code' column using the 'conv1' dictionary
data['Wheeler_Code'] = data['Wheeler_Code'].map(conv1)

# Counting the occurrences of each unique value in the 'Wheeler_Code' column
data['Wheeler_Code'].value_counts()


In [None]:

# Integer code assigned to each 'Employment_Type' label
conv3 = {"SELF": 1, "SAL": 2, "HOUSEWIFE": 3, "STUDENT": 4, "PENS": 5}

# Replace each label by its code; labels absent from 'conv3' become NaN
data['Employment_Type'] = data['Employment_Type'].map(conv3)

# Frequency of each resulting code (Series.value_counts, matching how the
# 'Wheeler_Code' column is inspected)
data['Employment_Type'].value_counts()


In [None]:

# Integer code assigned to each 'Customer_Property_Type' label
# NOTE(review): "OWENED BY OFFICE" is presumably the spelling used in the raw
# data — confirm; if the data says "OWNED BY OFFICE" those rows map to NaN.
conv4 = {"OWNED": 1, "RENT": 2, "OWENED BY OFFICE": 3}

# Replace each label by its code; labels absent from 'conv4' become NaN
data['Customer_Property_Type'] = data['Customer_Property_Type'].map(conv4)

# Frequency of each resulting code (Series.value_counts, matching how the
# 'Wheeler_Code' column is inspected)
data['Customer_Property_Type'].value_counts()

In [None]:

# Integer code assigned to each 'V31' tier label
conv5 = {"TIER 1": 1, "TIER 2": 2, "TIER 3": 3, "TIER 4": 4}

# Replace each label by its code; labels absent from 'conv5' become NaN
data['V31'] = data['V31'].map(conv5)

# Frequency of each resulting code (Series.value_counts, matching how the
# 'Wheeler_Code' column is inspected)
data['V31'].value_counts()


In [None]:
# Independent copy of 'data' for the missing-value handling process.
# A plain assignment would only create a second name for the same DataFrame,
# so every imputation applied to 'data2' would also alter 'data'.
data2 = data.copy()

#### 2c. (ii). Replacing missing values with either Mean or Mode using domain knowledge

In [None]:

# Statistic used to fill the missing values of each column
impute_strategy = {
    'Duration_With_TVS': 'mean',
    'No_of_Bounce': 'mode',
    'EMI': 'mean',
    'Loan_Amount': 'mean',
    'Tenure': 'mode',
    'Dealer_Codes_For_Two_Wheeler': 'mode',
    'Wheeler_Code': 'mode',
    'No_Of_Advance_EMI': 'mode',
    'Interest_Rate': 'mean',
    'Employment_Type': 'mode',
    'Customer_Property_Type': 'mode',
    'Age_Before_Loan': 'median',
    'V25': 'mean',
}

# The filled column is assigned back to the DataFrame. Calling
# data2[col].fillna(..., inplace=True) is chained in-place assignment: pandas 2.x
# warns about it and under Copy-on-Write it no longer updates 'data2' at all.
for col, strategy in impute_strategy.items():
    if strategy == 'mode':
        # mode() returns a Series (ties are possible); take its first entry
        fill_value = data2[col].mode()[0]
    elif strategy == 'median':
        fill_value = data2[col].median()
    else:
        fill_value = data2[col].mean()
    data2[col] = data2[col].fillna(fill_value)


### 2d. Log Transformation of Loan_Amount feature to eliminate bimodal distribution present

In [None]:

# Natural log of 'Loan_Amount', stored as the new column 'Loan_Amount_log'
# NOTE(review): a loan amount of 0 would yield -inf here — confirm none exist
loan_amount_log = np.log(data2['Loan_Amount'])
data2['Loan_Amount_log'] = loan_amount_log

# Histogram of the transformed column using 8 bins
data2['Loan_Amount_log'].hist(bins=8)

plt.show()


### 2e. Outlier Management

#### 2e. (i). Identifying outliers in the selected features of the TVS dataset


In [None]:

# Columns inspected for outliers, gathered in the DataFrame 'h'
h = data2[['No_Of_Secured_Loans', 'EMI', 'No_Of_Loans', 'No_of_Bounce', 'V25', 'V28', 'V29', 'V30']]

for col in h:
    # Missing values are dropped first: several of these columns are not imputed
    # above, and plt.boxplot renders an empty box when its input contains NaN.
    plt.boxplot(h[col].dropna())
    plt.xlabel(col)          # label the plot with the column name
    plt.show()               # one figure per column


In [None]:
# Independent copy of 'data2' for the outlier handling process (a plain
# assignment would only give the same DataFrame a second name)
data3 = data2.copy()

#### 2e. (ii). Removing outliers in the selected features of the TVS dataset

In [None]:

# Largest value accepted for each feature; anything above is treated as an outlier
upper_limits = pd.Series({
    'V28': 150,
    'V25': 230000,
    'V29': 80,
    'V30': 45,
    'EMI': 7000,
    'No_Of_Secured_Loans': 140,
    'No_Of_Loans': 150,
    'No_of_Bounce': 20,
})

# A row is kept only when every listed feature is at or below its limit
# (a missing value fails the comparison, so such rows are removed as well)
within_limits = data3[upper_limits.index].le(upper_limits, axis='columns').all(axis=1)
data3 = data3[within_limits]

#### 2e. (iii). Verify outliers have been handled

In [None]:
# Same features as before, now taken from the filtered 'data3'
outlier_cols = ['No_Of_Secured_Loans', 'EMI', 'No_Of_Loans', 'No_of_Bounce', 'V25', 'V28', 'V29', 'V30']
j = data3[outlier_cols]

# One box plot per feature to check the remaining value range
for col in outlier_cols:
    plt.boxplot(j[col])
    plt.xlabel(col)
    plt.show()

In [None]:
# Independent copy of the outlier-filtered 'data3' for the normalization step
# (a plain assignment would only give the same DataFrame a second name)
data4 = data3.copy()

### 2f. Data Normalization

In [None]:

# Import MinMaxScaler library for normalization
from sklearn.preprocessing import MinMaxScaler

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Apply MinMaxScaler to the 'data4' DataFrame and create a new DataFrame 'data4_normal' with scaled values
data4_normal = pd.DataFrame(scaler.fit_transform(data4), columns=data4.columns)

In [None]:
# Verification of the normalized dataset
data4_normal.head(20)

## 3. Model development

### 3a. Import required libraries for model development and evaluation

In [None]:
# Import required libraries for the model development


# Importing cross_val_score for cross-validation
from sklearn.model_selection import cross_val_score

# Importing train_test_split for data splitting
from sklearn.model_selection import train_test_split

# Importing various metrics for model evaluation
from sklearn.metrics import (confusion_matrix, classification_report, accuracy_score,
                             precision_score, recall_score, f1_score)

# Importing LogisticRegression for logistic regression models
from sklearn.linear_model import LogisticRegression

# Importing RandomForestClassifier for random forest models
from sklearn.ensemble import RandomForestClassifier

# Importing metrics module for additional evaluation metrics
from sklearn import metrics

# Importing xgboost library for gradient boosting algorithms
import xgboost as xgb

# Importing the SMOTE library for oversampling minority target variables
from imblearn.over_sampling import SMOTE

# Importing RandomSearchCV library for hypertuning the base models
from sklearn.model_selection import RandomizedSearchCV

# Importing XGBClassifier library
from xgboost import XGBClassifier


### 3b. Split normalized TVS dataset into 80% Training and 20% testing and implement SMOTE

In [None]:

# Features: every column of the normalized data except the target 'V32'
X = data4_normal.drop('V32', axis=1)

# Target: the 'V32' column
y = data4_normal['V32']

# 80% training / 20% testing. stratify=y keeps the class proportions of 'V32'
# the same in both parts; with an imbalanced target a purely random split can
# leave the test set with an unrepresentative share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:

# SMOTE oversampler with a fixed seed: the synthetic samples are drawn randomly,
# so without random_state every run trains the models on different data and the
# reported metrics cannot be reproduced.
smote = SMOTE(random_state=42)

# Resample the training split only, so the test split keeps its original classes
X_smote, y_smote = smote.fit_resample(X_train, y_train)

### (i) Logistic Regression Base model

In [None]:

# Logistic regression with default settings, trained on the SMOTE-resampled data
# (fit returns the estimator itself, so construction and training are chained)
lr = LogisticRegression().fit(X_smote, y_smote)

# Class predictions of the trained model for the test set
y_predict = lr.predict(X_test)


#### Print Logistic Regression Base Model Evaluation metrics

In [None]:
# Recall, precision and weighted F1 of the base logistic regression on the test
# set, printed one value per line in that order
print(
    recall_score(y_test, y_predict),
    precision_score(y_test, y_predict),
    f1_score(y_test, y_predict, average='weighted'),
    sep="\n",
)


#### Display Logistic Regression Base model ROC/AUC Curve

In [None]:
# Probability of the positive class (column 1) from the base model 'lr' for each test row
y_pred_proba_1 = lr.predict_proba(X_test)[:, 1]

# Area under the ROC curve, then the points of the curve (thresholds are discarded)
auc = metrics.roc_auc_score(y_test, y_pred_proba_1)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_1)

# Draw the curve; the legend (loc=4, lower-right corner) shows the AUC value
plt.plot(fpr, tpr, label=f"AUC={auc!s}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

#### Display Logistic Regression Base model confusion matrix

In [None]:
# Confusion matrix of the test labels against the base logistic regression predictions
cm_a = confusion_matrix(y_test, y_predict)

# fmt="d" prints the integer counts in full; the default ".2g" annotation format
# abbreviates larger counts to scientific notation (e.g. 1.2e+03)
sns.heatmap(cm_a, cmap="Blues", annot=True, fmt="d", xticklabels=[0, 1], yticklabels=[0, 1])

plt.title("Confusion Matrix Plot")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

####  Logistic Regression Hypertuned model

In [None]:

# Candidate hyperparameter values sampled by the random search
param_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
    max_iter=[100, 200, 300],
)

# 10 sampled candidates, each scored by accuracy over 5 cross-validation folds
# NOTE(review): the folds are cut from the SMOTE-resampled data, so validation folds
# contain synthetic samples; scoring is accuracy while recall/precision/F1 are
# reported below — confirm both are intended.
random_search = RandomizedSearchCV(
    estimator=lr,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_smote, y_smote)

print("Best Hyperparameters:", random_search.best_params_)

# Best estimator found by the search and its class predictions for the test set
best_model = random_search.best_estimator_
y_pred_1 = best_model.predict(X_test)


#### Print Logistic Regression Tuned Model Evaluation metrics

In [None]:

# Recall, precision and weighted F1 of the tuned logistic regression on the test
# set, printed one value per line in that order
print(
    recall_score(y_test, y_pred_1),
    precision_score(y_test, y_pred_1),
    f1_score(y_test, y_pred_1, average='weighted'),
    sep="\n",
)

#### LogisticRegression Hypertuned Model Confusion Matrix

In [None]:
# Confusion matrix of the test labels against the tuned logistic regression predictions
cm = confusion_matrix(y_test, y_pred_1)

# fmt="d" prints the integer counts in full; the default ".2g" annotation format
# abbreviates larger counts to scientific notation (e.g. 1.2e+03)
sns.heatmap(cm, cmap="Blues", annot=True, fmt="d", xticklabels=[0, 1], yticklabels=[0, 1])

plt.title("Confusion Matrix Plot")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

#### LogisticRegression Hypertuned Model ROC Curve

In [None]:

# Probability of the positive class (column 1) from the tuned model 'best_model' for each test row
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Area under the ROC curve, then the points of the curve (thresholds are discarded)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)

# Draw the curve; the legend (loc=4, lower-right corner) shows the AUC value
plt.plot(fpr, tpr, label=f"AUC={auc!s}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()


In [None]:

# Text report of the per-class metrics for the tuned logistic regression predictions
class_report = classification_report(y_test, y_pred_1)
print(class_report)

### (ii) RandomForest Base Model

In [None]:

# Random forest of 200 trees limited to depth 2 (seeded for reproducibility),
# trained on the SMOTE-resampled data; fit returns the estimator itself
rf = RandomForestClassifier(n_estimators=200, max_depth=2, random_state=0).fit(X_smote, y_smote)

# Class predictions of the trained forest for the test set
y_predict2 = rf.predict(X_test)


#### Print Random Forest Base Model Evaluation metrics

In [None]:

# Recall, precision and weighted F1 of the base random forest on the test set,
# printed one value per line in that order
print(
    recall_score(y_test, y_predict2),
    precision_score(y_test, y_predict2),
    f1_score(y_test, y_predict2, average='weighted'),
    sep="\n",
)

#### Display Random Forest Base model ROC/AUC Curve

In [None]:
# Probability of the positive class (column 1) from the base model 'rf' for each test row
y_pred_proba_2 = rf.predict_proba(X_test)[:, 1]

# Area under the ROC curve, then the points of the curve (thresholds are discarded)
auc = metrics.roc_auc_score(y_test, y_pred_proba_2)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_2)

# Draw the curve; the legend (loc=4, lower-right corner) shows the AUC value
plt.plot(fpr, tpr, label=f"AUC={auc!s}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

#### Display Random Forest Base model confusion matrix

In [None]:
# Confusion matrix of the test labels against the base random forest predictions
cm_b = confusion_matrix(y_test, y_predict2)

# fmt="d" prints the integer counts in full; the default ".2g" annotation format
# abbreviates larger counts to scientific notation (e.g. 1.2e+03)
sns.heatmap(cm_b, cmap="Blues", annot=True, fmt="d", xticklabels=[0, 1], yticklabels=[0, 1])

plt.title("Confusion Matrix Plot")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

####  RandomForest Hypertuned model

In [None]:

# Candidate hyperparameter values sampled by the random search
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 10 sampled candidates, each scored by accuracy over 5 cross-validation folds
# NOTE(review): the folds are cut from the SMOTE-resampled data, so validation folds
# contain synthetic samples; scoring is accuracy while recall/precision/F1 are
# reported below — confirm both are intended.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_smote, y_smote)

print("Best Hyperparameters:", random_search.best_params_)

# Best estimator found by the search and its class predictions for the test set
best_model_2 = random_search.best_estimator_
y_pred_2 = best_model_2.predict(X_test)


#### Print Random Forest Tuned Model Evaluation metrics

In [None]:

# Recall, precision and weighted F1 of the tuned random forest on the test set,
# printed one value per line in that order
print(
    recall_score(y_test, y_pred_2),
    precision_score(y_test, y_pred_2),
    f1_score(y_test, y_pred_2, average='weighted'),
    sep="\n",
)

#### RandomForest Hypertuned Model Confusion Matrix

In [None]:
# Confusion matrix of the test labels against the tuned random forest predictions
cm2 = confusion_matrix(y_test, y_pred_2)

# fmt="d" prints the integer counts in full; the default ".2g" annotation format
# abbreviates larger counts to scientific notation (e.g. 1.2e+03)
sns.heatmap(cm2, cmap="Blues", annot=True, fmt="d", xticklabels=[0, 1], yticklabels=[0, 1])

plt.title("Confusion Matrix Plot")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


#### RandomForest Hypertuned Model ROC Curve

In [None]:

# Probability of the positive class (column 1) from the tuned model 'best_model_2' for each test row
y_pred_proba2 = best_model_2.predict_proba(X_test)[:, 1]

# Area under the ROC curve, then the points of the curve (thresholds are discarded)
auc = metrics.roc_auc_score(y_test, y_pred_proba2)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba2)

# Draw the curve; the legend (loc=4, lower-right corner) shows the AUC value
plt.plot(fpr, tpr, label=f"AUC={auc!s}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()


#### RandomForest Hypertuned Model Classification Report

In [None]:

# Text report of the per-class metrics for the tuned random forest predictions
class_report_2 = classification_report(y_test, y_pred_2)
print(class_report_2)

### (iii) Extreme Gradient Boosting (XGBoost) Base Model

In [None]:

# Gradient-boosted tree classifier: 500 trees of depth 3, learning rate 0.1,
# with the binary logistic objective
clf = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    n_estimators=500,
    learning_rate=0.1,
    max_depth=3,
)

# Train on the SMOTE-resampled data
clf.fit(X_smote, y_smote)

# Class predictions of the trained model for the test set
y_predict3 = clf.predict(X_test)

#### Print XGBoost Base Model Evaluation metrics

In [None]:

# Recall, precision and weighted F1 of the base XGBoost model on the test set,
# printed one value per line in that order
print(
    recall_score(y_test, y_predict3),
    precision_score(y_test, y_predict3),
    f1_score(y_test, y_predict3, average='weighted'),
    sep="\n",
)

#### Display XGBoost Base Model ROC/AUC Score

In [None]:
# Probability of the positive class (column 1) from the base model 'clf' for each test row
y_pred_proba_3 = clf.predict_proba(X_test)[:, 1]

# Area under the ROC curve, then the points of the curve (thresholds are discarded)
auc = metrics.roc_auc_score(y_test, y_pred_proba_3)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_3)

# Draw the curve; the legend (loc=4, lower-right corner) shows the AUC value
plt.plot(fpr, tpr, label=f"AUC={auc!s}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

#### Display XGBoost Base Model Confusion matrix

In [None]:
# Confusion matrix of the test labels against the base XGBoost predictions
cm_c = confusion_matrix(y_test, y_predict3)

# fmt="d" prints the integer counts in full; the default ".2g" annotation format
# abbreviates larger counts to scientific notation (e.g. 1.2e+03)
sns.heatmap(cm_c, cmap="Blues", annot=True, fmt="d", xticklabels=[0, 1], yticklabels=[0, 1])

plt.title("Confusion Matrix Plot")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

####  XGBoost Hypertuned model

In [None]:

# Candidate hyperparameter values sampled by the random search
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[100, 200, 300],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[0, 0.1, 0.5],
)

# 10 sampled candidates, each scored by accuracy over 5 cross-validation folds
# NOTE(review): the folds are cut from the SMOTE-resampled data, so validation folds
# contain synthetic samples; scoring is accuracy while recall/precision/F1 are
# reported below — confirm both are intended.
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_smote, y_smote)

print("Best Hyperparameters:", random_search.best_params_)

# Best estimator found by the search and its class predictions for the test set
best_model_3 = random_search.best_estimator_
y_pred_3 = best_model_3.predict(X_test)


#### Display XGBoost Tuned Model Evaluation metrics

In [None]:

# Recall, precision and weighted F1 of the tuned XGBoost model on the test set,
# printed one value per line in that order
print(
    recall_score(y_test, y_pred_3),
    precision_score(y_test, y_pred_3),
    f1_score(y_test, y_pred_3, average='weighted'),
    sep="\n",
)

#### XGBoost Hypertuned Model Confusion Matrix

In [None]:

# Confusion matrix of the test labels against the tuned XGBoost predictions
cm3 = confusion_matrix(y_test, y_pred_3)

# fmt="d" prints the integer counts in full; the default ".2g" annotation format
# abbreviates larger counts to scientific notation (e.g. 1.2e+03)
sns.heatmap(cm3, cmap="Blues", annot=True, fmt="d", xticklabels=[0, 1], yticklabels=[0, 1])

plt.title("Confusion Matrix Plot")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


#### XGBoost Hypertuned Model ROC Curve

In [None]:

# Probability of the positive class (column 1) from the tuned model 'best_model_3' for each test row
y_pred_proba3 = best_model_3.predict_proba(X_test)[:, 1]

# Area under the ROC curve, then the points of the curve (thresholds are discarded)
auc = metrics.roc_auc_score(y_test, y_pred_proba3)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba3)

# Draw the curve; the legend (loc=4, lower-right corner) shows the AUC value
plt.plot(fpr, tpr, label=f"AUC={auc!s}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()


#### XGBoost Hypertuned Model Classification Report

In [None]:

# Text report of the per-class metrics for the tuned XGBoost predictions
class_report_3 = classification_report(y_test, y_pred_3)
print(class_report_3)

### 3c. Feature Importance Plot of Most Efficient model

In [None]:

# Get the feature importances from the random forest base model
importances = rf.feature_importances_


In [None]:
# Create a DataFrame to store feature names and importances
feature_importances = pd.DataFrame({"Feature": X.columns, "Importance": importances})

# Sort features by importance in descending order
feature_importances = feature_importances.sort_values(by="Importance", ascending=False)

# Plot feature importances
plt.figure(figsize=(12, 2))

plt.bar(feature_importances["Feature"], feature_importances["Importance"])

# Rotate x-axis labels for better readability
plt.xticks(rotation=90)

# Set the title of the plot
plt.title("Feature Importances")

# Set the x-axis label
plt.xlabel("Feature")

# Set the y-axis label
plt.ylabel("Importance")

plt.show()


#### Tuned Logistic Regression feature importance

In [None]:

# Coefficients of the tuned logistic regression, one per feature it was trained on
feature_importance = best_model.coef_[0]

# Names must come from X, the feature frame the model was trained on.
# data4_normal.columns still contains the target 'V32', which is not a feature:
# indexing it with coefficient positions shifts every label after 'V32' by one.
feature_names = X.columns

# Order the coefficients from largest to smallest (signed values, not magnitudes)
sorted_indices = feature_importance.argsort()[::-1]
sorted_feature_importance = feature_importance[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]


# Bar chart of the sorted coefficients, labelled with the matching feature names
plt.figure(figsize=(10, 6))
plt.bar(range(len(feature_importance)), sorted_feature_importance)
plt.xticks(range(len(feature_importance)), sorted_feature_names, rotation='vertical')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance - Logistic Regression')
plt.tight_layout()
plt.show()


#### Tuned XGBoost feature importance

In [None]:
# Feature importances reported by the tuned XGBoost model 'best_model_3'
importances_xg_tuned = best_model_3.feature_importances_

# Pair each feature name with its importance, ordered from most to least important
feature_importances = (
    pd.DataFrame({"Feature": X.columns, "Importance": importances_xg_tuned})
    .sort_values(by="Importance", ascending=False)
)

# Bar chart on a wide, short (12x2 inch) canvas
plt.figure(figsize=(12, 2))
plt.bar(feature_importances["Feature"], feature_importances["Importance"])

# Vertical tick labels so the feature names do not overlap
plt.xticks(rotation=90)

plt.title("Feature Importances")
plt.xlabel("Feature")
plt.ylabel("Importance")

plt.show()