**Import important libraries**

In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

**Load the Data**



In [None]:
# Read the CSV at the hard-coded path into a DataFrame
dataset_path = '/content/dataset_MLA.csv'
data = pd.read_csv(dataset_path)

# Preview the top of the table (five rows, the pandas default) as the cell output
data.head(n=5)


**EDA (Exploratory Data Analysis)**

In [None]:
# Per-column count of null entries (`isna` is the pandas alias of `isnull`)
missing_values = data.isna().sum()
missing_values

In [None]:
# Descriptive statistics for the numeric columns, with the default quartiles spelled out
summary = data.describe(percentiles=[0.25, 0.5, 0.75])
summary

In [None]:
# Apply seaborn's white-grid theme to the figures drawn from here on
sns.set_style("whitegrid")

# Bar chart of how many rows fall into each Target value, drawn on an explicit Axes
target_fig, target_ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Target', data=data, ax=target_ax)
target_ax.set_title('Distribution of Target Variable')
target_ax.set_xlabel('Target')
target_ax.set_ylabel('Count')
plt.show()


In [None]:
# Histograms (30 bins, KDE overlay) of four numeric columns on a 2x2 grid
features = ['Age', 'Balance', 'No_OF_CR_TXNS', 'SCR']

hist_fig, hist_axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
# ravel() walks the grid row by row, matching subplot positions 1..4
for panel, feature in zip(hist_axes.ravel(), features):
    sns.histplot(data[feature], bins=30, kde=True, ax=panel)
    panel.set_title(f'Distribution of {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')

hist_fig.tight_layout()
plt.show()


In [None]:
# Mean Balance per Target value, drawn without error bars
# NOTE(review): newer seaborn releases appear to deprecate `ci` in favour of
# `errorbar=None` -- confirm against the installed seaborn version.
balance_fig, balance_ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=data, x='Target', y='Balance', estimator=np.mean, ci=None, ax=balance_ax)
balance_ax.set_title('Average Balance by Target Category')
balance_ax.set_xlabel('Target')
balance_ax.set_ylabel('Average Balance')
plt.show()


In [None]:
# Side-by-side count plots of Gender and Occupation, each split by Target
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# One panel per categorical column: left = Gender, right = Occupation
for panel, column in zip(ax, ('Gender', 'Occupation')):
    sns.countplot(x=column, hue='Target', data=data, ax=panel)
    panel.set_title(f'Distribution of {column} by Target')
    panel.set_xlabel(column)
    panel.set_ylabel('Count')

plt.tight_layout()
plt.show()


**Model Building & Comparison**


In [59]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [41]:
# Build the modelling table as a NEW frame instead of overwriting `data`.
# `DataFrame.drop` returns a new object, so `data` keeps its raw columns and
# string categories. This makes the cell safe to re-run (no KeyError on the
# already-dropped columns, no re-fitting the encoders on encoded integers) and
# keeps the EDA cells valid if they are executed again.
model_data = data.drop(columns=['Cust_ID', 'AGE_BKT'])

# Convert categorical variables to integer codes; the fitted encoders are kept
# so the original labels can be recovered via `classes_` / `inverse_transform`.
le_gender = LabelEncoder()
model_data['Gender'] = le_gender.fit_transform(model_data['Gender'])

le_occupation = LabelEncoder()
model_data['Occupation'] = le_occupation.fit_transform(model_data['Occupation'])

# Separate predictors from the target and hold out 20% of the rows,
# stratified on the target so both splits keep the same class proportions.
X = model_data.drop(columns=['Target'])
y = model_data['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features: the scaler is fitted on the training rows only and then
# applied to the test rows, so no test-set statistics leak into training.
# Both results are NumPy arrays (column names live on `X`).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


1. Non-linear model:

 A.)   **Decision tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited tree; `fit` returns the estimator, so build and train in one step
decision_tree_model = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)

# Column 1 of predict_proba (second entry of `classes_`) is the ranking score;
# the hard labels are only needed for the confusion matrix
y_pred_prob_dt = decision_tree_model.predict_proba(X_test)[:, 1]
y_pred_label_dt = decision_tree_model.predict(X_test)

# Test-set ROC AUC, ROC curve points and confusion matrix
roc_auc_dt = roc_auc_score(y_test, y_pred_prob_dt)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_prob_dt)
confusion_dt = confusion_matrix(y_test, y_pred_label_dt)

# Cell output: (AUC, confusion matrix)
roc_auc_dt, confusion_dt


In [None]:
# `plot_tree` is not imported anywhere else in the notebook, so import it here;
# without it this cell fails with a NameError.
from sklearn.tree import plot_tree

# Draw the fitted decision tree. The model was trained on a scaled NumPy array,
# so column names are taken from the `X` DataFrame. They are passed as a plain
# list because some scikit-learn versions reject a pandas Index here.
plt.figure(figsize=(15, 10))
plot_tree(
    decision_tree_model,
    filled=True,
    feature_names=list(X.columns),
    class_names=['Class 0', 'Class 1'],
)
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
# ROC curve of the Decision Tree on the test set, with the chance diagonal for reference
dt_roc_fig, dt_roc_ax = plt.subplots(figsize=(10, 7))
dt_roc_ax.plot(fpr_dt, tpr_dt, color='blue', label=f"Decision Tree (AUC = {roc_auc_dt:.4f})")
dt_roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
dt_roc_ax.set_xlabel("False Positive Rate")
dt_roc_ax.set_ylabel("True Positive Rate")
dt_roc_ax.set_title("ROC Curve for Decision Tree Model")
dt_roc_ax.legend(loc='lower right')
dt_roc_ax.grid(True)
plt.show()




 B.)   **Naive Bayes**



In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes; `fit` returns the estimator, so build and train in one step
naive_bayes_model = GaussianNB().fit(X_train, y_train)

# Column 1 of predict_proba is the ranking score; hard labels feed the confusion matrix
y_pred_prob_nb = naive_bayes_model.predict_proba(X_test)[:, 1]
y_pred_label_nb = naive_bayes_model.predict(X_test)

# Test-set ROC AUC, ROC curve points and confusion matrix
roc_auc_nb = roc_auc_score(y_test, y_pred_prob_nb)
fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_prob_nb)
confusion_nb = confusion_matrix(y_test, y_pred_label_nb)

# Cell output: (AUC, confusion matrix)
roc_auc_nb, confusion_nb


In [None]:
# Recompute the Naive Bayes ROC points and take the area under them
fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_prob_nb)
roc_auc_nb = auc(fpr_nb, tpr_nb)

# ROC curve of Naive Bayes on the test set, with the chance diagonal for reference
nb_roc_fig, nb_roc_ax = plt.subplots(figsize=(10, 7))
nb_roc_ax.plot(fpr_nb, tpr_nb, color='green', label=f"Naive Bayes (AUC = {roc_auc_nb:.4f})")
nb_roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
nb_roc_ax.set_xlabel("False Positive Rate")
nb_roc_ax.set_ylabel("True Positive Rate")
nb_roc_ax.set_title("ROC Curve for Naive Bayes Model")
nb_roc_ax.legend(loc='lower right')
nb_roc_ax.grid(True)
plt.show()


 C.)   **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree forest with a fixed seed; `fit` returns the estimator itself
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Column 1 of predict_proba is the ranking score; hard labels feed the confusion matrix
y_pred_prob_rf = random_forest_model.predict_proba(X_test)[:, 1]
y_pred_label_rf = random_forest_model.predict(X_test)

# Test-set ROC AUC, ROC curve points and confusion matrix
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_prob_rf)
confusion_rf = confusion_matrix(y_test, y_pred_label_rf)

# Cell output: (AUC, confusion matrix)
roc_auc_rf, confusion_rf


In [None]:
# Recompute the Random Forest ROC points and take the area under them
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_prob_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# ROC curve of the Random Forest on the test set, with the chance diagonal for reference
rf_roc_fig, rf_roc_ax = plt.subplots(figsize=(10, 7))
rf_roc_ax.plot(fpr_rf, tpr_rf, color='blue', label=f"Random Forest (AUC = {roc_auc_rf:.4f})")
rf_roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
rf_roc_ax.set_xlabel("False Positive Rate")
rf_roc_ax.set_ylabel("True Positive Rate")
rf_roc_ax.set_title("ROC Curve for Random Forest Model")
rf_roc_ax.legend(loc='lower right')
rf_roc_ax.grid(True)
plt.show()


2. Linear Model


*  A.)  **Logistic regression**


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression with a raised iteration cap; `fit` returns the estimator itself
logistic_regression_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Column 1 of predict_proba is the ranking score; hard labels feed the confusion matrix
y_pred_prob_lr = logistic_regression_model.predict_proba(X_test)[:, 1]
y_pred_label_lr = logistic_regression_model.predict(X_test)

# Test-set ROC AUC, ROC curve points and confusion matrix
roc_auc_lr = roc_auc_score(y_test, y_pred_prob_lr)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_prob_lr)
confusion_lr = confusion_matrix(y_test, y_pred_label_lr)

# Cell output: (AUC, confusion matrix)
roc_auc_lr, confusion_lr


In [None]:
# ROC curve of Logistic Regression on the test set, with the chance diagonal for reference
lr_roc_fig, lr_roc_ax = plt.subplots(figsize=(10, 7))
lr_roc_ax.plot(fpr_lr, tpr_lr, color='blue', label=f"Logistic Regression (AUC = {roc_auc_lr:.4f})")
lr_roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
lr_roc_ax.set_xlabel("False Positive Rate")
lr_roc_ax.set_ylabel("True Positive Rate")
lr_roc_ax.set_title("ROC Curve for Logistic Regression Model")
lr_roc_ax.legend(loc='lower right')
lr_roc_ax.grid(True)
plt.show()





*  B.)  **Support Vector Machine**

In [None]:
from sklearn.svm import SVC

# Fit the SVM; probability=True is required so predict_proba is available for the ROC curve
svm_model = SVC(probability=True, random_state=42).fit(X_train, y_train)

# Column 1 of predict_proba is the ranking score; hard labels feed the confusion matrix
y_pred_prob_svm = svm_model.predict_proba(X_test)[:, 1]
y_pred_label_svm = svm_model.predict(X_test)

# Test-set ROC AUC, ROC curve points and confusion matrix
roc_auc_svm = roc_auc_score(y_test, y_pred_prob_svm)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_prob_svm)
confusion_svm = confusion_matrix(y_test, y_pred_label_svm)

# Cell output: (AUC, confusion matrix)
roc_auc_svm, confusion_svm


In [None]:
# ROC curve of the SVM on the test set, with the chance diagonal for reference
svm_roc_fig, svm_roc_ax = plt.subplots(figsize=(10, 7))
svm_roc_ax.plot(fpr_svm, tpr_svm, color='green', label=f"SVM (AUC = {roc_auc_svm:.4f})")
svm_roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
svm_roc_ax.set_xlabel("False Positive Rate")
svm_roc_ax.set_ylabel("True Positive Rate")
svm_roc_ax.set_title("ROC Curve for Support Vector Machine (SVM) Model")
svm_roc_ax.legend(loc='lower right')
svm_roc_ax.grid(True)
plt.show()


Boosting techniques:


*   AdaBoost
*   XGBoost



**1. AdaBoost**

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Fit AdaBoost with 50 boosting rounds and a fixed seed; `fit` returns the estimator itself
adaboost_model = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)

# Column 1 of predict_proba is the ranking score; hard labels feed the confusion matrix
y_pred_prob_ab = adaboost_model.predict_proba(X_test)[:, 1]
y_pred_label_ab = adaboost_model.predict(X_test)

# Test-set ROC AUC, ROC curve points and confusion matrix
roc_auc_ab = roc_auc_score(y_test, y_pred_prob_ab)
fpr_ab, tpr_ab, _ = roc_curve(y_test, y_pred_prob_ab)
confusion_ab = confusion_matrix(y_test, y_pred_label_ab)

# Cell output: (AUC, confusion matrix)
roc_auc_ab, confusion_ab


In [None]:
# ROC curve of AdaBoost on the test set, with the chance diagonal for reference
ab_roc_fig, ab_roc_ax = plt.subplots(figsize=(10, 7))
ab_roc_ax.plot(fpr_ab, tpr_ab, color='blue', label=f"AdaBoost (AUC = {roc_auc_ab:.4f})")
ab_roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
ab_roc_ax.set_xlabel("False Positive Rate")
ab_roc_ax.set_ylabel("True Positive Rate")
ab_roc_ax.set_title("ROC Curve for AdaBoost Model")
ab_roc_ax.legend(loc='lower right')
ab_roc_ax.grid(True)
plt.show()


**2. XGBoost**

In [None]:
import xgboost as xgb

# Binary-logistic XGBoost classifier with a fixed seed, configured via a parameter dict
xgb_params = {'objective': 'binary:logistic', 'random_state': 42}
xgboost_model = xgb.XGBClassifier(**xgb_params)

# Train on the scaled training split; the fitted estimator is the cell output
xgboost_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Hard class predictions from the fitted XGBoost model, scored against the test labels
y_pred = xgboost_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the XGBoost hard predictions (`y_pred`) against the test labels
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix


In [None]:
# Score the test rows with the fitted XGBoost model (column 1 of predict_proba)
# and compute the ROC curve points plus the area under the curve.
y_pred_prob = xgboost_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc_xgb = roc_auc_score(y_test, y_pred_prob)

# Plot the ROC curve in the same format as the other models' ROC cells:
# named legend entry with 4-decimal AUC, chance diagonal, grid and a
# model-specific title, so the figures can be compared side by side.
plt.figure(figsize=(10, 7))
plt.plot(fpr, tpr, label=f"XGBoost (AUC = {roc_auc_xgb:.4f})", color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for XGBoost Model")
plt.legend(loc='lower right')
plt.grid(True)
plt.show()


In [54]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
import xgboost as xgb

In [None]:
from sklearn.preprocessing import StandardScaler

# Recreate the split used for the individual models: stratified 80/20 with the
# same seed, followed by standard scaling fitted on the training rows only.
# Splitting `X` without `stratify` and without scaling would overwrite the
# scaled X_train/X_test with raw frames, so the scale-sensitive models (SVM,
# logistic regression) would be compared on different data than in their own
# sections.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# The seven classifiers to compare. Every estimator that uses randomness gets a
# fixed seed so the figure is reproducible across runs (GaussianNB has none).
models = {
    "decision_tree_model": DecisionTreeClassifier(random_state=42),
    "naive_bayes_model": GaussianNB(),
    "random_forest_model": RandomForestClassifier(random_state=42),
    "logistic_regression_model": LogisticRegression(max_iter=1000, random_state=42),
    "svm_model": SVC(probability=True, random_state=42),  # probability=True enables predict_proba
    "adaboost_model": AdaBoostClassifier(random_state=42),
    "xgboost_model": xgb.XGBClassifier(random_state=42),
}

# model name -> (false positive rates, true positive rates, AUC)
roc_curves = {}

# Fit each model and compute its test-set ROC curve from the column-1 scores
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_score = model.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    roc_curves[model_name] = (fpr, tpr, roc_auc)

# Plot ROC curves for all models on one set of axes
plt.figure(figsize=(10, 7))
for model_name, (fpr, tpr, roc_auc) in roc_curves.items():
    # Closing parenthesis added: the legend previously read "(AUC = 0.93"
    plt.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance-level reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Models')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# Recreate the split used for the individual models: stratified 80/20 with the
# same seed, followed by standard scaling fitted on the training rows only, so
# the scale-sensitive models are evaluated on the same data as in their own
# sections.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# The seven classifiers to compare. Every estimator that uses randomness gets a
# fixed seed so the scores are reproducible across runs (GaussianNB has none).
models = {
    "decision_tree_model": DecisionTreeClassifier(random_state=42),
    "naive_bayes_model": GaussianNB(),
    "random_forest_model": RandomForestClassifier(random_state=42),
    "logistic_regression_model": LogisticRegression(max_iter=1000, random_state=42),
    "svm_model": SVC(probability=True, random_state=42),
    "adaboost_model": AdaBoostClassifier(random_state=42),
    "xgboost_model": xgb.XGBClassifier(random_state=42),
}

# Per-metric dictionaries keyed by model name
accuracy_scores = {}
precision_scores = {}
recall_scores = {}
f1_scores = {}

# Fit each model and score its hard predictions on the test split
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy_scores[model_name] = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps the score at 0 (without a warning) when a model
    # predicts no positive samples at all
    precision_scores[model_name] = precision_score(y_test, y_pred, zero_division=0)
    recall_scores[model_name] = recall_score(y_test, y_pred)
    f1_scores[model_name] = f1_score(y_test, y_pred, zero_division=0)

# One row per model, one column per metric. Passing `index=` aligns the scores
# by model NAME, so labels cannot be shifted the way a positional
# `columns = models.keys()` assignment could shift them.
scores_by_model = pd.DataFrame(
    {
        "Accuracy": accuracy_scores,
        "Precision": precision_scores,
        "Recall": recall_scores,
        "F1 Score": f1_scores,
    },
    index=list(models),
)
scores_by_model.index.name = "Model"

# Kept in the previous layout (metrics as rows, models as columns) for any
# later use of `results_df`.
results_df = scores_by_model.T

# Interactive horizontal bar chart. The model-indexed frame puts the models on
# the y-axis (matching the axis title) with one bar per metric; grouped bars
# are used because stacking would add unrelated scores together.
fig = px.bar(
    scores_by_model,
    orientation='h',
    barmode='group',
    title="Model Performance Comparison",
)
fig.update_traces(texttemplate='%{x:.2f}', textposition='outside')
fig.update_xaxes(title="Score")
fig.update_yaxes(title="Models")
fig.update_layout(legend_title_text="Metric")
fig.show()

