**Setting up Spark Environment in Colab**

In [1]:
import os
spark_version = 'spark-3.4.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [713 kB]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,502 kB]
Get:13 https://ppa.launchpadc

**Uploading CSV Files in Google Colab**

In [2]:
from google.colab import files

# Upload CSV files
uploaded = files.upload()

Saving Application_Data.csv to Application_Data.csv


**Initializing SparkSession for Data Analysis**

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Application_Data").getOrCreate()


In [4]:
# Provide the paths to your CSV files
data_table1_path = 'Application_Data.csv'
# Load CSV data into Spark DataFrames
df1 = spark.read.csv(data_table1_path, header=True, inferSchema=True)

In [5]:
df1.show()

+------------+----------------+---------+------------+--------------+------------+--------------------+--------------------+--------------------+--------------------+------------------+----------------+-----------+-----------+--------------------+--------------------+-------------+----------------+--------------+---------------+------+
|Applicant_ID|Applicant_Gender|Owned_Car|Owned_Realty|Total_Children|Total_Income|         Income_Type|      Education_Type|       Family_Status|        Housing_Type|Owned_Mobile_Phone|Owned_Work_Phone|Owned_Phone|Owned_Email|           Job_Title|Total_Family_Members|Applicant_Age|Years_of_Working|Total_Bad_Debt|Total_Good_Debt|Status|
+------------+----------------+---------+------------+--------------+------------+--------------------+--------------------+--------------------+--------------------+------------------+----------------+-----------+-----------+--------------------+--------------------+-------------+----------------+--------------+----------

In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
#Converting Spark DataFrame to Pandas DataFrame
credit_Card_approval_df = df1.toPandas()
credit_Card_approval_df.rename(columns={'Status': 'card_approval'}, inplace=True)
credit_Card_approval_df.head()

Unnamed: 0,Applicant_ID,Applicant_Gender,Owned_Car,Owned_Realty,Total_Children,Total_Income,Income_Type,Education_Type,Family_Status,Housing_Type,...,Owned_Work_Phone,Owned_Phone,Owned_Email,Job_Title,Total_Family_Members,Applicant_Age,Years_of_Working,Total_Bad_Debt,Total_Good_Debt,card_approval
0,5008806,M,1,1,0,112500,Working ...,Secondary / secondary special ...,Married ...,House / apartment ...,...,0,0,0,Security staff ...,2,59,4,0,30,1
1,5008808,F,0,1,0,270000,Commercial associate ...,Secondary / secondary special ...,Single / not married ...,House / apartment ...,...,0,1,1,Sales staff ...,1,53,9,0,5,1
2,5008809,F,0,1,0,270000,Commercial associate ...,Secondary / secondary special ...,Single / not married ...,House / apartment ...,...,0,1,1,Sales staff ...,1,53,9,0,5,1
3,5008810,F,0,1,0,270000,Commercial associate ...,Secondary / secondary special ...,Single / not married ...,House / apartment ...,...,0,1,1,Sales staff ...,1,53,9,0,27,1
4,5008811,F,0,1,0,270000,Commercial associate ...,Secondary / secondary special ...,Single / not married ...,House / apartment ...,...,0,1,1,Sales staff ...,1,53,9,0,39,1


In [8]:
credit_Card_approval_df.columns

Index(['Applicant_ID', 'Applicant_Gender', 'Owned_Car', 'Owned_Realty',
       'Total_Children', 'Total_Income', 'Income_Type', 'Education_Type',
       'Family_Status', 'Housing_Type', 'Owned_Mobile_Phone',
       'Owned_Work_Phone', 'Owned_Phone', 'Owned_Email', 'Job_Title',
       'Total_Family_Members', 'Applicant_Age', 'Years_of_Working',
       'Total_Bad_Debt', 'Total_Good_Debt', 'card_approval'],
      dtype='object')

**Data Preprocessing and Splitting**

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# credit_Card_approval_df DataFrame
y = credit_Card_approval_df["card_approval"]
X = credit_Card_approval_df[['Applicant_Gender', 'Owned_Car', 'Owned_Realty', 'Total_Children', 'Total_Income', 'Education_Type', 'Family_Status', 'Housing_Type', 'Total_Family_Members', 'Applicant_Age', 'Years_of_Working', 'Total_Bad_Debt', 'Total_Good_Debt', 'Income_Type']]

# Encoding categorical features and scaling
categorical_features = ["Applicant_Gender", "Income_Type", "Education_Type", "Family_Status", "Housing_Type"]
numerical_features = ['Total_Income', 'Applicant_Age', 'Years_of_Working', 'Total_Good_Debt']
one_hot = OneHotEncoder()
scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ("one_hot", one_hot, categorical_features),
        ("scaler", scaler, numerical_features)
    ],
    remainder="passthrough"
)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

**To address the imbalance in a dataset, one approach is to oversample the minority class by generating additional instances using the RandomOverSampler technique.**

In [10]:
# Import RandomOverSampler from imblearn
from imblearn.over_sampling import RandomOverSampler

# Create RandomOverSampler instance
oversampler = RandomOverSampler(random_state=1)

# Resample the training data
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

**Logistic Regression Model Performance on Resampled Data**

In [11]:
from sklearn.linear_model import LogisticRegression
# Create a pipeline
model = make_pipeline(
    preprocessor,
    LogisticRegression(solver='lbfgs', random_state=1, max_iter=100)
)
# Fit the model using the training data
model.fit(X_train_resampled,y_train_resampled)
from sklearn.metrics import balanced_accuracy_score

# Assuming X_train_resampled and y_train_resampled are your resampled data
# Predict on the resampled training data
y_pred_resampled = model.predict(X_train_resampled)

# Convert y_pred_resampled to the same data type as y_train_resampled
y_pred_resampled = y_pred_resampled.astype(y_train_resampled.dtype)

# Calculate the balanced accuracy score
balanced_accuracy_resampled = balanced_accuracy_score(y_train_resampled, y_pred_resampled)
print("Balanced Accuracy Score (Resampled):", balanced_accuracy_resampled)
from sklearn.metrics import confusion_matrix, classification_report
# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_train_resampled, y_pred_resampled)
print("Confusion Matrix:")
print(conf_matrix)
# Calculate the classification report
class_report =classification_report(y_train_resampled, y_pred_resampled)
print("Classification Report:")
print(class_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Balanced Accuracy Score (Resampled): 0.9972258322503249
Confusion Matrix:
[[20006     0]
 [  111 19895]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     20006
           1       1.00      0.99      1.00     20006

    accuracy                           1.00     40012
   macro avg       1.00      1.00      1.00     40012
weighted avg       1.00      1.00      1.00     40012



In [24]:
from imblearn.over_sampling import RandomOverSampler

# # Define the desired ratio of minority to majority samples
desired_ratio = 0.70  # Example: minority class will have 60% of the samples of the majority class

# Create a RandomOverSampler instance with the specified sampling strategy
oversampler = RandomOverSampler(sampling_strategy=desired_ratio)

# Resample the data using the defined sampling strategy
X_resampled, y_resampled = oversampler.fit_resample(X, y)


In [25]:
from sklearn.linear_model import LogisticRegression
# Create a pipeline
model = make_pipeline(
    preprocessor,
    LogisticRegression(solver='lbfgs', random_state=1, max_iter=100)
)
# Fit the model using the training data
model.fit(X_resampled, y_resampled)
from sklearn.metrics import balanced_accuracy_score

# Assuming X_train_resampled and y_train_resampled are your resampled data
# Predict on the resampled training data
y_pred_resampled = model.predict(X_resampled)

# Convert y_pred_resampled to the same data type as y_train_resampled
y_pred_resampled = y_pred_resampled.astype(y_resampled.dtype)

# Calculate the balanced accuracy score
balanced_accuracy_resampled = balanced_accuracy_score(y_resampled, y_pred_resampled)
print("Balanced Accuracy Score (Resampled):", balanced_accuracy_resampled)
from sklearn.metrics import confusion_matrix, classification_report
# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_resampled, y_pred_resampled)
print("Confusion Matrix:")
print(conf_matrix)
# Calculate the classification report
class_report =classification_report(y_resampled, y_pred_resampled)
print("Classification Report:")
print(class_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Balanced Accuracy Score (Resampled): 0.997220778182109
Confusion Matrix:
[[22506     0]
 [  139 24868]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     22506
           1       1.00      0.99      1.00     25007

    accuracy                           1.00     47513
   macro avg       1.00      1.00      1.00     47513
weighted avg       1.00      1.00      1.00     47513



**XGBClassifier Model Performance on Resampled Data**

In [28]:
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
# Define the pipeline
XGB= make_pipeline(
    preprocessor,
    BaggingClassifier(random_state=1)
)

# Fit the pipeline to the resampled training data
XGB.fit(X_train_resampled, y_train_resampled)

# Predictions on the resampled training set
xgb_predictions_train =XGB.predict(X_train_resampled)

# Calculate the balanced accuracy score
balanced_accuracy_train = balanced_accuracy_score(y_train_resampled, xgb_predictions_train)

# Print the balanced accuracy score
print(f"Balanced Accuracy Score on Resampled Training Data: {balanced_accuracy_train}")

# Calculate the confusion matrix
conf_matrix_train = confusion_matrix(y_train_resampled, xgb_predictions_train)

# Print the confusion matrix
print("Confusion Matrix on Resampled Training Data:")
print(conf_matrix_train)

# Calculate the classification report
class_report_train = classification_report(y_train_resampled, xgb_predictions_train)

# Print the classification report
print("Classification Report on Resampled Training Data:")
print(class_report_train)



Balanced Accuracy Score on Resampled Training Data: 1.0
Confusion Matrix on Resampled Training Data:
[[20006     0]
 [    0 20006]]
Classification Report on Resampled Training Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20006
           1       1.00      1.00      1.00     20006

    accuracy                           1.00     40012
   macro avg       1.00      1.00      1.00     40012
weighted avg       1.00      1.00      1.00     40012



**BaggingClassifier Model Performance on Resampled Data**

In [27]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Define the pipeline
Bagging = make_pipeline(
    preprocessor,
    BaggingClassifier(random_state=1)
)

# Fit the pipeline to the resampled training data
Bagging.fit(X_train_resampled, y_train_resampled)

# Predict on the test data
y_pred = Bagging.predict(X_train_resampled)

# Calculate the balanced accuracy score
balanced_accuracy = balanced_accuracy_score(y_train_resampled, y_pred)
print(f"Balanced Accuracy Score: {balanced_accuracy}")

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_train_resampled, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate the classification report
class_report = classification_report(y_train_resampled, y_pred)
print("Classification Report:")
print(class_report)


Balanced Accuracy Score: 1.0
Confusion Matrix:
[[20006     0]
 [    0 20006]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20006
           1       1.00      1.00      1.00     20006

    accuracy                           1.00     40012
   macro avg       1.00      1.00      1.00     40012
weighted avg       1.00      1.00      1.00     40012

