# Importing the Required Libraries

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


## Reading the dataset

In [2]:
df = pd.read_csv(r"C:\Users\sehri\Desktop\Machine Learning\Projects\Datasets\credit_card.csv")
df.head(3)

Unnamed: 0,Client_Num,Card_Category,Annual_Fees,Activation_30_Days,Customer_Acq_Cost,Week_Start_Date,Week_Num,Qtr,current_year,Credit_Limit,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Vol,Avg_Utilization_Ratio,Use Chip,Exp Type,Interest_Earned,Delinquent_Acc
0,708082083,Blue,200,0,87,01-01-2023,Week-1,Q1,2023,3544.0,1661,15149,111,0.469,Chip,Travel,4393.21,0
1,708083283,Blue,445,1,108,01-01-2023,Week-1,Q1,2023,3421.0,2517,992,21,0.736,Swipe,Entertainment,69.44,0
2,708084558,Blue,140,0,106,01-01-2023,Week-1,Q1,2023,8258.0,1771,1447,23,0.214,Chip,Bills,202.58,0


In [4]:
df.rename(columns  = {'Use Chip' : 'Use_Chip', 'Exp Type' : 'Exp_Type'}, inplace = True)


## Cleaning and Manipulating Data

In [5]:
# df.drop('Client_Num', axis = 1, inplace = True)

# Converting the Week_Start_Date into datetime dtype

df['Week_Start_Date'] = pd.to_datetime(df['Week_Start_Date'] , format = ('%d-%m-%Y'))
df['Week_Day'] = df['Week_Start_Date'].dt.day
df['Week_Month'] = df['Week_Start_Date'].dt.month
df.drop('Week_Start_Date', axis = 1, inplace = True)





In [6]:
# Handling the 'Week_Num' column

df['Week_Num'] = df['Week_Num'].str.extract(r'Week-(\d+)')

# Handling the Qtr Column

df['Qtr'] = df['Qtr'].str.extract(r'Q(\d)')



## Encoding

In [7]:
# Applying OneHotEncoding on Categorical Columns

categorical = ['Card_Category', 'Use_Chip', 'Exp_Type']

encoder = OneHotEncoder(drop = None , sparse_output = False )
encoded = encoder.fit_transform(df[categorical])
data = pd.DataFrame(encoded, columns = encoder.get_feature_names_out(categorical))

df = df.drop(columns = categorical).reset_index(drop = True)
df = pd.concat([df, data], axis = 1)





## Feature Selection

In [27]:
x = df.drop('Delinquent_Acc', axis = 1)
y = df['Delinquent_Acc']

scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)

estimator = LogisticRegression(random_state = 42)

selector = SequentialFeatureSelector(estimator, direction = 'backward', cv = 5, n_jobs = -1 )
selector.fit(scaled_x, y)



In [28]:
selected_mask = selector.get_support()
selected_mask

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [31]:
selected_features = x.columns[selected_mask]
selected_features


Index(['Week_Month', 'Card_Category_Blue', 'Card_Category_Gold',
       'Card_Category_Platinum', 'Card_Category_Silver', 'Use_Chip_Chip ',
       'Use_Chip_Online ', 'Use_Chip_Swipe ', 'Exp_Type_Bills',
       'Exp_Type_Entertainment', 'Exp_Type_Food', 'Exp_Type_Fuel',
       'Exp_Type_Grocery', 'Exp_Type_Travel'],
      dtype='object')

## Train Test Split

In [43]:
x = df[selected_features]
y = df['Delinquent_Acc']

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.4, random_state = 42)

 

In [44]:
model = RandomForestClassifier()

param_grid = {
    'n_estimators': [100, 200, 300],                # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],                # Max depth of each tree
    'min_samples_split': [2, 5, 10],                # Min samples required to split a node
    'min_samples_leaf': [1, 2, 4],                  # Min samples required at a leaf node
    'max_features': ['sqrt', 'log2'],               # Number of features to consider when looking for best split
    'bootstrap': [True, False],                     # Whether bootstrap samples are used
    'criterion': ['gini', 'entropy']                # Function to measure the quality of a split
}

grid = GridSearchCV(model, param_grid, n_jobs = -1, cv = 5, error_score = 'raise' , scoring = 'accuracy', verbose = 2) 

grid.fit(train_x, train_y)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


In [47]:
best_params = grid.best_params_
best_score = grid.best_score_

print(f'Best Parameters are {best_params}')
print(f'Best Score is {best_score}')

best_estimator = grid.best_estimator_
print(f'Best Model is {best_estimator}')

pred = best_estimator.predict(test_x)

score  = accuracy_score(test_y, pred)
print(f'Accuracy on Test Data: {score:.4f}')




Best Parameters are {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best Score is 0.940798255423234
Best Model is RandomForestClassifier(min_samples_split=10)
Accuracy on Test Data: 0.9369
