In [19]:
#Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
#Load the csv dataset and save it as a datafram. using head() print the first 5 rows values
df=pd.read_csv('credit.csv')
df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good


In [21]:
#Drop the unwanted columns in the dataset
df.drop(columns=['ID','Customer_ID','Name','SSN','Month'],inplace=True)

In [22]:
#To reduce reduntancy these features can be dropped while creating the model
#Monthly_Inhand_Salary: Despite its high correlation with Amount_invested_monthly, it shows only a weak correlation with Credit_Score. 
#Credit_Mix: It has a relatively weak correlation with Credit_Score and is strongly correlated with Num_Credit_Inquiries (0.66).
df.drop(columns=['Monthly_Inhand_Salary','Credit_Mix'],inplace=True)

In [23]:
#Preprocessing Type_of_Loan column 
# Exclude "No Data" and "Not Specified" from consideration
filtered_df = df[~df['Type_of_Loan'].isin(["No Data", "Not Specified"])]

# Get the top 50 most frequent loan types
top_70_loan_types = filtered_df['Type_of_Loan'].value_counts().head(70).index

# Replace non-top 50 loan types with "Other"
df['Type_of_Loan'] = df['Type_of_Loan'].apply(lambda x: x if x in top_70_loan_types else 'Other')
# Print the updated 'Type_of_Loan' column value counts to verify
updated_loan_type_counts = df['Type_of_Loan'].value_counts()
updated_loan_type_counts

Type_of_Loan
Other                                         77424
Credit-Builder Loan                            1280
Personal Loan                                  1272
Debt Consolidation Loan                        1264
Student Loan                                   1240
                                              ...  
Payday Loan, and Home Equity Loan               168
Mortgage Loan, and Payday Loan                  168
Credit-Builder Loan, and Personal Loan          168
Auto Loan, and Not Specified                    168
Debt Consolidation Loan, and Mortgage Loan      168
Name: count, Length: 71, dtype: int64

In [24]:
# Calculate the mean Age for each Occupation
mean_age_by_occupation = df.groupby('Occupation')['Age'].mean()
# Update Age values less than 18 with the mean age for their respective Occupation as the legal age to apply for the credit card or loan is 18.
df.loc[df['Age'] < 18, 'Age'] = df.loc[df['Age'] < 18, 'Occupation'].map(mean_age_by_occupation)

In [25]:
df.head(20)

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,3.0,7.0,...,4.0,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,3.0,4.0,...,4.0,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good
2,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,3.0,7.0,...,4.0,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good
3,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,5.0,4.0,...,4.0,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good
4,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,6.0,4.0,...,4.0,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good
5,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,8.0,4.0,...,4.0,809.98,27.262259,270.0,No,49.574949,21.46538,High_spent_Medium_value_payments,340.479212,Good
6,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,3.0,8.0,...,4.0,809.98,22.537593,271.0,No,49.574949,21.46538,Low_spent_Small_value_payments,244.565317,Good
7,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,3.0,6.0,...,4.0,809.98,23.933795,272.0,No,49.574949,21.46538,High_spent_Medium_value_payments,358.124168,Standard
8,28.0,Teacher,34847.84,2.0,4.0,6.0,1.0,Credit-Builder Loan,3.0,4.0,...,2.0,605.03,24.464031,319.0,No,18.816215,39.684018,Low_spent_Small_value_payments,470.690627,Standard
9,28.0,Teacher,34847.84,2.0,4.0,6.0,1.0,Credit-Builder Loan,7.0,1.0,...,2.0,605.03,38.550848,320.0,No,18.816215,39.684018,High_spent_Large_value_payments,484.591214,Good


In [26]:
df.shape

(100000, 21)

In [27]:
df.to_csv('credit_preprocessed.csv', index=False)

In [28]:
data=pd.read_csv('credit_preprocessed.csv')
data.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,3.0,7.0,...,4.0,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,3.0,4.0,...,4.0,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good
2,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,3.0,7.0,...,4.0,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good
3,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,5.0,4.0,...,4.0,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good
4,23.0,Scientist,19114.12,3.0,4.0,3.0,4.0,Other,6.0,4.0,...,4.0,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good


In [29]:
data.columns

Index(['Age', 'Occupation', 'Annual_Income', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio',
       'Credit_History_Age', 'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

In [30]:
from sklearn.preprocessing import StandardScaler,TargetEncoder,FunctionTransformer,LabelEncoder
from category_encoders import BinaryEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,accuracy_score,f1_score,recall_score

In [32]:
#Apply Binary Encoding to 'Occupation' and 'Payment_Behaviour'
binary_encoder = BinaryEncoder(cols=['Occupation', 'Payment_Behaviour','Payment_of_Min_Amount'], drop_invariant=True)
data_encoded = binary_encoder.fit_transform(data)

# Apply Target Encoding to 'Type_of_Loan'
target_encoder = TargetEncoder()
data_encoded['Type_of_Loan'] = target_encoder.fit_transform(data[['Type_of_Loan']],data['Credit_Score'])

# List the columns you want to apply log transformation to
selected_columns = ['Annual_Income', 'Delay_from_due_date','Changed_Credit_Limit','Credit_Utilization_Ratio','Outstanding_Debt','Total_EMI_per_month','Amount_invested_monthly', 'Monthly_Balance']  # Replace with your column names

# Create a function for log transformation
log_transformer = FunctionTransformer(lambda x: np.log1p(x))

# Apply log transformation to the selected columns
data_encoded[selected_columns] = log_transformer.fit_transform(data_encoded[selected_columns])

# Display the transformed data
data_encoded.head()

Unnamed: 0,Age,Occupation_0,Occupation_1,Occupation_2,Occupation_3,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Credit_History_Age,Payment_of_Min_Amount_0,Payment_of_Min_Amount_1,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour_0,Payment_Behaviour_1,Payment_Behaviour_2,Monthly_Balance,Credit_Score
0,23.0,0,0,0,1,9.858235,3.0,4.0,3.0,4.0,...,265.0,0,1,3.923456,3.111975,0,0,1,5.747781,Good
1,23.0,0,0,0,1,9.858235,3.0,4.0,3.0,4.0,...,266.0,0,1,3.923456,3.111975,0,1,0,5.654694,Good
2,23.0,0,0,0,1,9.858235,3.0,4.0,3.0,4.0,...,267.0,0,1,3.923456,3.111975,0,1,1,5.805767,Good
3,23.0,0,0,0,1,9.858235,3.0,4.0,3.0,4.0,...,268.0,0,1,3.923456,3.111975,1,0,0,5.413659,Good
4,23.0,0,0,0,1,9.858235,3.0,4.0,3.0,4.0,...,269.0,0,1,3.923456,3.111975,1,0,1,5.83624,Good


In [33]:
# Step 1: Separate features and target
X = data_encoded.drop('Credit_Score', axis=1)  # Features
y = data_encoded['Credit_Score']               # Target variable

#Apply Label Encoding to Credit_Score
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Credit_Score'])

# Step 2: Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Further split the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Check the sizes of the splits
print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'X_val: {X_val.shape}, y_val: {y_val.shape}')
print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

X_train: (60000, 26), y_train: (60000,)
X_val: (20000, 26), y_val: (20000,)
X_test: (20000, 26), y_test: (20000,)


In [34]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both train and validation/test sets
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform on the training set
X_val_scaled = scaler.transform(X_val)          # Only transform the validation set
X_test_scaled = scaler.transform(X_test)        # Only transform the test set


In [35]:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# List of classifiers to evaluate the model selection
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Classifier": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

In [36]:
# Empty dictionary to store results
results = {}

# Train each model and evaluate
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    
   # Evaluate using accuracy, precision, recall, and F1 score
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='weighted')
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    
    # Save results
    results[name] = {
        'Accuracy': accuracy, 
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall
    }

# Print the results for each model
for model_name, metrics in results.items():
    print(f"{model_name}: Accuracy = {metrics['Accuracy']:.4f}, F1 Score = {metrics['F1 Score']:.4f}, Precision = {metrics['Precision']:.4f}, Recall = {metrics['Recall']:.4f}")


Parameters: { "use_label_encoder" } are not used.



Logistic Regression: Accuracy = 0.6445, F1 Score = 0.6382, Precision = 0.6408, Recall = 0.6445
Decision Tree: Accuracy = 0.7189, F1 Score = 0.7189, Precision = 0.7189, Recall = 0.7189
Random Forest: Accuracy = 0.7942, F1 Score = 0.7940, Precision = 0.7940, Recall = 0.7942
Gradient Boosting: Accuracy = 0.6979, F1 Score = 0.6948, Precision = 0.6973, Recall = 0.6979
Support Vector Classifier: Accuracy = 0.6945, F1 Score = 0.6908, Precision = 0.6932, Recall = 0.6945
XGBoost: Accuracy = 0.7565, F1 Score = 0.7557, Precision = 0.7556, Recall = 0.7565
K-Nearest Neighbors: Accuracy = 0.6797, F1 Score = 0.6804, Precision = 0.6813, Recall = 0.6797


The Random Forest Classification shows maximum accuracy of 0.7942, thus predictive modelling can be done with the same algorithm