# **Importing Libraries**

In [29]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score


import warnings

warnings.filterwarnings("ignore", category=ImportWarning, module='specific_module')
# Code that might trigger the warning for specific_module


In [30]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Importing dataset**

In [32]:
# Load your dataset
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Credit Card Score/Credit-Score-Data.csv')
df

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,Aaron Maashoh,23,821000265,Scientist,19114.12,1824.843333,3,...,Good,809.98,26.822620,265,No,49.574949,21.465380,High_spent_Small_value_payments,312.494089,Good
1,5635,3392,2,Aaron Maashoh,23,821000265,Scientist,19114.12,1824.843333,3,...,Good,809.98,31.944960,266,No,49.574949,21.465380,Low_spent_Large_value_payments,284.629163,Good
2,5636,3392,3,Aaron Maashoh,23,821000265,Scientist,19114.12,1824.843333,3,...,Good,809.98,28.609352,267,No,49.574949,21.465380,Low_spent_Medium_value_payments,331.209863,Good
3,5637,3392,4,Aaron Maashoh,23,821000265,Scientist,19114.12,1824.843333,3,...,Good,809.98,31.377862,268,No,49.574949,21.465380,Low_spent_Small_value_payments,223.451310,Good
4,5638,3392,5,Aaron Maashoh,23,821000265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,269,No,49.574949,21.465380,High_spent_Medium_value_payments,341.489231,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,155625,37932,4,Nicks,25,78735990,Mechanic,39628.99,3359.415833,4,...,Good,502.38,34.663572,378,No,35.104023,24.028477,High_spent_Large_value_payments,479.866228,Poor
99996,155626,37932,5,Nicks,25,78735990,Mechanic,39628.99,3359.415833,4,...,Good,502.38,40.565631,379,No,35.104023,24.028477,High_spent_Medium_value_payments,496.651610,Poor
99997,155627,37932,6,Nicks,25,78735990,Mechanic,39628.99,3359.415833,4,...,Good,502.38,41.255522,380,No,35.104023,24.028477,High_spent_Large_value_payments,516.809083,Poor
99998,155628,37932,7,Nicks,25,78735990,Mechanic,39628.99,3359.415833,4,...,Good,502.38,33.638208,381,No,35.104023,24.028477,Low_spent_Large_value_payments,319.164979,Standard


In [33]:
df.shape

(100000, 28)

# **Data pre-processing**

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  int64  
 1   Customer_ID               100000 non-null  int64  
 2   Month                     100000 non-null  int64  
 3   Name                      100000 non-null  object 
 4   Age                       100000 non-null  int64  
 5   SSN                       100000 non-null  int64  
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     100000 non-null  float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  int64  
 13  Type_of_Loan              100000 non-null  ob

In [35]:
#Checking for null values
df.isna().sum()

ID                          0
Customer_ID                 0
Month                       0
Name                        0
Age                         0
SSN                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Type_of_Loan                0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
dtype: int64

In [36]:
#Checking for duplicated
df.duplicated().sum()

0

In [37]:
# Value count for each value
for i in df.columns:
    print(i,'\n',df[i].value_counts())
    print('-'*90)

ID 
 5634      1
105608    1
105642    1
105637    1
105636    1
         ..
55629     1
55628     1
55627     1
55626     1
155629    1
Name: ID, Length: 100000, dtype: int64
------------------------------------------------------------------------------------------
Customer_ID 
 3392     8
39924    8
23267    8
48794    8
18548    8
        ..
11956    8
30819    8
40329    8
49221    8
37932    8
Name: Customer_ID, Length: 12500, dtype: int64
------------------------------------------------------------------------------------------
Month 
 1    12500
2    12500
3    12500
4    12500
5    12500
6    12500
7    12500
8    12500
Name: Month, dtype: int64
------------------------------------------------------------------------------------------
Name 
 Jessicad              48
Langep                48
Stevex                48
Vaughanl              40
Ronald Groverk        40
                      ..
Breidthardtj           8
Sven Egenterx          8
Antonella Ciancioc     8
Valentina Zan  


# **EDA**


In [38]:
# Create a histogram for age distribution with specified bin size
fig = px.histogram(data_frame=df, x='Age', title='Age Distribution', nbins=20)
fig.show()

# **Income Analysis**

In [39]:
import plotly.express as px

# Create individual box plots for annual income and monthly in-hand salary
fig = px.box(data_frame=df, y='Annual_Income', title='Annual Income Box Plot')
fig.show()

fig = px.box(data_frame=df, y='Monthly_Inhand_Salary', title='Monthly In-hand Salary Box Plot')
fig.show()


# **Payment Behavior**

In [40]:
# Create a pie chart for payment behavior
payment_behavior_counts = df['Payment_Behaviour'].value_counts()
fig = px.pie(values=payment_behavior_counts, names=payment_behavior_counts.index,
             title='Payment Behavior')
fig.show()


# **Label encoder**

In [41]:
# Encode categorical variables
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])


In [42]:
# Sample data (replace with your actual data)
X = df[["Annual_Income", "Monthly_Inhand_Salary",
        "Num_Bank_Accounts", "Num_Credit_Card",
        "Interest_Rate", "Num_of_Loan",
        "Delay_from_due_date", "Num_of_Delayed_Payment",
        "Credit_Mix", "Outstanding_Debt",
        "Credit_History_Age", "Monthly_Balance"]]
y = df["Credit_Score"]


In [43]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [44]:
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    1
99996    1
99997    1
99998    2
99999    1
Name: Credit_Score, Length: 100000, dtype: int64

# **Logistic Regression**



In [45]:
# Initialize and train the Logistic Regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Make predictions and calculate accuracy
y_pred_lr = lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print(f"Logistic Regression Accuracy: {accuracy_lr}")

Logistic Regression Accuracy: 0.5412



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



# **Decision Tree**

In [46]:
# Initialize and train the Decision Tree model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Make predictions and calculate accuracy
y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

print(f"Decision Tree Accuracy: {accuracy_dt}")

Decision Tree Accuracy: 0.75245


# **Random forest**

In [47]:
# Initialize and train the Random Forest model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Make predictions and calculate accuracy
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf}")

Random Forest Accuracy: 0.8104


# **XG Boost**

In [48]:
# Initialize and train the XGBoost model
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Make predictions and calculate accuracy
y_pred_xgb = xgb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {accuracy_xgb}")

XGBoost Accuracy: 0.7656


# **Comparing model**

In [49]:
import plotly.graph_objects as go

# Accuracy scores for each classifier
accuracies = {
    'Logistic Regression': accuracy_lr,
    'Decision Tree': accuracy_dt,
    'Random Forest': accuracy_rf,
    'XGBoost': accuracy_xgb
}

# Create a bar graph using Plotly
fig = go.Figure(data=[
    go.Bar(name='Accuracy', x=list(accuracies.keys()), y=list(accuracies.values()))
])

# Update the layout
fig.update_layout(title='Comparison of Model Accuracy Scores',
                  xaxis_title='Classifiers',
                  yaxis_title='Accuracy Score',
                  yaxis_range=[0.4, 1.0])

# Show the plot
fig.show()


# **New Data Predict**

In [72]:
# Sample data for prediction
sample_data = pd.DataFrame({
    'Annual_Income': [20867.670],
    'Monthly_Inhand_Salary': [6769.130000],
    'Num_Bank_Accounts': [6],
    'Num_Credit_Card': [5],
    'Interest_Rate': [8],
    'Num_of_Loan': [3],
    'Delay_from_due_date': [15],
    'Num_of_Delayed_Payment': [19],
    'Credit_Mix': [2],
    'Outstanding_Debt': [1109.03],
    'Credit_History_Age': [190],
    'Monthly_Balance': [236.241829]
})


In [73]:
# Predict using the trained RandomForestClassifier model
predictions = rf.predict(sample_data)

# Map the predicted values to labels
predicted_labels = ['Standard' if pred == 0 else 'Bad' if pred == 1 else 'Good' for pred in predictions]

# Print the predicted labels
print(predicted_labels)

['Good']
