# Credit Score Predictions
### Gauri Pala, Ayuj Verma, Ayush Bhalavat, Arnav Chopra

In [156]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [157]:
data = pd.read_csv('./train.csv')
data.head()

  data = pd.read_csv('./train.csv')


Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


# About:
[insert answers to questions here]

# Data Preproccesing:

## Exploring
The following cells display how we explored the data to see what types each of our features were, observe how many null values we have, how many records are in the dataset, etc.

In [158]:
# This is more specific than simply just using df.dtypes and helps us figure out why column 26 has mixed types.
for col in data.columns:
    print(f'{col} : {pd.api.types.infer_dtype(data[col])}')
  
# We need to one-hot-encode occupation, Type of Loans, Credit Mix, Payment of Minimum amount, Payment Behaviour
# We need to fix Age, Annual Income, Number of Loans, Number of Delayed Payments, 
#   Change in Credit Limit, Outstanding Debt, Credit History Age, Amount Invested Monthly, and Monthly Balance
# TODO: FOR EVERY NUMERICAL DATA TYPE, CHECK NEGATIVES AND HIGH OUTLIERS.
    

ID : string
Customer_ID : string
Month : string
Name : string
Age : string
SSN : string
Occupation : string
Annual_Income : string
Monthly_Inhand_Salary : floating
Num_Bank_Accounts : integer
Num_Credit_Card : integer
Interest_Rate : integer
Num_of_Loan : string
Type_of_Loan : string
Delay_from_due_date : integer
Num_of_Delayed_Payment : string
Changed_Credit_Limit : string
Num_Credit_Inquiries : floating
Credit_Mix : string
Outstanding_Debt : string
Credit_Utilization_Ratio : floating
Credit_History_Age : string
Payment_of_Min_Amount : string
Total_EMI_per_month : floating
Amount_invested_monthly : string
Payment_Behaviour : string
Monthly_Balance : mixed
Credit_Score : string


In [159]:
print(data.isnull().sum())

ID                              0
Customer_ID                     0
Month                           0
Name                         9985
Age                             0
SSN                             0
Occupation                      0
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts               0
Num_Credit_Card                 0
Interest_Rate                   0
Num_of_Loan                     0
Type_of_Loan                11408
Delay_from_due_date             0
Num_of_Delayed_Payment       7002
Changed_Credit_Limit            0
Num_Credit_Inquiries         1965
Credit_Mix                      0
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Credit_History_Age           9030
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Payment_Behaviour               0
Monthly_Balance              1200
Credit_Score                    0
dtype: int64


In [160]:
# Multiple types in column 26
column_data = data.iloc[ : , 26] 
print(column_data.apply(type).value_counts())

Monthly_Balance
<class 'str'>      97132
<class 'float'>     2868
Name: count, dtype: int64


In [161]:
# Ensure that each customer has the same number of records before grouping them in the next steps.
(data.Customer_ID.value_counts() != 8).any()

False

## Irrelevant Data
Some features are not relevant when training a machine learning model.

**Dropped Features:**
1. ID: identifies the record
2. Name: name of the customer
3. SSN: social security number of the customer
4. Month: month of the year for when the record occurred
These are all features that simply identify the record.

In [162]:
# Drop useless features.
data.drop(['ID', 'Name', 'SSN', 'Month', 'Type_of_Loan', 'Credit_History_Age'], axis = 1, inplace = True)
data.head()
# TODO: TAKE OUT TYPE OF LOAN AND CREDIT HISTORY AGE FROM THE DROP

Unnamed: 0,Customer_ID,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,CUS_0xd40,23,Scientist,19114.12,1824.843333,3,4,3,4,3,...,4.0,_,809.98,26.82262,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,CUS_0xd40,23,Scientist,19114.12,,3,4,3,4,-1,...,4.0,Good,809.98,31.94496,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,CUS_0xd40,-500,Scientist,19114.12,,3,4,3,4,3,...,4.0,Good,809.98,28.609352,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,CUS_0xd40,23,Scientist,19114.12,,3,4,3,4,5,...,4.0,Good,809.98,31.377862,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,CUS_0xd40,23,Scientist,19114.12,1824.843333,3,4,3,4,6,...,4.0,Good,809.98,24.797347,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


## Cleaning Each Feature
Each feature needs to be explored, cleaned, and engineered before we proceed with training a model

### Age
There were some records (like record 2) with a negative age, which does not make sense, so we decided to change the value to the mode age of the cutomer. Furthermore, since the dataset has 8 records per customer over time, people got older within these 8 records as they had their birthday. Therefore, for each customer, we assigned them the age that was the mode from the 8 records.

In [163]:
# Convert 'Age' from string to float
data['Age'] = data['Age'].str.replace('_', '')
data['Age'] = data['Age'].astype(float)

# Make all age values the same (the mode) for each customer.
grouped_data = data.groupby('Customer_ID')
mode_age = grouped_data['Age'].transform(lambda x: x.mode()[0]) # If there is a tie in mode (customer had a birthday), take the first mode.
data['Age'] = mode_age
print(data['Age'].describe())

print(data['Age'])

# Age now looks good.

count    100000.000000
mean         33.274560
std          10.764438
min          14.000000
25%          24.000000
50%          33.000000
75%          42.000000
max          56.000000
Name: Age, dtype: float64
0        23.0
1        23.0
2        23.0
3        23.0
4        23.0
         ... 
99995    25.0
99996    25.0
99997    25.0
99998    25.0
99999    25.0
Name: Age, Length: 100000, dtype: float64


### Annual Income
The original data type of this feature was a string, so we converted it to an float.

In [164]:
# Annual income should change from a string to a float.
data['Annual_Income'] = data['Annual_Income'].str.replace('_', '')
data['Annual_Income'] = data['Annual_Income'].astype(float)


# Make all Annual Income values the same (the mode) for each customer.
grouped_data = data.groupby('Customer_ID')
mode_ann_income = grouped_data['Annual_Income'].transform(lambda x: x.mode()[0]) # If there is a tie in Annual Income (customer had a promotion), take the first mode.
data['Annual_Income'].fillna(mode_ann_income, inplace=True)

print(data['Annual_Income'].describe())
# print(data['Annual_Income'])
# Annual Income now looks good.

count    1.000000e+05
mean     1.764157e+05
std      1.429618e+06
min      7.005930e+03
25%      1.945750e+04
50%      3.757861e+04
75%      7.279092e+04
max      2.419806e+07
Name: Annual_Income, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Annual_Income'].fillna(mode_ann_income, inplace=True)


### Number of Loans
The orignal data type of this feature was a string, so we converted it to a float. Furthermore, we cannot have negative loans, so we dropped those specific records. Finally, some records have really large outliers that simply do not make sense in a real world context, so we dropped those records as well. There are no records with null values for number of loans, so we don't need to do anything there.

In [165]:
# Number of loans should change from a string to a float.
data['Num_of_Loan'] = data['Num_of_Loan'].str.replace('_', '')
data['Num_of_Loan'] = data['Num_of_Loan'].astype(float)
negative_loans = data[data['Num_of_Loan'] < 0]['Num_of_Loan'].value_counts()
print(negative_loans)

grouped_data = data.groupby('Customer_ID')
mode_num_loans = grouped_data['Num_of_Loan'].transform(lambda x: x.mode())

# People can't have negative loans, so we drop these records.
negative_loan_indices = data[data['Num_of_Loan'] < 0].index
data.iloc[negative_loan_indices, data.columns.get_loc('Num_of_Loan')] = mode_num_loans.iloc[negative_loan_indices]
data['Num_of_Loan'] = data['Num_of_Loan'].fillna(mode_num_loans)
print(data.isnull().sum())
# data = data.drop(negative_loan_indices)
# data.reset_index(drop = True, inplace = True)
# negative_loans = data[data['Num_of_Loan'] < 0]['Num_of_Loan'].any()
# print(negative_loans)
# No more negative loans.
print(data['Num_of_Loan'].describe())
print(data['Num_of_Loan'].size)
# Some people have 1000+ loans. We will determine outliers to be more than 10 loans.
# large_loan_indices = data[data['Num_of_Loan'] > 10].index
# data = data.drop(large_loan_indices)
# data.reset_index(drop = True, inplace = True)
# large_loans = data[data['Num_of_Loan'] > 10]['Num_of_Loan'].any()
# print(large_loans)
# Number of Loans now looks good.

Num_of_Loan
-100.0    3876
Name: count, dtype: int64
Customer_ID                     0
Age                             0
Occupation                      0
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts               0
Num_Credit_Card                 0
Interest_Rate                   0
Num_of_Loan                  3876
Delay_from_due_date             0
Num_of_Delayed_Payment       7002
Changed_Credit_Limit            0
Num_Credit_Inquiries         1965
Credit_Mix                      0
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Payment_Behaviour               0
Monthly_Balance              1200
Credit_Score                    0
dtype: int64
count    96124.000000
mean         7.163622
std         60.314923
min          0.000000
25%          2.000000
50%          3.000000
75%          5.000000
max       1496.000000
Name: Num_

### Number of Delayed Payments
The orignal data type of this feature was a string, so we converted it to a float. Furthermore, we cannot have a negative amount of delayed payments, so we dropped those specific records. Finally, some records have really large outliers that simply do not make sense in a real world context, so we dropped those records as well. Lastly, we impute the null values in this feature with the median, which is 14.

In [11]:
# Number of delayed payments should change from a string to a float.
data['Num_of_Delayed_Payment'] = data['Num_of_Delayed_Payment'].str.replace('_', '')
data['Num_of_Delayed_Payment'] = data['Num_of_Delayed_Payment'].astype(float)

negative_delayed_payments = data[data['Num_of_Delayed_Payment'] < 0]['Num_of_Delayed_Payment'].value_counts()
print(negative_delayed_payments)
# People can't have negative delayed payments, so we drop these records.
negative_delayed_payments = data[data['Num_of_Delayed_Payment'] < 0].index
data = data.drop(negative_delayed_payments)
data.reset_index(drop = True, inplace = True)
negative_delayed_payments = data[data['Num_of_Delayed_Payment'] < 0]['Num_of_Delayed_Payment'].any()
print(negative_delayed_payments)
# No more negative delayed payments.

num_delayed_payments = data["Num_of_Delayed_Payment"].value_counts()
print(num_delayed_payments)
# Some of these are in the thousands.
print(data["Num_of_Delayed_Payment"].describe())
# The 25th percentile is 9 and the 75th percentile is 18, so the IQR is 9. Outliers are anything above 18 + 1.5 * 9 = 31.5.
large_delayed_payments = data[data['Num_of_Delayed_Payment'] > 31.5].index
data = data.drop(large_delayed_payments)
data.reset_index(drop = True, inplace = True)
large_delayed_payments = data[data['Num_of_Delayed_Payment'] > 10]['Num_of_Delayed_Payment'].any()
print(large_delayed_payments)
print(data["Num_of_Delayed_Payment"].describe())

# Handle NaNs
print(data['Num_of_Delayed_Payment'].isnull().sum())
data['Num_of_Delayed_Payment'].fillna(14, inplace = True)
print(data['Num_of_Delayed_Payment'].isnull().sum())

# Number of Delayed Payments now looks good.

Num_of_Delayed_Payment
-1.0    297
-2.0    221
-3.0     93
Name: count, dtype: int64
False
Num_of_Delayed_Payment
19.0      5190
17.0      5181
10.0      5104
16.0      5076
15.0      5019
          ... 
4134.0       1
1530.0       1
1502.0       1
4075.0       1
2047.0       1
Name: count, Length: 682, dtype: int64
count    88371.000000
mean        31.261047
std        227.582135
min          0.000000
25%          9.000000
50%         14.000000
75%         18.000000
max       4397.000000
Name: Num_of_Delayed_Payment, dtype: float64
True
count    87663.000000
mean        13.417029
std          6.207115
min          0.000000
25%          9.000000
50%         14.000000
75%         18.000000
max         28.000000
Name: Num_of_Delayed_Payment, dtype: float64
6670
0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Num_of_Delayed_Payment'].fillna(14, inplace = True)


### Changed Credit Limit
The orignal data type of this feature was a string, so we converted it to a float.

In [12]:
# Change in credit limit should change from a string to a float.
data['Changed_Credit_Limit'] = data['Changed_Credit_Limit'].str.replace('_', '')
data['Changed_Credit_Limit'] = data['Changed_Credit_Limit'].replace("", np.nan)
data['Changed_Credit_Limit'] = data['Changed_Credit_Limit'].astype(float)
print(data['Changed_Credit_Limit'].describe())
# Change in credit limit now looks good. The mean is close to the median and negative values are okay.

count    92366.000000
mean        10.421804
std          6.801620
min         -6.490000
25%          5.350000
50%          9.430000
75%         14.960000
max         36.490000
Name: Changed_Credit_Limit, dtype: float64


### Amount Invested Monthly

In [13]:
data['Amount_invested_monthly'] = data['Amount_invested_monthly'].str.replace('_', '')
data['Amount_invested_monthly'] = data['Amount_invested_monthly'].astype(float)

### Monthly Balance

In [14]:
data['Monthly_Balance'].describe()
data.drop(data[data["Monthly_Balance"]=='__-333333333333333333333333333__'].index,inplace=True)
data['Monthly_Balance'] = data['Monthly_Balance'].str.replace('_', '')
data['Monthly_Balance'] = data['Monthly_Balance'].astype(float)

mean_montly_bal = data["Monthly_Balance"].mean()
data["Monthly_Balance"].fillna(mean_montly_bal, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Monthly_Balance"].fillna(mean_montly_bal, inplace=True)


### Outstanding Debt

In [15]:
data['Outstanding_Debt'] = data['Outstanding_Debt'].str.replace('_', '')
data['Outstanding_Debt'] = data['Outstanding_Debt'].astype(float)

### Credit History Age

In [16]:
# data['Credit_History_Age'] = data['Credit_History_Age'].dropna(how='NaN')
# print(data['Credit_History_Age'].isnull().sum())
# def age_to_months(age_str):

#     # if pd.isnull(age_str):
#     #     return 0  # Return 0 for NaN values
#     years, months = map(int, age_str.split(' Years and ')[1].split(' Months'))
#     total_months = years * 12 + months
#     return total_months

# # Apply the function to the 'age' column
# data['Credit_History_Age_Months'] = data['Credit_History_Age'].apply(age_to_months)

# # grouped_data = data.groupby('Customer_ID')
# # mean_credit_age = grouped_data['Credit_History_Age_Months'].mean()
# # data['Credit_History_Age'].fillna(mean_credit_age, inplace=True)


# print(data['Credit_History_Age'])

# data['Credit_History_Age'] = data['Credit_History_Age'].str.replace('_', '')
# data['Credit_History_Age'] = data['Credit_History_Age'].astype(float)

### Monthly Inhand Salary

In [17]:
mean_salary = data["Monthly_Inhand_Salary"].mean()
data["Monthly_Inhand_Salary"].fillna(mean_salary, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Monthly_Inhand_Salary"].fillna(mean_salary, inplace=True)


### Changed Credit Limit

In [18]:
changed_mean=data["Changed_Credit_Limit"].mean()
data["Changed_Credit_Limit"].fillna(changed_mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Changed_Credit_Limit"].fillna(changed_mean, inplace=True)


### Number of Credit Inquiries

In [19]:
in_mean=data["Num_Credit_Inquiries"].mean()
data["Num_Credit_Inquiries"].fillna(in_mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Num_Credit_Inquiries"].fillna(in_mean, inplace=True)


### Amount Invested Monthly

In [20]:
data["Amount_invested_monthly"] = pd.to_numeric(data["Amount_invested_monthly"], errors="coerce")
invested_mean=data["Amount_invested_monthly"].mean()
data["Amount_invested_monthly"].fillna(invested_mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Amount_invested_monthly"].fillna(invested_mean, inplace=True)


## One Hot Encoding
The following features need to be one hot encoded:
Type of Loans, Credit Mix, Payment of Minimum amount, Payment Behaviour, Occupation

### Occupation
Drop the records that don't have an occupation since we can't just assign the mean or median as it is categorical. One hot encode the rest of the records

In [21]:
data.drop(data[data["Occupation"]=='_______'].index,inplace=True)
data = pd.get_dummies(data=data, columns=['Occupation'], drop_first=True)
data.head()

Unnamed: 0,Customer_ID,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Occupation_Entrepreneur,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer
0,CUS_0xd40,23.0,19114.12,1824.843333,3,4,3,4.0,3,7.0,...,False,False,False,False,False,False,False,True,False,False
1,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,-1,14.0,...,False,False,False,False,False,False,False,True,False,False
2,CUS_0xd40,-500.0,19114.12,4183.562918,3,4,3,4.0,3,7.0,...,False,False,False,False,False,False,False,True,False,False
3,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,5,4.0,...,False,False,False,False,False,False,False,True,False,False
4,CUS_0xd40,23.0,19114.12,1824.843333,3,4,3,4.0,6,14.0,...,False,False,False,False,False,False,False,True,False,False


### Credit Mix
One hot encoded the values and dropped the records with no credit mix.

In [22]:
data.drop(data[data["Credit_Mix"]=='_'].index,inplace=True)
data = pd.get_dummies(data=data, columns=['Credit_Mix'], drop_first=True)
data.head()


Unnamed: 0,Customer_ID,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Credit_Mix_Good,Credit_Mix_Standard
1,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,-1,14.0,...,False,False,False,False,False,True,False,False,True,False
2,CUS_0xd40,-500.0,19114.12,4183.562918,3,4,3,4.0,3,7.0,...,False,False,False,False,False,True,False,False,True,False
3,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,5,4.0,...,False,False,False,False,False,True,False,False,True,False
4,CUS_0xd40,23.0,19114.12,1824.843333,3,4,3,4.0,6,14.0,...,False,False,False,False,False,True,False,False,True,False
5,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,8,4.0,...,False,False,False,False,False,True,False,False,True,False


### Payment Behavior

In [23]:
data.drop(data[data["Payment_Behaviour"]=='!@9#%8'].index,inplace=True)
data = pd.get_dummies(data=data, columns=['Payment_Behaviour'], drop_first=True)
data.head()

Unnamed: 0,Customer_ID,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Credit_Mix_Good,Credit_Mix_Standard,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments
1,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,-1,14.0,...,True,False,False,True,False,False,False,True,False,False
2,CUS_0xd40,-500.0,19114.12,4183.562918,3,4,3,4.0,3,7.0,...,True,False,False,True,False,False,False,False,True,False
3,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,5,4.0,...,True,False,False,True,False,False,False,False,False,True
4,CUS_0xd40,23.0,19114.12,1824.843333,3,4,3,4.0,6,14.0,...,True,False,False,True,False,True,False,False,False,False
6,CUS_0xd40,23.0,19114.12,1824.843333,3,4,3,4.0,3,8.0,...,True,False,False,True,False,False,False,False,False,True


### Payment of Minimum Amount

In [24]:
data.drop(data[data["Payment_of_Min_Amount"]=='!@9#%8'].index,inplace=True)
data = pd.get_dummies(data=data, columns=['Payment_of_Min_Amount'], drop_first=True)
data.head()

Unnamed: 0,Customer_ID,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Occupation_Writer,Credit_Mix_Good,Credit_Mix_Standard,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes
1,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,-1,14.0,...,False,True,False,False,False,True,False,False,True,False
2,CUS_0xd40,-500.0,19114.12,4183.562918,3,4,3,4.0,3,7.0,...,False,True,False,False,False,False,True,False,True,False
3,CUS_0xd40,23.0,19114.12,4183.562918,3,4,3,4.0,5,4.0,...,False,True,False,False,False,False,False,True,True,False
4,CUS_0xd40,23.0,19114.12,1824.843333,3,4,3,4.0,6,14.0,...,False,True,False,True,False,False,False,False,True,False
6,CUS_0xd40,23.0,19114.12,1824.843333,3,4,3,4.0,3,8.0,...,False,True,False,False,False,False,False,True,True,False


In [25]:
for col in data.columns:
    print(f'{col} : {pd.api.types.infer_dtype(data[col])}')

print(data.isnull().sum())

Customer_ID : string
Age : floating
Annual_Income : floating
Monthly_Inhand_Salary : floating
Num_Bank_Accounts : integer
Num_Credit_Card : integer
Interest_Rate : integer
Num_of_Loan : floating
Delay_from_due_date : integer
Num_of_Delayed_Payment : floating
Changed_Credit_Limit : floating
Num_Credit_Inquiries : floating
Outstanding_Debt : floating
Credit_Utilization_Ratio : floating
Total_EMI_per_month : floating
Amount_invested_monthly : floating
Monthly_Balance : floating
Credit_Score : string
Occupation_Architect : boolean
Occupation_Developer : boolean
Occupation_Doctor : boolean
Occupation_Engineer : boolean
Occupation_Entrepreneur : boolean
Occupation_Journalist : boolean
Occupation_Lawyer : boolean
Occupation_Manager : boolean
Occupation_Mechanic : boolean
Occupation_Media_Manager : boolean
Occupation_Musician : boolean
Occupation_Scientist : boolean
Occupation_Teacher : boolean
Occupation_Writer : boolean
Credit_Mix_Good : boolean
Credit_Mix_Standard : boolean
Payment_Behaviou

# Building the Machine Learning Model

### Holdout Method for Decision Trees

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

data = data.drop(['Customer_ID'], axis=1) # Features for the data

data_y = data['Credit_Score'] # Labels for the data
data_x = data.drop(['Credit_Score'], axis=1) # Features for the data

print(data_x)

X_train, X_test, Y_train, Y_test = train_test_split(data_x, data_y, test_size=.2, train_size=.8)
print(f'Records in Training Set: {len(X_train)}')
print(f'Records in Testing Set: {len(X_test)}')

# Decision Tree
clf = DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(X=X_train, y=Y_train) # Fit decision tree on the training set

# Make Predictions
predictions = clf.predict(X=X_test)

# Check Accuracy
accuracy = accuracy_score(y_true=Y_test, y_pred=predictions)

print(f'Accuracy on this training set is {accuracy * 100}%')

         Age  Annual_Income  Monthly_Inhand_Salary  Num_Bank_Accounts  \
1       23.0       19114.12            4183.562918                  3   
2     -500.0       19114.12            4183.562918                  3   
3       23.0       19114.12            4183.562918                  3   
4       23.0       19114.12            1824.843333                  3   
6       23.0       19114.12            1824.843333                  3   
...      ...            ...                    ...                ...   
94323   28.0       20002.88            1929.906667                 10   
94324   29.0       20002.88            1929.906667                 10   
94326   25.0       39628.99            3359.415833                  4   
94330   25.0       39628.99            3359.415833                  4   
94331   25.0       39628.99            3359.415833                  4   

       Num_Credit_Card  Interest_Rate  Num_of_Loan  Delay_from_due_date  \
1                    4              3          4

### Nested Cross Validation with Decision Trees

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeClassifier()
data_y = data['Credit_Score'] # Labels for the data
data_x = data.drop(['Credit_Score'], axis=1) # Features for the data
params = {"max_depth": [5,10,15,20] ,
         "min_samples_leaf": [5,10,15,20], "max_features": [5,10,15]}
grid_search = GridSearchCV(clf, params, cv=5, scoring='accuracy')

grid_search.fit(data_x, data_y)

# What did it find as the best max_depth?
print("Best Parameters:", grid_search.best_params_)
# What was the accuracy at this best max_depth?
print(f'Accuracy: {grid_search.best_score_*100}%')

Best Parameters: {'max_depth': 10, 'max_features': 15, 'min_samples_leaf': 5}
Accuracy: 69.26648925720204%


### Naive Bayes and Nested Cross Validation

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

gnb = GaussianNB()
accuracies = cross_val_score(gnb, X=data_x, y=data_y, cv=10)
print(f'Average Accuracy: {sum(accuracies) / len(accuracies) * 100}%')

Average Accuracy: 54.113312405991074%


### KNN

In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

scaler = StandardScaler()
pca = PCA()
knn = KNeighborsClassifier(n_neighbors=7)
pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('knn', knn)])
accuracies = cross_val_score(estimator=pipeline, X=data_x, y=data_y, cv=5)
print(f'Average Accuracy: {sum(accuracies) / len(accuracies) * 100}%')

Average Accuracy: 62.19450318623954%


### SVM

In [31]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

scaler = StandardScaler()
pca = PCA()
svc = SVC()
pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('svc', svc)])

param_grid = {
    'pca__n_components': list(range(5, 8)),
    'svc__kernel': ['linear', 'rbf', 'poly']
}


grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X=data_x, y=data_y)

nested_preds = cross_val_predict(grid_search, data_x, data_y, cv=10)

print(f"Accuracy: {accuracy_score(y_true=data_y, y_pred=nested_preds)}")
print('Classification Report:')
print(classification_report(y_true=data_y, y_pred=nested_preds))


KeyboardInterrupt: 

### Neural Network

In [32]:
from sklearn.neural_network import MLPClassifier

pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('neural_net', MLPClassifier())])
param_grid = {
    'neural_net__hidden_layer_sizes': list(range(30, 61, 10)),
    'neural_net__activation': ['logistic', 'tanh', 'relu']
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
nested_score = cross_val_score(grid_search, data_x, data_y, cv=5)
print(f'Accuracy: {nested_score.mean()*100}%')


