In [162]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [163]:
df = pd.read_csv("train_cleaned.csv",low_memory=False)

In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89930 entries, 0 to 89929
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Month                     89930 non-null  int64  
 1   Name                      89930 non-null  object 
 2   Age                       89930 non-null  int64  
 3   SSN                       89930 non-null  object 
 4   Occupation                89930 non-null  object 
 5   Annual_Income             89930 non-null  float64
 6   Monthly_Inhand_Salary     89930 non-null  float64
 7   Num_Bank_Accounts         89930 non-null  int64  
 8   Num_Credit_Card           89930 non-null  int64  
 9   Interest_Rate             89930 non-null  int64  
 10  Num_of_Loan               89930 non-null  int64  
 11  Type_of_Loan              89930 non-null  object 
 12  Delay_from_due_date       89930 non-null  int64  
 13  Num_of_Delayed_Payment    89930 non-null  int64  
 14  Change

| **Feature**                        | **Retained/Dropped**                                                                 |
|------------------------------------|-------------------------------------------------------------------------------------|
| **Month**                          | Dropped                                                   |
| **Name**                           | Dropped                                               |
| **Age**                            | Retained                                     |
| **SSN**                            | Dropped            |
| **Occupation**                     | Retained         |
| **Annual_Income**                  | Retained                       |
| **Monthly_Inhand_Salary**          | Dropped                                      |
| **Num_Bank_Accounts**              | Dropped                              |
| **Num_Credit_Card**                | Retained  |
| **Interest_Rate**                  | Retained                      |
| **Num_of_Loan**                    | Retained                  |
| **Type_of_Loan**                   | Retained                                      |
| **Delay_from_due_date**            | Dropped                                |
| **Num_of_Delayed_Payment**         | Dropped
| **Changed_Credit_Limit**           | Retained                            |
| **Num_Credit_Inquiries**           | Dropped                                    |
| **Credit_Mix**                     | Dropped                        |
| **Outstanding_Debt**               | Retained:                                                                           |
| **Credit_Utilization_Ratio**       | Retained:                                                                           |
| **Credit_History_Age**             | Dropped:                                                                            |
| **Payment_of_Min_Amount**          | Dropped:                                                                            |
| **Total_EMI_per_month**            | Retained:                                                                           |
| **Amount_invested_monthly**        | Dropped:                                                                            |
| **Payment_Behaviour**              | Retained:                                                                           |
| **Monthly_Balance**                | Dropped:                                                                            |
| **Credit_Score**                   | Retained:                                                                           |


In [165]:
df.drop(['Month',
         'Name',
         'SSN',
         'Monthly_Inhand_Salary',
         'Num_Bank_Accounts',
         'Delay_from_due_date',
         'Num_of_Delayed_Payment',
         'Num_Credit_Inquiries',
         'Credit_Mix',
         'Credit_History_Age',
         'Payment_of_Min_Amount',
         'Amount_invested_monthly',
         'Monthly_Balance'],
         axis=1,
         inplace=True)


In [166]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89930 entries, 0 to 89929
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       89930 non-null  int64  
 1   Occupation                89930 non-null  object 
 2   Annual_Income             89930 non-null  float64
 3   Num_Credit_Card           89930 non-null  int64  
 4   Interest_Rate             89930 non-null  int64  
 5   Num_of_Loan               89930 non-null  int64  
 6   Type_of_Loan              89930 non-null  object 
 7   Changed_Credit_Limit      89930 non-null  float64
 8   Outstanding_Debt          89930 non-null  float64
 9   Credit_Utilization_Ratio  89930 non-null  float64
 10  Total_EMI_per_month       89930 non-null  float64
 11  Payment_Behaviour         89930 non-null  object 
 12  Credit_Score              89930 non-null  object 
dtypes: float64(5), int64(4), object(4)
memory usage: 8.9+ MB


## Converting String columns to Numerical for ML models. 

In [167]:
df['Occupation'].unique()

array(['Scientist', 'Teacher', 'Engineer', 'Entrepreneur', 'Developer',
       'Lawyer', 'Media_Manager', 'Doctor', 'Journalist', 'Manager',
       'Accountant', 'Musician', 'Mechanic', 'Writer', 'Architect',
       'Unknown'], dtype=object)

In [168]:
df['Type_of_Loan'].unique()

array(['Auto Loan', 'Credit-Builder Loan', 'Not Specified', 'missing',
       'Personal Loan', 'Payday Loan', 'Mortgage Loan',
       'Home Equity Loan', 'Debt Consolidation Loan', 'Student Loan'],
      dtype=object)

In [169]:
df['Payment_Behaviour'].unique()

array(['High_spent_Medium_value_payments',
       'High_spent_Large_value_payments',
       'Low_spent_Large_value_payments', 'Low_spent_Small_value_payments',
       'Low_spent_Medium_value_payments',
       'High_spent_Small_value_payments'], dtype=object)

In [170]:
df['Credit_Score'].unique()

array(['Good', 'Standard', 'Poor'], dtype=object)

# one hot encoding

In [171]:
def include_in_df(x, df=df):
    df1 = pd.get_dummies(df[x], drop_first=True, dtype=int)
    ddf = pd.concat([df,df1],axis=1)
    return ddf

In [172]:
df1 = include_in_df('Type_of_Loan',df)
df1 =include_in_df('Occupation',df)
df1.drop(['Type_of_Loan','Occupation'], axis=1, inplace=True)

In [173]:
df1.head()

Unnamed: 0,Age,Annual_Income,Num_Credit_Card,Interest_Rate,Num_of_Loan,Changed_Credit_Limit,Outstanding_Debt,Credit_Utilization_Ratio,Total_EMI_per_month,Payment_Behaviour,...,Journalist,Lawyer,Manager,Mechanic,Media_Manager,Musician,Scientist,Teacher,Unknown,Writer
0,23,19114.12,4,3,4,11.27,809.98,22.537593,49.574949,High_spent_Medium_value_payments,...,0,0,0,0,0,0,1,0,0,0
1,23,19114.12,4,3,4,11.27,809.98,22.537593,49.574949,High_spent_Medium_value_payments,...,0,0,0,0,0,0,1,0,0,0
2,32,19114.12,4,3,4,11.27,809.98,22.537593,49.574949,High_spent_Medium_value_payments,...,0,0,0,0,0,0,1,0,0,0
3,23,19114.12,4,3,4,11.27,809.98,22.537593,49.574949,High_spent_Medium_value_payments,...,0,0,0,0,0,0,1,0,0,0
4,23,19114.12,4,3,4,11.27,809.98,22.537593,49.574949,High_spent_Medium_value_payments,...,0,0,0,0,0,0,1,0,0,0


## Nominal Encoding

In [174]:
df['Payment_Behaviour'].unique()

array(['High_spent_Medium_value_payments',
       'High_spent_Large_value_payments',
       'Low_spent_Large_value_payments', 'Low_spent_Small_value_payments',
       'Low_spent_Medium_value_payments',
       'High_spent_Small_value_payments'], dtype=object)

In [175]:
def spent(x):
        if x =='High_spent_Large_value_payments': return 6
        elif x =='High_spent_Medium_value_payments': return 5
        elif x =='High_spent_Small_value_payments':  return 4
        elif x =='Low_spent_Large_value_payments':   return 3
        elif x =='Low_spent_Medium_value_payments':  return 2
        elif x =='Low_spent_Small_value_payments':   return 1

In [176]:
def c_score(x):
    if x == 'Good':return 2
    elif x == 'Standard':return 1
    else: return 0
    

In [177]:
df1['Payment_Behaviour'] = df['Payment_Behaviour'].apply(spent)
df1['Credit_Score'] = df['Credit_Score'].apply(c_score)

In [181]:
df1['Payment_Behaviour'].unique()

array([5, 6, 3, 1, 2, 4])

In [182]:
df1['Credit_Score'].unique()

array([2, 1, 0])

## Train dataset is ready let's make test dataset ready

In [183]:
test = pd.read_csv('test_cleaned.csv',low_memory=False)

In [184]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44945 entries, 0 to 44944
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Month                     44945 non-null  int64  
 1   Name                      44945 non-null  object 
 2   Age                       44945 non-null  int64  
 3   SSN                       44945 non-null  object 
 4   Occupation                44945 non-null  object 
 5   Annual_Income             44945 non-null  float64
 6   Monthly_Inhand_Salary     44945 non-null  float64
 7   Num_Bank_Accounts         44945 non-null  int64  
 8   Num_Credit_Card           44945 non-null  int64  
 9   Interest_Rate             44945 non-null  int64  
 10  Num_of_Loan               44945 non-null  int64  
 11  Type_of_Loan              44945 non-null  object 
 12  Delay_from_due_date       44945 non-null  int64  
 13  Num_of_Delayed_Payment    44945 non-null  int64  
 14  Change

In [185]:
test.columns

Index(['Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance'],
      dtype='object')

In [186]:
def test_cleaner(df):
    df.drop(['Month',
         'Name',
         'SSN',
         'Monthly_Inhand_Salary',
         'Num_Bank_Accounts',
         'Delay_from_due_date',
         'Num_of_Delayed_Payment',
         'Num_Credit_Inquiries',
         'Credit_Mix',
         'Credit_History_Age',
         'Payment_of_Min_Amount',
         'Amount_invested_monthly',
         'Monthly_Balance'],
         axis=1,
         inplace=True)
    df1 = include_in_df('Type_of_Loan',df)
    df1 =include_in_df('Occupation',df)
    df1['Payment_Behaviour'] = df['Payment_Behaviour'].apply(spent)
    df1.drop(['Type_of_Loan','Occupation'], axis=1, inplace=True)
    return df1


In [187]:
test_df = test_cleaner(test)

In [188]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89930 entries, 0 to 89929
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       89930 non-null  int64  
 1   Annual_Income             89930 non-null  float64
 2   Num_Credit_Card           89930 non-null  int64  
 3   Interest_Rate             89930 non-null  int64  
 4   Num_of_Loan               89930 non-null  int64  
 5   Changed_Credit_Limit      89930 non-null  float64
 6   Outstanding_Debt          89930 non-null  float64
 7   Credit_Utilization_Ratio  89930 non-null  float64
 8   Total_EMI_per_month       89930 non-null  float64
 9   Payment_Behaviour         89930 non-null  int64  
 10  Credit_Score              89930 non-null  int64  
 11  Architect                 89930 non-null  int64  
 12  Developer                 89930 non-null  int64  
 13  Doctor                    89930 non-null  int64  
 14  Engine

In [189]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44945 entries, 0 to 44944
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       44945 non-null  int64  
 1   Annual_Income             44945 non-null  float64
 2   Num_Credit_Card           44945 non-null  int64  
 3   Interest_Rate             44945 non-null  int64  
 4   Num_of_Loan               44945 non-null  int64  
 5   Changed_Credit_Limit      44945 non-null  float64
 6   Outstanding_Debt          44945 non-null  float64
 7   Credit_Utilization_Ratio  44945 non-null  float64
 8   Total_EMI_per_month       44945 non-null  float64
 9   Payment_Behaviour         44945 non-null  int64  
 10  Architect                 44945 non-null  int64  
 11  Developer                 44945 non-null  int64  
 12  Doctor                    44945 non-null  int64  
 13  Engineer                  44945 non-null  int64  
 14  Entrep

#  ML


In [None]:
from sklearn.model_selection import train_test_split
train , eval = train_test_split()

In [190]:
x_train = df1.drop('Credit_Score',axis=1)
y_train = df1['Credit_Score']

In [194]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(x_train,y_train)


In [197]:
y_pred = model.predict(test_df)

In [198]:
y_pred

array([2, 2, 2, ..., 0, 0, 0])