In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Read the training CSV into the working frame used by the following cells.
df_train = pd.read_csv(filepath_or_buffer="../input/credit-dset/train.csv")

  df_train = pd.read_csv("../input/credit-dset/train.csv")


In [3]:
# string -> no of months
def convert_to_months(age_str):
    """Convert an age string of the form '<years> ... and <months> ...' to months.

    The value is split on the literal ' and '; the first whitespace-separated
    token of each side is read as an integer (years, then months).

    Parameters
    ----------
    age_str : str or missing value
        The text to convert. Anything ``pd.isna`` treats as missing is passed
        through as NaN.

    Returns
    -------
    int or float
        ``years * 12 + months`` for a parseable string, otherwise ``np.nan``.
        Values that are not strings, lack the ' and ' separator, or carry
        non-integer tokens yield NaN instead of raising, so a single malformed
        entry cannot abort a whole ``Series.apply`` call.
    """
    if pd.isna(age_str):
        return np.nan
    try:
        parts = age_str.split(' and ')
        years = int(parts[0].split()[0])
        months = int(parts[1].split()[0])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: separator or token missing;
        # ValueError: token is not an integer.
        return np.nan
    return (years * 12) + months

In [4]:
# Set the loan-type column aside, then remove it (together with 'Name') from the frame.
loan_type_col = df_train['Loan_Type']
df_train = df_train.drop(columns=['Name', 'Loan_Type'])

# Monthly base salary: parse as numbers; anything unparseable becomes NaN.
df_train['Base_Salary_PerMonth'] = pd.to_numeric(
    df_train['Base_Salary_PerMonth'],
    errors='coerce',
    downcast='float',
)

# Delayed-payment count: strip every character that is not a digit or '-'
# (e.g. stray underscores), then parse what is left as a number.
df_train['Total_Delayed_Payments'] = pd.to_numeric(
    df_train['Total_Delayed_Payments'].str.replace(r'[^-0-9]', '', regex=True),
    errors='coerce',
    downcast='float',
)

# Credit history age: string -> number of months via convert_to_months, stored as float.
df_train['Credit_History_Age'] = pd.to_numeric(
    df_train['Credit_History_Age'].apply(convert_to_months),
    errors='coerce',
    downcast='float',
)

In [5]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df_train.info()
print()  # blank line separating the two reports
print(loan_type_col)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        80000 non-null  object 
 1   Customer_ID               80000 non-null  object 
 2   Month                     80000 non-null  object 
 3   Age                       80000 non-null  object 
 4   Number                    80000 non-null  object 
 5   Profession                80000 non-null  object 
 6   Income_Annual             80000 non-null  object 
 7   Base_Salary_PerMonth      67968 non-null  float32
 8   Total_Bank_Accounts       80000 non-null  int64  
 9   Total_Credit_Cards        80000 non-null  int64  
 10  Rate_Of_Interest          80000 non-null  int64  
 11  Total_Current_Loans       80000 non-null  object 
 12  Delay_from_due_date       80000 non-null  int64  
 13  Total_Delayed_Payments    74405 non-null  float32
 14  Credit

In [6]:
# Remove fully duplicated rows, then report the missing-value count per column
# followed by the frame's (rows, columns) shape.
df_train.drop_duplicates(inplace=True)
print(df_train.isna().sum().to_string(), df_train.shape, sep="\n")

ID                              0
Customer_ID                     0
Month                           0
Age                             0
Number                          0
Profession                      0
Income_Annual                   0
Base_Salary_PerMonth        12032
Total_Bank_Accounts             0
Total_Credit_Cards              0
Rate_Of_Interest                0
Total_Current_Loans             0
Delay_from_due_date             0
Total_Delayed_Payments       5595
Credit_Limit                    0
Total_Credit_Enquiries       1549
Credit_Mix                      0
Current_Debt_Outstanding        0
Ratio_Credit_Utilization        0
Credit_History_Age           7240
Payment_of_Min_Amount           0
Per_Month_EMI                   0
Monthly_Investment           3605
Payment_Behaviour               0
Monthly_Balance               950
Credit_Score                    0
(80000, 26)


In [7]:
# Share of missing values per column, in percent of the current row count.
null_percentages = df_train.isna().sum().div(len(df_train)).mul(100)
# Keep only the columns that actually contain missing values.
null_cols = null_percentages[null_percentages > 0]
null_cols

Base_Salary_PerMonth      15.04000
Total_Delayed_Payments     6.99375
Total_Credit_Enquiries     1.93625
Credit_History_Age         9.05000
Monthly_Investment         4.50625
Monthly_Balance            1.18750
dtype: float64

In [8]:
# Columns with fewer than 5 % missing values: drop every row that has a
# missing value in any of them.
rows_to_drop = null_cols[null_cols < 5]
df_train.dropna(axis=0, how='any', subset=rows_to_drop.index, inplace=True)
# Report the remaining missing-value counts and the new shape.
print(df_train.isna().sum().to_string(), df_train.shape, sep="\n")

ID                              0
Customer_ID                     0
Month                           0
Age                             0
Number                          0
Profession                      0
Income_Annual                   0
Base_Salary_PerMonth        11110
Total_Bank_Accounts             0
Total_Credit_Cards              0
Rate_Of_Interest                0
Total_Current_Loans             0
Delay_from_due_date             0
Total_Delayed_Payments       5184
Credit_Limit                    0
Total_Credit_Enquiries          0
Credit_Mix                      0
Current_Debt_Outstanding        0
Ratio_Credit_Utilization        0
Credit_History_Age           6694
Payment_of_Min_Amount           0
Per_Month_EMI                   0
Monthly_Investment              0
Payment_Behaviour               0
Monthly_Balance                 0
Credit_Score                    0
(74028, 26)


In [9]:
# Columns whose missing-value percentage (taken from the existing null_cols
# series) is above 40 are removed entirely; duplicates are then dropped again.
columns_to_drop = null_cols[null_cols > 40]
df_train.drop(columns=columns_to_drop.index, inplace=True)
df_train.drop_duplicates(inplace=True)
print(df_train.shape)

(74028, 26)


In [10]:
# Recompute the percentage of missing values per column on the current frame
# and keep only the columns that still contain missing values.
null_percentages=(df_train.isna().sum()/df_train.shape[0])*100
null_cols = null_percentages.loc[null_percentages > 0]
print(null_cols,end = "\n\n")
# Columns with 5 % to 40 % missing values are filled with the column mean.
# The upper bound is inclusive: the column-drop step only removes columns
# strictly above 40 %, so a column at exactly 40 % would otherwise be neither
# dropped nor imputed.
col_impute = null_cols.loc[(null_cols >= 5) & (null_cols <= 40)]
for column in col_impute.keys():
    central_tend = df_train[column].mean()
    df_train[column] = df_train[column].fillna(central_tend)
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" that print(df_train.info()) would emit.
df_train.info()

Base_Salary_PerMonth      15.007835
Total_Delayed_Payments     7.002756
Credit_History_Age         9.042524
dtype: float64

<class 'pandas.core.frame.DataFrame'>
Index: 74028 entries, 0 to 79999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        74028 non-null  object 
 1   Customer_ID               74028 non-null  object 
 2   Month                     74028 non-null  object 
 3   Age                       74028 non-null  object 
 4   Number                    74028 non-null  object 
 5   Profession                74028 non-null  object 
 6   Income_Annual             74028 non-null  object 
 7   Base_Salary_PerMonth      74028 non-null  float32
 8   Total_Bank_Accounts       74028 non-null  int64  
 9   Total_Credit_Cards        74028 non-null  int64  
 10  Rate_Of_Interest          74028 non-null  int64  
 11  Total_Current_Loans       74028 non-null  object 
 1