# Imports

In [1]:
import numpy as np
import pandas as pd

# Filling NaNs

In [2]:
def fill_data_median(data: pd.DataFrame, columns_with_na: str | list) -> pd.DataFrame:
    if type(columns_with_na) is str:
        columns_with_na = [columns_with_na]
    medians = data[columns_with_na].median(axis = 0).transpose()
    filled_columns = data[columns_with_na].fillna(value=medians)
    return pd.concat([data.drop(columns_with_na, axis=1), filled_columns], axis=1)

In [3]:
def fill_data_mean(data: pd.DataFrame, columns_with_na: str | list) -> pd.DataFrame:
    if type(columns_with_na) is str:
        columns_with_na = [columns_with_na]
    medians = data[columns_with_na].mean(axis = 0).transpose()
    filled_columns = data[columns_with_na].fillna(value=medians)
    return pd.concat([data.drop(columns_with_na, axis=1), filled_columns], axis=1)

In [4]:
def fill_data_mode(data: pd.DataFrame, columns_with_na: str | list) -> pd.DataFrame:
    filled_data = data.copy()
    if type(columns_with_na) is str:
        columns_with_na = [columns_with_na]
    for column in columns_with_na:
        mode = filled_data[column].mode()[0]
        filled_data[column] = filled_data[column].fillna(mode)
    return filled_data

# Dataset

In [5]:
df = pd.read_csv('initial_dataset.csv')
df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amount,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [6]:
df = df.drop_duplicates()
df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amount,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [7]:
df = df[['person_age', 'person_income', 'person_emp_length', 'person_home_ownership', 'loan_intent', 'loan_amount', 'loan_int_rate', 'cb_person_default_on_file', 'loan_grade', 'cb_person_cred_hist_length', 'loan_status']]
df

Unnamed: 0,person_age,person_income,person_emp_length,person_home_ownership,loan_intent,loan_amount,loan_int_rate,cb_person_default_on_file,loan_grade,cb_person_cred_hist_length,loan_status
0,22,59000,123.0,RENT,PERSONAL,35000,16.02,Y,D,3,1
1,21,9600,5.0,OWN,EDUCATION,1000,11.14,N,B,2,0
2,25,9600,1.0,MORTGAGE,MEDICAL,5500,12.87,N,C,3,1
3,23,65500,4.0,RENT,MEDICAL,35000,15.23,N,C,2,1
4,24,54400,8.0,RENT,MEDICAL,35000,14.27,Y,C,4,1
...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,MORTGAGE,PERSONAL,5800,13.16,N,C,30,0
32577,54,120000,4.0,MORTGAGE,PERSONAL,17625,7.49,N,A,19,0
32578,65,76000,3.0,RENT,HOMEIMPROVEMENT,35000,10.99,N,B,28,1
32579,56,150000,5.0,MORTGAGE,PERSONAL,15000,11.48,N,B,26,0


In [8]:
mapping = {
    'person_age': 'age',
    'person_income': 'month_income',
    'person_home_ownership': 'home_ownership',
    'person_emp_length': 'employment_length',
    'loan_intent': 'loan_intent',
    'loan_amount': 'loan_amount',
    'loan_int_rate': 'loan_interest_rate',
    'loan_status': 'loan_status',
    'loan_grade': 'loan_grade',
    'cb_person_cred_hist_length': 'previous_loans',
    'cb_person_default_on_file': 'previous_default'
}
df.rename(columns=mapping, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=mapping, inplace=True)


Unnamed: 0,age,month_income,employment_length,home_ownership,loan_intent,loan_amount,loan_interest_rate,previous_default,loan_grade,previous_loans,loan_status
0,22,59000,123.0,RENT,PERSONAL,35000,16.02,Y,D,3,1
1,21,9600,5.0,OWN,EDUCATION,1000,11.14,N,B,2,0
2,25,9600,1.0,MORTGAGE,MEDICAL,5500,12.87,N,C,3,1
3,23,65500,4.0,RENT,MEDICAL,35000,15.23,N,C,2,1
4,24,54400,8.0,RENT,MEDICAL,35000,14.27,Y,C,4,1
...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,MORTGAGE,PERSONAL,5800,13.16,N,C,30,0
32577,54,120000,4.0,MORTGAGE,PERSONAL,17625,7.49,N,A,19,0
32578,65,76000,3.0,RENT,HOMEIMPROVEMENT,35000,10.99,N,B,28,1
32579,56,150000,5.0,MORTGAGE,PERSONAL,15000,11.48,N,B,26,0


In [9]:
df['loan_intent'].replace({'HOMEIMPROVEMENT': 'HOME_IMPROVEMENT', 'DEBTCONSOLIDATION': 'DEBT_CONSOLIDATION'}, inplace=True)
df['previous_default'].replace({'Y': 'YES', 'N': 'NO'}, inplace=True)
df['loan_status'] = df['loan_status'].replace({0: 1, 1: 0})

df.drop(df[df['age'] > 90].index, inplace = True) # Викидаємо людей який більше 90-та років
df.drop(df[df['month_income'] > 300_000].index, inplace = True) # Викидаємо людей які заробляють більше 300.000
df.loc[df['employment_length'] > 90, 'employment_length'] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['loan_intent'].replace({'HOMEIMPROVEMENT': 'HOME_IMPROVEMENT', 'DEBTCONSOLIDATION': 'DEBT_CONSOLIDATION'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['previous_default'].replace({'Y': 'YES', 'N': 'NO'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['loan_status'] = df['loan_status'].replace({0: 1, 1: 0})
A value is trying to be set on a copy of a slice from a DataFrame

In [10]:
df = fill_data_median(df, 'employment_length')
df = fill_data_mean(df, 'loan_interest_rate')

In [11]:
df['employment_length'] = df['employment_length'].astype('int')

In [12]:
df = df[['age', 'month_income', 'employment_length', 'home_ownership', 'loan_intent', 'loan_amount', 'loan_interest_rate', 'previous_loans', 'previous_default', 'loan_grade', 'loan_status']]
df

Unnamed: 0,age,month_income,employment_length,home_ownership,loan_intent,loan_amount,loan_interest_rate,previous_loans,previous_default,loan_grade,loan_status
0,22,59000,4,RENT,PERSONAL,35000,16.02,3,YES,D,0
1,21,9600,5,OWN,EDUCATION,1000,11.14,2,NO,B,1
2,25,9600,1,MORTGAGE,MEDICAL,5500,12.87,3,NO,C,0
3,23,65500,4,RENT,MEDICAL,35000,15.23,2,NO,C,0
4,24,54400,8,RENT,MEDICAL,35000,14.27,4,YES,C,0
...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1,MORTGAGE,PERSONAL,5800,13.16,30,NO,C,1
32577,54,120000,4,MORTGAGE,PERSONAL,17625,7.49,19,NO,A,1
32578,65,76000,3,RENT,HOME_IMPROVEMENT,35000,10.99,28,NO,B,0
32579,56,150000,5,MORTGAGE,PERSONAL,15000,11.48,26,NO,B,1


In [13]:
df.to_csv('credit_history.csv', index=False)