In [1]:
import pandas as pd
import numpy as np

In [2]:
#Read the data
credit_data = pd.read_csv('credit_scoring_eng.csv')
credit_data.head()

Unnamed: 0,children,days_employed,dob_years,education,education_id,family_status,family_status_id,gender,income_type,debt,total_income,purpose
0,1,-8437.673028,42,bachelor's degree,0,married,0,F,employee,0,40620.102,purchase of the house
1,1,-4024.803754,36,secondary education,1,married,0,F,employee,0,17932.802,car purchase
2,0,-5623.42261,33,Secondary Education,1,married,0,M,employee,0,23341.752,purchase of the house
3,3,-4124.747207,32,secondary education,1,married,0,M,employee,0,42820.568,supplementary education
4,0,340266.072047,53,secondary education,1,civil partnership,1,F,retiree,0,25378.572,to have a wedding


In [3]:
#Look at the data types and nulls of columns in the DataFrame
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21525 entries, 0 to 21524
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   children          21525 non-null  int64  
 1   days_employed     19351 non-null  float64
 2   dob_years         21525 non-null  int64  
 3   education         21525 non-null  object 
 4   education_id      21525 non-null  int64  
 5   family_status     21525 non-null  object 
 6   family_status_id  21525 non-null  int64  
 7   gender            21525 non-null  object 
 8   income_type       21525 non-null  object 
 9   debt              21525 non-null  int64  
 10  total_income      19351 non-null  float64
 11  purpose           21525 non-null  object 
dtypes: float64(2), int64(5), object(5)
memory usage: 2.0+ MB


In [4]:
#What is the shape of the DataFrame?
credit_data.shape

(21525, 12)

In [5]:
#I'll drop the 'days_employed' column since it will not provide any information in my analysis.
#credit_data.drop(['days_employed'], axis=1, inplace=True)

In [6]:
#Identify and fill in missing values
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21525 entries, 0 to 21524
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   children          21525 non-null  int64  
 1   days_employed     19351 non-null  float64
 2   dob_years         21525 non-null  int64  
 3   education         21525 non-null  object 
 4   education_id      21525 non-null  int64  
 5   family_status     21525 non-null  object 
 6   family_status_id  21525 non-null  int64  
 7   gender            21525 non-null  object 
 8   income_type       21525 non-null  object 
 9   debt              21525 non-null  int64  
 10  total_income      19351 non-null  float64
 11  purpose           21525 non-null  object 
dtypes: float64(2), int64(5), object(5)
memory usage: 2.0+ MB


In [7]:
#count number of nulls in every column
credit_data.isnull().sum()
#total_income has 2174 null values.

children               0
days_employed       2174
dob_years              0
education              0
education_id           0
family_status          0
family_status_id       0
gender                 0
income_type            0
debt                   0
total_income        2174
purpose                0
dtype: int64

In [8]:
#I assume that null values in the total_income and days_employed are not missing by random.*Check notes to explain further
#They are related, meaning no employment.
#I'll fill in the null values with 0 in the total_income and days_employed columns
credit_data.fillna(value=0, axis=1, inplace=True)
credit_data.isnull().sum()

children            0
days_employed       0
dob_years           0
education           0
education_id        0
family_status       0
family_status_id    0
gender              0
income_type         0
debt                0
total_income        0
purpose             0
dtype: int64

In [9]:
#Replace floats with integer data type
print(credit_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21525 entries, 0 to 21524
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   children          21525 non-null  int64  
 1   days_employed     21525 non-null  float64
 2   dob_years         21525 non-null  int64  
 3   education         21525 non-null  object 
 4   education_id      21525 non-null  int64  
 5   family_status     21525 non-null  object 
 6   family_status_id  21525 non-null  int64  
 7   gender            21525 non-null  object 
 8   income_type       21525 non-null  object 
 9   debt              21525 non-null  int64  
 10  total_income      21525 non-null  float64
 11  purpose           21525 non-null  object 
dtypes: float64(2), int64(5), object(5)
memory usage: 2.0+ MB
None


In [10]:
#I'll relace the columns with type 'float64' to type 'int64' using np.int64
#days_employed to int
credit_data['days_employed'] = credit_data['days_employed'].astype(np.int64)

In [11]:
#total_income to int
credit_data['total_income'] = credit_data['total_income'].astype(np.int64)

In [12]:
#Lookin at duplicated rows in the data
print(credit_data.duplicated().sum())

54


In [13]:
#drop duplicate rows
#I use the keyword argument 'ignore_index', so that i dont have to reset the index
credit_data.drop_duplicates(inplace=True, ignore_index=True)

In [14]:
#the shape of my DataFrame has changed because of the duplicate rows that have been removed.
credit_data.shape

(21471, 12)

In [15]:
#the age column has 0 values. This could be due to during collection, the client did not give his/her age.
#I will set the 0 to the median of the age column
credit_data.loc[credit_data['dob_years'] == 0, 'dob_years'] = credit_data['dob_years'].median() 
credit_data['dob_years'].describe()

count    21471.000000
mean        43.476643
std         12.217612
min         19.000000
25%         33.500000
50%         42.000000
75%         53.000000
max         75.000000
Name: dob_years, dtype: float64

In [16]:
#change dob_years to int
credit_data['dob_years'] = credit_data['dob_years'].astype('int64')

In [17]:
#Categorize the data: education, family_status, gender, income_type, purpose
#education
credit_data['education'].value_counts()

secondary education    13705
bachelor's degree       4710
SECONDARY EDUCATION      772
Secondary Education      711
some college             668
BACHELOR'S DEGREE        273
Bachelor's Degree        268
primary education        250
Some College              47
SOME COLLEGE              29
PRIMARY EDUCATION         17
Primary Education         15
graduate degree            4
Graduate Degree            1
GRADUATE DEGREE            1
Name: education, dtype: int64

In [18]:
#there is a lot of repetition in the education column
#convert all the rows into lowercase
credit_data['education'] = credit_data['education'].str.lower()
credit_data['education'].unique()

array(["bachelor's degree", 'secondary education', 'some college',
       'primary education', 'graduate degree'], dtype=object)

In [19]:
credit_data['family_status'].value_counts()

married              12344
civil partnership     4163
unmarried             2810
divorced              1195
widow / widower        959
Name: family_status, dtype: int64

In [20]:
credit_data['gender'].value_counts()

F      14189
M       7281
XNA        1
Name: gender, dtype: int64

In [21]:
#I'll replace the XNA with the top gender in the data
credit_data.loc[credit_data['gender'] == 'XNA', 'gender'] = credit_data.gender.describe().top

In [22]:
credit_data['income_type'].value_counts()

employee                       11091
business                        5080
retiree                         3837
civil servant                   1457
entrepreneur                       2
unemployed                         2
student                            1
paternity / maternity leave        1
Name: income_type, dtype: int64

In [23]:
#I'll take it that the student is unemployed, 
#and the paternity leave is employed(at the time of data collection, he was on leave. But generally he is employed.)
#entrepreneur belongs to business 
#So, I'll change that.
credit_data.loc[credit_data['income_type'] == 'student', 'income_type'] = 'unemployed'
credit_data.loc[credit_data['income_type'] == 'paternity / maternity leave', 'income_type'] = 'employee'
credit_data.loc[credit_data['income_type'] == 'entrepreneur', 'income_type'] = 'business'

In [24]:
#purpose: use stemming to categorize the purpose column
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
words = ['wedding', 'estate', 'housing', 'house', 'property', 'car', 'education', 'university', 'educated']
stem = []
for word in words:
    stem.append(stemmer.stem(word))
print(stem)

['wed', 'estat', 'hous', 'hous', 'properti', 'car', 'educ', 'univers', 'educ']


In [25]:
def purpose_category(purpose):
    stemmed = [stemmer.stem(word) for word in purpose.split(' ')]
    if 'wed' in stemmed:
        return 'wedding'
    if 'estat' in stemmed:
        return 'real estate'
    if 'hous' in stemmed:
        return 'real estate'
    if 'properti' in stemmed:
        return 'real estate'
    if 'car' in stemmed:
        return 'car'
    if 'educ' in stemmed:
        return 'education'
    if 'univers' in stemmed:
        return 'education'
    else:
        return 'unknown'

In [26]:
credit_data['purpose_category'] = credit_data['purpose'].apply(purpose_category)


In [27]:
credit_data.columns

Index(['children', 'days_employed', 'dob_years', 'education', 'education_id',
       'family_status', 'family_status_id', 'gender', 'income_type', 'debt',
       'total_income', 'purpose', 'purpose_category'],
      dtype='object')

In [28]:
credit_data['education'] = credit_data.education.astype('category')
credit_data['family_status'] = credit_data.family_status.astype('category')
credit_data['gender'] = credit_data.gender.astype('category')
credit_data['income_type'] = credit_data.income_type.astype('category')
credit_data['purpose_category'] = credit_data.purpose_category.astype('category')

In [29]:
#clean the children column
credit_data['children'].describe()

count    21471.000000
mean         0.539565
std          1.382978
min         -1.000000
25%          0.000000
50%          0.000000
75%          1.000000
max         20.000000
Name: children, dtype: float64

In [30]:
#I'll ground the -1 children to 0, because it is an error that was made during input
credit_data.loc[credit_data['children'] == -1, 'children'] = 1
credit_data.loc[credit_data['children'] == 20, 'children'] = 2
credit_data['children'].value_counts()

0    14107
1     4856
2     2128
3      330
4       41
5        9
Name: children, dtype: int64

In [31]:
def child_group(children):
    if children == 0:
        return 'None'
    if 1 <= children <= 2:
        return 'Many'
    if 3 <= children <= 4:
        return 'Many More'
    else:
        return 'Most'
print(child_group(2))

Many


In [32]:
credit_data['child_group'] = credit_data['children'].apply(child_group)

In [33]:
credit_data.sample(2)

Unnamed: 0,children,days_employed,dob_years,education,education_id,family_status,family_status_id,gender,income_type,debt,total_income,purpose,purpose_category,child_group
11164,0,-1051,40,secondary education,1,married,0,M,employee,0,16276,housing renovation,real estate,
5286,0,-1636,37,bachelor's degree,0,married,0,M,business,0,147343,purchase of a car,car,


In [34]:
credit_data['debt'].value_counts()

0    19730
1     1741
Name: debt, dtype: int64

In [35]:
def debt_group(debt):
    if debt == 0:
        return 'paid'
    else:
        return 'defaulted'

In [36]:
credit_data['debt_group'] = credit_data['debt'].apply(debt_group)

In [37]:
credit_data['debt_group'] = credit_data['debt_group'].astype('category')
credit_data['child_group'] = credit_data['child_group'].astype('category')

In [38]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21471 entries, 0 to 21470
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   children          21471 non-null  int64   
 1   days_employed     21471 non-null  int64   
 2   dob_years         21471 non-null  int64   
 3   education         21471 non-null  category
 4   education_id      21471 non-null  int64   
 5   family_status     21471 non-null  category
 6   family_status_id  21471 non-null  int64   
 7   gender            21471 non-null  category
 8   income_type       21471 non-null  category
 9   debt              21471 non-null  int64   
 10  total_income      21471 non-null  int64   
 11  purpose           21471 non-null  object  
 12  purpose_category  21471 non-null  category
 13  child_group       21471 non-null  category
 14  debt_group        21471 non-null  category
dtypes: category(7), int64(7), object(1)
memory usage: 1.5+ MB


In [39]:
#create dictionaries
#which dictionaries you've selected for this data set and why.
#maybe create a dictionary and use pivot tables

In [40]:
#Task 3
#Answer the questions.

In [85]:
#Is there a connection between having kids and repaying a loan on time?
#credit_by_children = credit_data.groupby('child_group')['debt_group']
#there's no correlation between having kids and paying loans on time, 
#because the most paid loans are for clients without children.
#print(credit_by_children.count())
#print(credit_by_children.value_counts())
children = credit_data.pivot_table(index=['child_group'], values='debt', columns='debt_group', aggfunc=['count'])

#type(children)
children.columns = ['defaulted', 'paid']
children.drop('defaulted', axis=1, inplace=True)
#children.columns
children['total'] = credit_data.groupby('child_group')['debt_group'].count()
children['%paid'] = (children['paid'] / children['total']) * 100
print(children.sort_values(by='%paid', ascending=False))

                paid  total       %paid
child_group                            
Most             9.0      9  100.000000
None         13044.0  14107   92.464734
Many More      340.0    371   91.644205
Many          6337.0   6984   90.735968


In [86]:
#Is there a connection between marital status and repaying a loan on time?
credit_by_family = credit_data.groupby('family_status')['debt_group'].count()
#yes, there is. Married people pay their loans more than any other group
#print(credit_by_family.count())
#print(credit_by_family.value_counts())
family = credit_data.pivot_table(index='family_status', values='debt', columns='debt_group', aggfunc='count')
family.columns = ['defaulted', 'paid']
family.drop('defaulted', axis=1, inplace=True)
family['total'] = credit_data.groupby('family_status')['debt_group'].count()
family['%paid'] = (family['paid'] / family['total']) * 100
print(family.sort_values(by='%paid', ascending=False))

                    paid  total      %paid
family_status                             
widow / widower      896    959  93.430657
divorced            1110   1195  92.887029
married            11413  12344  92.457874
civil partnership   3775   4163  90.679798
unmarried           2536   2810  90.249110


In [90]:
#Is there a connection between income level and repaying a loan on time?
credit_by_income = credit_data.groupby('income_type')['debt_group']
#print(credit_by_income.count())
#print(credit_by_income.value_counts())
income = credit_data.pivot_table(index='income_type', values='debt', columns='debt_group', aggfunc='count')
income.columns = ['defaulted', 'paid']
income.drop('defaulted', axis=1, inplace=True)
income['total'] = credit_data.groupby('income_type')['debt_group'].count()
income['%paid'] = (income['paid'] / income['total']) * 100
print(income.sort_values(by='%paid', ascending=False))

                paid  total      %paid
income_type                           
retiree         3621   3837  94.370602
civil servant   1371   1457  94.097461
business        4706   5082  92.601338
employee       10030  11092  90.425532
unemployed         2      3  66.666667


In [47]:
#employees pay their loans on time, followed by business people

In [94]:
#How do different loan purposes affect timely loan repayment?
credit_by_purpose = credit_data.groupby('purpose_category')['debt_group']
#print(credit_by_purpose.count())
#print(credit_by_purpose.value_counts())
purposes = credit_data.pivot_table(index='purpose_category', values='debt', columns='debt_group', aggfunc='count')
purposes.columns = ['defaulted', 'paid']
purposes.drop('defaulted', axis=1, inplace=True)
purposes['total'] = credit_data.groupby('purpose_category')['debt_group'].count()
purposes['%paid'] = (purposes['paid'] / purposes['total']) * 100
print(purposes.sort_values(by='%paid', ascending=False))

                   paid  total      %paid
purpose_category                         
real estate       10032  10814  92.768633
wedding            2149   2335  92.034261
education          3644   4014  90.782262
car                3905   4308  90.645311


In [51]:
purpose_dict = credit_data[['purpose_category', 'debt_group']]
