In [147]:
import pandas as pd
import numpy as np

In [148]:
#Read the data
credit_data = pd.read_csv('credit_scoring_eng.csv')
credit_data.head()

Unnamed: 0,children,days_employed,dob_years,education,education_id,family_status,family_status_id,gender,income_type,debt,total_income,purpose
0,1,-8437.673028,42,bachelor's degree,0,married,0,F,employee,0,40620.102,purchase of the house
1,1,-4024.803754,36,secondary education,1,married,0,F,employee,0,17932.802,car purchase
2,0,-5623.42261,33,Secondary Education,1,married,0,M,employee,0,23341.752,purchase of the house
3,3,-4124.747207,32,secondary education,1,married,0,M,employee,0,42820.568,supplementary education
4,0,340266.072047,53,secondary education,1,civil partnership,1,F,retiree,0,25378.572,to have a wedding


In [149]:
#Look at the data types and nulls of columns in the DataFrame
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21525 entries, 0 to 21524
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   children          21525 non-null  int64  
 1   days_employed     19351 non-null  float64
 2   dob_years         21525 non-null  int64  
 3   education         21525 non-null  object 
 4   education_id      21525 non-null  int64  
 5   family_status     21525 non-null  object 
 6   family_status_id  21525 non-null  int64  
 7   gender            21525 non-null  object 
 8   income_type       21525 non-null  object 
 9   debt              21525 non-null  int64  
 10  total_income      19351 non-null  float64
 11  purpose           21525 non-null  object 
dtypes: float64(2), int64(5), object(5)
memory usage: 2.0+ MB


In [150]:
#What is the shape of the DataFrame?
credit_data.shape

(21525, 12)

In [151]:
#I'll drop the 'days_employed' column since it will not provide any information in my analysis.
#credit_data.drop(['days_employed'], axis=1, inplace=True)

In [152]:
#Identify and fill in missing values
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21525 entries, 0 to 21524
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   children          21525 non-null  int64  
 1   days_employed     19351 non-null  float64
 2   dob_years         21525 non-null  int64  
 3   education         21525 non-null  object 
 4   education_id      21525 non-null  int64  
 5   family_status     21525 non-null  object 
 6   family_status_id  21525 non-null  int64  
 7   gender            21525 non-null  object 
 8   income_type       21525 non-null  object 
 9   debt              21525 non-null  int64  
 10  total_income      19351 non-null  float64
 11  purpose           21525 non-null  object 
dtypes: float64(2), int64(5), object(5)
memory usage: 2.0+ MB


In [153]:
#count number of nulls in every column
credit_data.isnull().sum()
#total_income has 2174 null values.

children               0
days_employed       2174
dob_years              0
education              0
education_id           0
family_status          0
family_status_id       0
gender                 0
income_type            0
debt                   0
total_income        2174
purpose                0
dtype: int64

In [154]:
#I assume that null values in the total_income and days_employed are not missing by random.*Check notes to explain further
#They are related, meaning no employment.
#I'll fill in the null values with 0 in the total_income and days_employed columns
credit_data.fillna(value=0, axis=1, inplace=True)
credit_data.isnull().sum()

children            0
days_employed       0
dob_years           0
education           0
education_id        0
family_status       0
family_status_id    0
gender              0
income_type         0
debt                0
total_income        0
purpose             0
dtype: int64

In [155]:
#Replace floats with integer data type
print(credit_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21525 entries, 0 to 21524
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   children          21525 non-null  int64  
 1   days_employed     21525 non-null  float64
 2   dob_years         21525 non-null  int64  
 3   education         21525 non-null  object 
 4   education_id      21525 non-null  int64  
 5   family_status     21525 non-null  object 
 6   family_status_id  21525 non-null  int64  
 7   gender            21525 non-null  object 
 8   income_type       21525 non-null  object 
 9   debt              21525 non-null  int64  
 10  total_income      21525 non-null  float64
 11  purpose           21525 non-null  object 
dtypes: float64(2), int64(5), object(5)
memory usage: 2.0+ MB
None


In [156]:
#I'll relace the columns with type 'float64' to type 'int64' using np.int64
#days_employed to int
credit_data['days_employed'] = credit_data['days_employed'].astype(np.int64)

In [157]:
#total_income to int
credit_data['total_income'] = credit_data['total_income'].astype(np.int64)

In [158]:
#Lookin at duplicated rows in the data
print(credit_data.duplicated().sum())

54


In [159]:
#drop duplicate rows
#I use the keyword argument 'ignore_index', so that i dont have to reset the index
credit_data.drop_duplicates(inplace=True, ignore_index=True)

In [160]:
#the shape of my DataFrame has changed because of the duplicate rows that have been removed.
credit_data.shape

(21471, 12)

In [161]:
#Categorize the data: education, family_status, gender, income_type, purpose
#education
credit_data['education'].value_counts()

secondary education    13705
bachelor's degree       4710
SECONDARY EDUCATION      772
Secondary Education      711
some college             668
BACHELOR'S DEGREE        273
Bachelor's Degree        268
primary education        250
Some College              47
SOME COLLEGE              29
PRIMARY EDUCATION         17
Primary Education         15
graduate degree            4
GRADUATE DEGREE            1
Graduate Degree            1
Name: education, dtype: int64

In [162]:
#there is a lot of repetition in the education column
#convert all the rows into lowercase
credit_data['education'] = credit_data['education'].str.lower()
credit_data['education'].unique()

array(["bachelor's degree", 'secondary education', 'some college',
       'primary education', 'graduate degree'], dtype=object)

In [163]:
credit_data['family_status'].value_counts()

married              12344
civil partnership     4163
unmarried             2810
divorced              1195
widow / widower        959
Name: family_status, dtype: int64

In [164]:
credit_data['gender'].value_counts()

F      14189
M       7281
XNA        1
Name: gender, dtype: int64

In [165]:
#I'll replace the XNA with the top gender in the data
credit_data.loc[credit_data['gender'] == 'XNA', 'gender'] = credit_data.gender.describe().top

In [166]:
credit_data['income_type'].value_counts()

employee                       11091
business                        5080
retiree                         3837
civil servant                   1457
unemployed                         2
entrepreneur                       2
student                            1
paternity / maternity leave        1
Name: income_type, dtype: int64

In [167]:
#I'll take it that the student is unemployed, 
#and the paternity leave is employed(at the time of data collection, he was on leave. But generally he is employed.)
#entrepreneur belongs to business 
#So, I'll change that.
credit_data.loc[credit_data['income_type'] == 'student', 'income_type'] = 'unemployed'
credit_data.loc[credit_data['income_type'] == 'paternity / maternity leave', 'income_type'] = 'employee'
credit_data.loc[credit_data['income_type'] == 'entrepreneur', 'income_type'] = 'business'

In [168]:
#purpose: use stemming to categorize the purpose column
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
words = ['wedding', 'estate', 'housing', 'house', 'property', 'car', 'education', 'university', 'educated']
stem = []
for word in words:
    stem.append(stemmer.stem(word))
print(stem)

['wed', 'estat', 'hous', 'hous', 'properti', 'car', 'educ', 'univers', 'educ']


In [169]:
def purpose_category(purpose):
    stemmed = [stemmer.stem(word) for word in purpose.split(' ')]
    if 'wed' in stemmed:
        return 'wedding'
    if 'estat' in stemmed:
        return 'real estate'
    if 'hous' in stemmed:
        return 'real estate'
    if 'properti' in stemmed:
        return 'real estate'
    if 'car' in stemmed:
        return 'car'
    if 'educ' in stemmed:
        return 'education'
    if 'univers' in stemmed:
        return 'education'
    else:
        return 'unknown'

In [170]:
credit_data['purpose_category'] = credit_data['purpose'].apply(purpose_category)


In [171]:
credit_data.columns

Index(['children', 'days_employed', 'dob_years', 'education', 'education_id',
       'family_status', 'family_status_id', 'gender', 'income_type', 'debt',
       'total_income', 'purpose', 'purpose_category'],
      dtype='object')

In [179]:
credit_data['education'] = credit_data.education.astype('category')
credit_data['family_status'] = credit_data.family_status.astype('category')
credit_data['gender'] = credit_data.gender.astype('category')
credit_data['income_type'] = credit_data.income_type.astype('category')
credit_data['purpose_category'] = credit_data.purpose_category.astype('category')

In [186]:
#clean the children column

In [None]:
#create dictionaries
#for number of children and age
#which dictionaries you've selected for this data set and why.

In [None]:
#Task 3
#Answer the questions.