In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import datetime

#Locations of the two raw input files (relative to the notebook's working directory)
application_path = "Data_Original/application_record.csv"
credit_path = "Data_Original/credit_record.csv"

#Load each CSV into its own DataFrame
application, credit = (
    pd.read_csv(csv_path) for csv_path in (application_path, credit_path)
)

#Join the two tables on their shared "ID" column.
#No `how` is given, so pandas' default join type applies.
data_merge = application.merge(credit, on="ID")
#Show the merged table as this cell's output
data_merge

In [None]:
#Removing columns that are not needed for the rest of the clean-up
drop_cols = ["FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL"]
data_merge = data_merge.drop(columns=drop_cols)
data_merge

In [None]:
#For rows whose income type is 'Pensioner', fill a missing OCCUPATION_TYPE with 'Pensioner'.
#Rows with any other income type, and pensioner rows that already have an occupation, are untouched.
is_pensioner = data_merge['NAME_INCOME_TYPE'] == 'Pensioner'
data_merge.loc[is_pensioner, 'OCCUPATION_TYPE'] = (
    data_merge.loc[is_pensioner, 'OCCUPATION_TYPE'].fillna('Pensioner')
)
data_merge

In [None]:
#Replacing each listed column with its absolute value so negative entries become positive
neg_convert = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']
data_merge = data_merge.assign(
    **{column: data_merge[column].abs() for column in neg_convert}
)
data_merge

In [None]:
#Swapping the value 365243 in DAYS_EMPLOYED for 0.
#NOTE(review): 365243 is treated here as an error/placeholder value - confirm against the dataset description.
data_merge["DAYS_EMPLOYED"] = data_merge["DAYS_EMPLOYED"].replace({365243: 0})
data_merge

In [None]:
#Converting DAYS_BIRTH from a count of days into a (fractional) count of years.
#Done as a single column-wide division rather than a row-by-row loop: the result is the
#same, but it avoids a Python-level iteration over every row and avoids writing float
#values one cell at a time into an integer column.
data_merge["DAYS_BIRTH"] = data_merge["DAYS_BIRTH"] / 365
data_merge

In [None]:
#Converting the DAYS_BIRTH column to an integer dtype
data_merge = data_merge.astype({"DAYS_BIRTH": int})
data_merge

In [None]:
#Any OCCUPATION_TYPE value that is still missing becomes "Not Specified"
data_merge = data_merge.fillna({'OCCUPATION_TYPE': "Not Specified"})
data_merge

In [None]:
#Converting DAYS_EMPLOYED from a count of days into a (fractional) count of months,
#dividing by 30.44 days per month.
#Done as a single column-wide division rather than a row-by-row loop: the result is the
#same, but it avoids a Python-level iteration over every row and avoids writing float
#values one cell at a time into an integer column.
data_merge["DAYS_EMPLOYED"] = data_merge["DAYS_EMPLOYED"] / 30.44
data_merge

In [None]:
#Converting the DAYS_EMPLOYED column to an integer dtype
data_merge = data_merge.astype({"DAYS_EMPLOYED": int})
data_merge

In [None]:
#Collapsing the raw STATUS codes into four descriptive labels:
#  "0"-"4" -> 'Minor Over', "5" -> 'Major Over', 'C' -> 'Paid', 'X' -> 'No Loan Taken'
#Values that are not listed in the mapping are left unchanged by replace().
status_labels = {code: 'Minor Over' for code in ("0", "1", "2", "3", "4")}
status_labels.update({"5": 'Major Over',
                      'C': 'Paid',
                      'X': 'No Loan Taken'})
data_merge["STATUS"] = data_merge["STATUS"].replace(status_labels)
data_merge

In [None]:
#Giving the columns human-readable headers (original name -> display name)
#NOTE(review): "Houseing Type" looks like a misspelling of "Housing Type"; it is kept as-is
#because it becomes a column header in the exported file - confirm before changing.
column_labels = {
    "CODE_GENDER": "Gender",
    "FLAG_OWN_CAR": "Owns a Car",
    "FLAG_OWN_REALTY": "Owns Property",
    "CNT_CHILDREN": "Number of Children",
    "AMT_INCOME_TOTAL": "Annual Income",
    "NAME_INCOME_TYPE": "Income Category",
    "NAME_EDUCATION_TYPE": "Education Level",
    "NAME_FAMILY_STATUS": "Marital Status",
    "NAME_HOUSING_TYPE": "Houseing Type",
    "DAYS_BIRTH": "Age",
    "DAYS_EMPLOYED": "Months Employed",
    "FLAG_MOBIL": "Owns a Mobile",
    "OCCUPATION_TYPE": "Occupation",
    "CNT_FAM_MEMBERS": "Family Members",
    "MONTHS_BALANCE": "Months Since Payment",
    "STATUS": "Status",
}
data_merge = data_merge.rename(columns=column_labels)
data_merge

In [None]:
#Writing the cleaned DataFrame to data_clean.csv in the current working directory.
#The DataFrame index is written as well, since `index` is left at its default.
data_merge.to_csv(path_or_buf="data_clean.csv")