In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math as m


def _read_semicolon_csv(path, **read_options):
    """Load one ';'-delimited CSV file into a DataFrame.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(path, sep=';', **read_options)


# Raw tables: one DataFrame per source file under ../data.
account_df = _read_semicolon_csv('../data/account.csv')
card_df = _read_semicolon_csv('../data/card_dev.csv')
client_df = _read_semicolon_csv('../data/client.csv')
disp_df = _read_semicolon_csv('../data/disp.csv')
district_df = _read_semicolon_csv('../data/district.csv')
loan_df = _read_semicolon_csv('../data/loan_dev.csv')
# low_memory=False: parse the file in one pass instead of chunk by chunk,
# which avoids per-chunk (mixed) dtype inference.
trans_df = _read_semicolon_csv("../data/trans_dev.csv", low_memory=False)

# Process Client

In [2]:
# Derive a binary `sex` flag from characters 2-3 of `birth_number`:
# 0 when that two-digit field is below 50, 1 otherwise.
# NOTE(review): presumably this field is the month, offset by 50 for one sex
# in the source data -- confirm against the dataset documentation.
birth_middle_field = client_df['birth_number'].astype(str).str[2:4].astype(int)
client_df['sex'] = birth_middle_field.ge(50).astype('int64')

# Process Account

In [3]:
# Label-encode `frequency`: every distinct value is replaced by its position
# in order of first appearance. `month_frequency` keeps the original labels,
# so code i decodes back to month_frequency[i].
month_frequency = list(account_df['frequency'].unique())
frequency_code_by_label = {label: code for code, label in enumerate(month_frequency)}
account_df['frequency'] = account_df['frequency'].apply(
    lambda label: frequency_code_by_label[label]
)

# Process Disp

In [4]:
# Label-encode the disposition `type`: each distinct value becomes its
# position in order of first appearance; `type_frequency` holds the labels.
type_frequency = list(disp_df['type'].unique())
disp_type_code_by_label = {label: code for code, label in enumerate(type_frequency)}
disp_df['type'] = disp_df['type'].apply(
    lambda label: disp_type_code_by_label[label]
)

# Process Card


In [5]:
# Label-encode the card `type`: each distinct value becomes its position in
# order of first appearance.
# NOTE: this rebinds `type_frequency`, which the disposition cell also
# assigns; after this cell it holds the card labels only.
type_frequency = list(card_df['type'].unique())
card_type_code_by_label = {label: code for code, label in enumerate(type_frequency)}
card_df['type'] = card_df['type'].apply(
    lambda label: card_type_code_by_label[label]
)

# Date Format

In [6]:
# Parse the YYMMDD-encoded `date` column into datetimes. With
# errors='coerce', values that do not match the format become NaT instead of
# raising.
parsed_account_dates = pd.to_datetime(
    account_df['date'],
    format='%y%m%d',
    errors='coerce',
)
account_df['date'] = parsed_account_dates
account_df.head()

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,0,1993-01-01
1,3818,74,0,1993-01-01
2,704,55,0,1993-01-01
3,2378,16,0,1993-01-01
4,2632,24,0,1993-01-02


In [7]:
# Parse the YYMMDD-encoded `issued` column into a new datetime `date` column;
# `issued` itself is left as loaded. With errors='coerce', values that do not
# match the format become NaT instead of raising.
parsed_issue_dates = pd.to_datetime(
    card_df['issued'],
    format='%y%m%d',
    errors='coerce',
)
card_df['date'] = parsed_issue_dates
card_df.head()

Unnamed: 0,card_id,disp_id,type,issued,date
0,1005,9285,0,931107,1993-11-07
1,104,588,0,940119,1994-01-19
2,747,4915,0,940205,1994-02-05
3,70,439,0,940208,1994-02-08
4,577,3687,0,940215,1994-02-15


# Clean column names

In [8]:
# Raw header -> cleaned header. Most entries only drop the trailing space;
# the '95/'96 columns also lose the apostrophe.
district_column_renames = {
    'code ': 'code',
    'name ': 'name',
    # NOTE(review): the target drops the dot after "no", unlike the other
    # "no. of ..." columns. Kept as-is because other cells may index by it.
    'no. of municipalities with inhabitants < 499 ': 'no of municipalities with inhabitants < 499',
    'no. of municipalities with inhabitants 2000-9999 ': 'no. of municipalities with inhabitants 2000-9999',
    'no. of municipalities with inhabitants >10000 ': 'no. of municipalities with inhabitants >10000',
    'no. of cities ': 'no. of cities',
    'ratio of urban inhabitants ': 'ratio of urban inhabitants',
    'average salary ': 'average salary',
    "unemploymant rate '95 ": "unemploymant rate 95",
    # NOTE(review): this target still ends with a space, unlike every other
    # cleaned name. Kept as-is because other cells may index by it.
    "unemploymant rate '96 ": "unemploymant rate 96 ",
    'no. of enterpreneurs per 1000 inhabitants ': 'no. of enterpreneurs per 1000 inhabitants',
    "no. of commited crimes '95 ": "no. of commited crimes 95",
    "no. of commited crimes '96 ": "no. of commited crimes 96",
}

district_df.rename(columns=district_column_renames, inplace=True)

district_df.head()

Unnamed: 0,code,name,region,no. of inhabitants,no of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate 95,unemploymant rate 96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes 95,no. of commited crimes 96
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040


# Clean Null values

In [9]:
# Display the dtype pandas inferred for each district column; columns shown
# as `object` were not parsed as numbers.
district_df.dtypes

code                                                  int64
name                                                 object
region                                               object
no. of inhabitants                                    int64
no of municipalities with inhabitants < 499           int64
no. of municipalities with inhabitants 500-1999       int64
no. of municipalities with inhabitants 2000-9999      int64
no. of municipalities with inhabitants >10000         int64
no. of cities                                         int64
ratio of urban inhabitants                          float64
average salary                                        int64
unemploymant rate 95                                 object
unemploymant rate 96                                float64
no. of enterpreneurs per 1000 inhabitants             int64
no. of commited crimes 95                            object
no. of commited crimes 96                             int64
dtype: object

In [21]:
# 'no. of commited crimes 95' marks missing values with the string '?', so
# pandas loads the whole column as object (strings). Blank out the
# placeholder and parse the rest as numbers; any other non-numeric value
# still raises, exactly as the strict pd.to_numeric call did before.
raw_crimes_95 = district_df['no. of commited crimes 95']
condition = raw_crimes_95 == '?'
crimes_95 = pd.to_numeric(raw_crimes_95.mask(condition))

# Series.mean() skips NaN, so this is the mean of the known values only,
# truncated to an int as before.
meanNoCommitedCrimes95 = int(crimes_95.mean())

# Impute the missing entries and store the column as int64. Previously only
# the '?' cells were overwritten, which left the remaining values as strings
# and the column dtype as object.
district_df['no. of commited crimes 95'] = (
    crimes_95.fillna(meanNoCommitedCrimes95).astype('int64')
)

# Convert the unemployment rate to float; entries that cannot be parsed
# become NaN (errors='coerce') and are NOT imputed here.
district_df['unemploymant rate 95'] = pd.to_numeric(
    district_df['unemploymant rate 95'], errors='coerce'
)
district_df.dtypes

    code             name         region  no. of inhabitants  \
67    68  Frydek - Mistek  north Moravia              228848   
68    69          Jesenik  north Moravia               42821   
69    70          Karvina  north Moravia              285387   
70    71       Novy Jicin  north Moravia              161227   
71    72          Olomouc  north Moravia              226122   
72    73            Opava  north Moravia              182027   
73    74  Ostrava - mesto  north Moravia              323870   
74    75           Prerov  north Moravia              138032   
75    76          Sumperk  north Moravia              127369   
76    77           Vsetin  north Moravia              148545   

    no of municipalities with inhabitants < 499  \
67                                           15   
68                                            4   
69                                            0   
70                                            5   
71                                      

code                                                  int64
name                                                 object
region                                               object
no. of inhabitants                                    int64
no of municipalities with inhabitants < 499           int64
no. of municipalities with inhabitants 500-1999       int64
no. of municipalities with inhabitants 2000-9999      int64
no. of municipalities with inhabitants >10000         int64
no. of cities                                         int64
ratio of urban inhabitants                          float64
average salary                                        int64
unemploymant rate 95                                float64
unemploymant rate 96                                float64
no. of enterpreneurs per 1000 inhabitants             int64
no. of commited crimes 95                             int64
no. of commited crimes 96                             int64
dtype: object

###### 