# German Data Pre-processing - Solution

## 1. Import Pandas

In [18]:
import pandas as pd

# 2. Import Data

In [128]:
dataframe = pd.read_csv("data/german.data.csv")

# 3. Explore Data

In [129]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 21 columns):
A11     999 non-null object
6       999 non-null int64
A34     999 non-null object
A43     999 non-null object
1169    999 non-null int64
A65     999 non-null object
A75     999 non-null object
4       999 non-null int64
A93     999 non-null object
A101    999 non-null object
4.1     999 non-null int64
A121    999 non-null object
67      999 non-null int64
A143    999 non-null object
A152    999 non-null object
2       999 non-null int64
A173    999 non-null object
1       999 non-null int64
A192    999 non-null object
A201    999 non-null object
1.1     999 non-null int64
dtypes: int64(8), object(13)
memory usage: 164.0+ KB


# 4. Change Column Names

In [130]:
dataframe.columns = ["status_checking", "loan_duration", "credit_history", "purpose", "credit_amount", "savings_amt", "emp_years", "percent_disp_income", "status_sex", "other_debts", "residence_since", "property", "age", "other_installments", "housing_status", "num_credits", "job_type", "num_dependant", "own_telephone", "foreign_worker", "credit_risk"]

In [131]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 21 columns):
status_checking        999 non-null object
loan_duration          999 non-null int64
credit_history         999 non-null object
purpose                999 non-null object
credit_amount          999 non-null int64
savings_amt            999 non-null object
emp_years              999 non-null object
percent_disp_income    999 non-null int64
status_sex             999 non-null object
other_debts            999 non-null object
residence_since        999 non-null int64
property               999 non-null object
age                    999 non-null int64
other_installments     999 non-null object
housing_status         999 non-null object
num_credits            999 non-null int64
job_type               999 non-null object
num_dependant          999 non-null int64
own_telephone          999 non-null object
foreign_worker         999 non-null object
credit_risk            999 non-null int64


# 5. Change Column Types to Categorical

In [16]:
for c in dataframe.columns:
    if dataframe[c].dtype == "O":
        dataframe[c] = dataframe[c].astype("category")

In [115]:
dataframe.credit_risk = dataframe.credit_risk.astype("category")

In [116]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 21 columns):
status_checking        999 non-null category
loan_duration          999 non-null int64
credit_history         999 non-null category
purpose                999 non-null category
credit_amount          999 non-null int64
savings_amt            999 non-null category
emp_years              999 non-null category
percent_disp_income    999 non-null int64
status_sex             999 non-null category
other_debts            999 non-null category
residence_since        999 non-null int64
property               999 non-null category
age                    999 non-null int64
other_installments     999 non-null category
housing_status         999 non-null category
num_credits            999 non-null int64
job_type               999 non-null category
num_dependant          999 non-null int64
own_telephone          999 non-null category
foreign_worker         999 non-null category
credit_risk     

# 6. Split Data to training and testing set

In [117]:
train=dataframe.sample(frac=0.7,random_state=200) #random state is a seed value
test=dataframe.drop(train.index)

In [118]:
train.shape

(699, 21)

In [119]:
test.shape

(300, 21)

In [120]:
df = train[train.credit_risk== 1]
dfb = train[train.credit_risk== 2]

In [123]:
df.shape

(479, 21)

In [124]:
dfb.shape

(220, 21)

# 7. Duplicate rows with bad credit score 5 times

We want the model to be trained to be more sensitive to bad credit

In [125]:
for i in range(5):
    df = pd.concat([df, dfb], ignore_index=True)

In [126]:
df.shape

(1579, 21)

In [127]:
df

Unnamed: 0,status_checking,loan_duration,credit_history,purpose,credit_amount,savings_amt,emp_years,percent_disp_income,status_sex,other_debts,...,property,age,other_installments,housing_status,num_credits,job_type,num_dependant,own_telephone,foreign_worker,credit_risk
0,A14,30,A34,A43,4530,A61,A74,4,A92,A101,...,A123,26,A143,A151,1,A174,1,A192,A201,1
1,A14,4,A34,A43,1544,A61,A74,2,A93,A101,...,A121,42,A143,A152,3,A172,2,A191,A201,1
2,A11,24,A31,A40,2325,A62,A74,2,A93,A101,...,A123,32,A141,A152,1,A173,1,A191,A201,1
3,A11,18,A32,A43,2389,A61,A72,4,A92,A101,...,A123,27,A142,A152,1,A173,1,A191,A201,1
4,A12,18,A32,A43,1113,A61,A73,4,A92,A103,...,A121,26,A143,A152,1,A172,2,A191,A201,1
5,A13,9,A32,A43,1126,A62,A75,2,A91,A101,...,A121,49,A143,A152,1,A173,1,A191,A201,1
6,A14,30,A34,A43,5954,A61,A74,3,A93,A102,...,A123,38,A143,A152,1,A173,1,A191,A201,1
7,A12,24,A32,A43,3758,A63,A71,1,A92,A101,...,A124,23,A143,A151,1,A171,1,A191,A201,1
8,A14,12,A32,A43,707,A61,A73,4,A93,A101,...,A121,30,A141,A152,2,A173,1,A191,A201,1
9,A11,24,A32,A49,6568,A61,A73,2,A94,A101,...,A123,21,A142,A152,1,A172,1,A191,A201,1
