In [152]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
import numpy as np

In [2]:
# Read data/german_data_clean.csv into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="data/german_data_clean.csv")

In [3]:
# Preview the first five rows to sanity-check what was loaded.
df.head(n=5)

Unnamed: 0,checking_account,duration_months,credit_history,purpose,credit_amount,savings,present_employment_since,installment_rate,personal_status_sex,other_deptors,...,property,age_years,other_installment_plans,housing,existing_credits,job,people_liable_maintenance,telephone,foreign_worker,response
0,... < 0 euro,6,critical account/other credits existing (not a...,radio/television,1169,unknown/ no savings account,.. >= 7 years,4,male : single,none,...,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes,0
1,0 <= ... < 102 euro,48,existing credits paid back duly till now,radio/television,5951,... < 51 euro,1 <= ... < 4 years,2,female : divorced/separated/married,none,...,real estate,22,none,own,1,skilled employee / official,1,none,yes,1
2,No checking account,12,critical account/other credits existing (not a...,education,2096,... < 51 euro,4 <= ... < 7 years,2,male : single,none,...,real estate,49,none,own,1,unskilled - resident,2,none,yes,0
3,... < 0 euro,42,existing credits paid back duly till now,furniture/equipment,7882,... < 51 euro,4 <= ... < 7 years,2,male : single,guarantor,...,(if not real estate) building society savings ...,45,none,for free,1,skilled employee / official,2,none,yes,0
4,... < 0 euro,24,delay in paying off in the past,car (new),4870,... < 51 euro,1 <= ... < 4 years,3,male : single,none,...,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes,1


In [182]:
# Column names used by the regression: the feature columns (X) and the
# single target column (y).
x_columns = [
    'age_years',
    'duration_months',
    'installment_rate',
    'people_liable_maintenance',
    'existing_credits',
]
y_column = 'credit_amount'

In [183]:
# Pull the modelling inputs out of the DataFrame:
#   y - the target column, extracted via `.values`
#   X - the feature columns, kept as a DataFrame
y = df[y_column].values
X = df.loc[:, x_columns]

In [184]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=999,
)

In [185]:
# Inspect the held-out feature rows.
# NOTE(review): this displays the whole test frame (pandas truncates the repr);
# X_test.head() would be enough for a quick look.
X_test

Unnamed: 0,age_years,duration_months,installment_rate,people_liable_maintenance,existing_credits
842,23,18,4,1,1
68,37,36,4,1,1
308,24,8,3,1,1
881,48,24,2,1,1
350,23,9,1,1,1
...,...,...,...,...,...
838,63,24,4,1,2
301,42,36,4,1,1
507,34,15,1,2,1
264,32,10,3,2,2


In [186]:
# Training/fitting: create the linear-regression estimator and fit it on the
# training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [187]:
# Run the fitted model on the held-out test features and keep the result
# for the comparison steps that follow.
predictions = model.predict(X=X_test)

In [188]:
# How many predictions were produced (length along the first axis).
predictions.shape[0]

300

In [189]:
# Start from an empty DataFrame; prediction and actual columns are added to it next.
df_preds = pd.DataFrame(data=None)

In [190]:
# Build the comparison frame in one step: model predictions next to the actual
# target values from the test split. Constructing it from scratch (instead of
# adding columns to a frame created in another cell) keeps this cell idempotent:
# re-running it after a new split cannot hit a length mismatch, and no stale
# 'difference' column from an earlier run is left behind.
df_preds = pd.DataFrame({'prediction': predictions, 'actual': y_test})

In [191]:
# Preview the first five prediction/actual pairs.
df_preds.head(n=5)

Unnamed: 0,prediction,actual
0,1800.011975,1943
1,4651.485614,1819
2,1157.785103,1237
3,4676.64243,9277
4,2905.719558,1236


In [192]:
# Custom comparison: signed error per row (prediction minus actual).
# Positive values mean the model over-predicted, negative values under-predicted.
df_preds['difference'] = df_preds['prediction'].sub(df_preds['actual'])
df_preds.head(n=5)

Unnamed: 0,prediction,actual,difference
0,1800.011975,1943,-142.988025
1,4651.485614,1819,2832.485614
2,1157.785103,1237,-79.214897
3,4676.64243,9277,-4600.35757
4,2905.719558,1236,1669.719558


In [193]:
# Model quality, metric 1 - mean absolute error (MAE): the average absolute
# gap between actual and predicted values.
model_mae = mean_absolute_error(
    y_true=df_preds['actual'],
    y_pred=df_preds['prediction'],
)
model_mae

1404.975668568643

In [194]:
# Mean absolute percentage error (MAPE). scikit-learn returns it as a
# fraction rather than a percentage (e.g. 0.59 corresponds to roughly 59%).
model_mape = mean_absolute_percentage_error(
    y_true=df_preds['actual'],
    y_pred=df_preds['prediction'],
)
model_mape

0.5865864776736353

In [195]:
# Mean squared error (MSE): the average of the squared prediction errors.
model_mse = mean_squared_error(
    y_true=df_preds['actual'],
    y_pred=df_preds['prediction'],
)
model_mse

4508432.082060043

In [196]:
# Root mean squared error (RMSE): the square root of the MSE, which puts the
# error back in the same units as the target.
model_rmse = pow(model_mse, 0.5)
model_rmse

2123.306874208258

In [197]:
# Mean of the actual target values in the test split - a reference scale for
# reading the MAE and RMSE computed above.
df_preds.loc[:, 'actual'].mean()

3321.8