## 0 Import Libraries

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn import metrics as mt
import statsmodels.api as sm
import statsmodels.formula.api as smf

### 0.1 Functions

In [3]:
def train(x_train, y_train):
    """Fit a scikit-learn ``LinearRegression`` and return in-sample predictions.

    A new estimator is created on every call, fitted on ``(x_train, y_train)``
    and then asked to predict the very same ``x_train``. The returned values
    are therefore training-set predictions, not held-out ones. The fitted
    estimator itself is discarded.
    """
    # ``fit`` returns the estimator, so fitting and predicting can be chained.
    regressor = LinearRegression().fit(x_train, y_train)
    return regressor.predict(x_train)

def simulation_outliers(outliers, features=None, target=None, factor=20):
    """Measure how the in-sample fit degrades as more of the target is inflated.

    For each cumulative fraction in ``outliers``, the rows between the previous
    cut-off and the new one are multiplied by ``factor`` (contamination from
    earlier steps is kept), a linear model is refitted through ``train`` and
    R2 / MSE / RMSE are computed against the *contaminated* target.

    Parameters
    ----------
    outliers : iterable of float
        Cumulative fractions of rows, counted from the top of the frame, to
        contaminate. Expected in ascending order, e.g. ``[0, 0.05, 0.1]``;
        a fraction of 0 selects no rows and yields the uncontaminated baseline.
    features : pandas.DataFrame, optional
        Predictor matrix. Defaults to the notebook-level ``x_train``.
    target : pandas.DataFrame, optional
        Target values as a DataFrame (rows are selected with two-axis
        ``.iloc``). Defaults to the notebook-level ``y_train``. The input is
        copied and never modified.
    factor : float, default 20
        Multiplier applied to the rows being contaminated.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``outliers`` with the columns ``Outliers``,
        ``R2``, ``MSE`` and ``RMSE``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if features is None:
        features = x_train
    if target is None:
        target = y_train

    # Materialise once: the fractions are iterated and also stored in the result.
    outliers = list(outliers)

    y_sim = target.copy()
    n_rows = len(y_sim)
    start = 0
    r2_values = []
    mse_values = []
    rmse_values = []

    for fraction in outliers:
        stop = int(n_rows * fraction)
        # Only the newly added slice is scaled; rows scaled in earlier steps stay scaled.
        y_sim.iloc[start:stop, :] = y_sim.iloc[start:stop, :] * factor

        pred = train(features, y_sim)
        mse = mt.mean_squared_error(y_sim, pred)  # computed once, reused for RMSE

        r2_values.append(mt.r2_score(y_sim, pred))
        mse_values.append(mse)
        rmse_values.append(mse ** 0.5)

        start = stop

    return pd.DataFrame({'Outliers': outliers,
                         'R2'   : r2_values,
                         'MSE'  : mse_values,
                         'RMSE' : rmse_values})

## 1.0 LOAD DATASET

In [4]:
# Relative path: resolved against the directory the notebook kernel runs in.
dataset_path = 'train.csv'
# Raw client table; the cells below select columns from `df` by name.
df = pd.read_csv(dataset_path)

In [5]:
# Show the first rows to sanity-check that the CSV loaded as expected.
df.head()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional
0,1767,21,278.172008,2577.05,24196.89636,104.306544,31.038763,6,5,7,21,14,9,15,Não,Não,Negar
1,11920,40,268.874152,2465.39,19227.37796,69.858778,36.917093,5,8,5,40,23,10,18,Não,Não,Negar
2,8910,36,446.643127,1055.29,42822.28223,134.201478,34.561714,0,3,6,26,13,3,15,Sim,Não,Negar
3,4964,58,321.141267,703.05,51786.826,297.350067,31.493561,0,3,7,12,7,2,1,Sim,Não,Negar
4,10100,35,428.716114,891.29,44626.85346,134.201478,28.028887,2,8,7,24,10,8,20,Sim,Não,Negar


## 2.0 DATA PREPARATION

In [6]:
# List the available column names before choosing features and label.
df.columns

Index(['id_cliente', 'idade', 'saldo_atual', 'divida_atual', 'renda_anual',
       'valor_em_investimentos', 'taxa_utilizacao_credito', 'num_emprestimos',
       'num_contas_bancarias', 'num_cartoes_credito', 'dias_atraso_dt_venc',
       'num_pgtos_atrasados', 'num_consultas_credito', 'taxa_juros',
       'investe_exterior', 'pessoa_polit_exp', 'limite_adicional'],
      dtype='object')

In [7]:
# Wider candidate set that was tried and is currently disabled (kept for reference):
#   idade, divida_atual, renda_anual, valor_em_investimentos,
#   taxa_utilizacao_credito, num_emprestimos, num_contas_bancarias,
#   num_cartoes_credito, dias_atraso_dt_venc, num_pgtos_atrasados,
#   num_consultas_credito, taxa_juros
# Predictors actually used by the models below.
features = [
    'idade',
    'divida_atual',
    'renda_anual',
    'num_pgtos_atrasados',
    'num_consultas_credito',
]
# Kept as a one-element list so that selecting it yields a one-column DataFrame.
label = ['saldo_atual']

In [8]:
# Predictor matrix: every row, only the selected feature columns.
x_train = df[features]
# Target as a one-column DataFrame (the selector is a list, not a single name).
y_train = df[label]

## 3.0 MODEL TRAINING

### 3.1 MODEL TRAINING - scikit-learn

In [9]:
#model definition
lr_model = LinearRegression()
#model fit
lr_model.fit(x_train, y_train)
#predict
y_pred = lr_model.predict(x_train)

In [10]:
# Client id and observed balance, with the model's prediction attached as a new column.
df1 = df[['id_cliente', 'saldo_atual']].assign(predicted=y_pred)
df1.head()

Unnamed: 0,id_cliente,saldo_atual,predicted
0,1767,278.172008,335.734457
1,11920,268.874152,342.39096
2,8910,446.643127,426.396661
3,4964,321.141267,447.399195
4,10100,428.716114,436.240392


### 3.2 MODEL TRAINING - statsmodels

In [11]:
# Modelling frame for the formula API: selected predictors and the target side by side.
# NOTE(review): this rebinds `df`, so the raw table (including `id_cliente`) is no longer
# reachable under that name. Re-running an earlier cell that selects `id_cliente` from `df`
# after this one will raise; consider binding this frame to a separate name.
df = pd.concat([x_train, y_train], axis=1)
# Display the combined frame.
df

Unnamed: 0,idade,divida_atual,renda_anual,num_pgtos_atrasados,num_consultas_credito,saldo_atual
0,21,2577.05,24196.89636,14,9,278.172008
1,40,2465.39,19227.37796,23,10,268.874152
2,36,1055.29,42822.28223,13,3,446.643127
3,58,703.05,51786.82600,7,2,321.141267
4,35,891.29,44626.85346,10,8,428.716114
...,...,...,...,...,...,...
9495,29,157.98,32624.67754,15,1,157.500279
9496,1237,805.43,49024.15700,11,2,497.714090
9497,47,2250.42,42200.88978,8,4,306.557684
9498,42,505.37,35391.32289,13,3,209.870718


In [12]:
# Same target and five predictors as the scikit-learn model, expressed as a formula.
# Fitting is chained so the name is only ever bound to the fitted results object.
lr_model_stats = smf.ols(
    formula='saldo_atual ~ idade + divida_atual + renda_anual + num_pgtos_atrasados + num_consultas_credito',
    data=df,
).fit()
# Coefficient table and fit diagnostics.
lr_model_stats.summary()

0,1,2,3
Dep. Variable:,saldo_atual,R-squared:,0.101
Model:,OLS,Adj. R-squared:,0.1
Method:,Least Squares,F-statistic:,212.4
Date:,"Mon, 31 Jul 2023",Prob (F-statistic):,2.1900000000000001e-215
Time:,12:02:32,Log-Likelihood:,-64115.0
No. Observations:,9500,AIC:,128200.0
Df Residuals:,9494,BIC:,128300.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,489.0628,3.431,142.523,0.000,482.336,495.789
idade,-1.208e-05,0.003,-0.004,0.997,-0.006,0.006
divida_atual,-0.0596,0.002,-32.502,0.000,-0.063,-0.056
renda_anual,3.761e-06,1.46e-06,2.581,0.010,9.05e-07,6.62e-06
num_pgtos_atrasados,0.0010,0.010,0.100,0.920,-0.018,0.020
num_consultas_credito,0.0136,0.012,1.175,0.240,-0.009,0.036

0,1,2,3
Omnibus:,2662.068,Durbin-Watson:,2.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7308.947
Skew:,1.493,Prob(JB):,0.0
Kurtosis:,6.09,Cond. No.,2380000.0


## 4.0 PERFORMANCE

### 4.1 R squared

In [13]:
# Coefficient of determination of the in-sample predictions.
r2 = mt.r2_score(y_true=df1['saldo_atual'], y_pred=df1['predicted'])
r2

0.10058677119960424

### 4.2 MSE

In [14]:
# Mean squared error of the in-sample predictions (squared units of saldo_atual).
mse = mt.mean_squared_error(y_true=df1['saldo_atual'], y_pred=df1['predicted'])
mse

42620.38267708075

### 4.3 RMSE

In [15]:
# Root of the MSE computed above, back in the units of saldo_atual.
rmse = mse ** 0.5
rmse

206.44704569714906

## 5.0 OUTLIER SIMULATION

In [16]:
# Cumulative share of rows to contaminate at each step; 0 selects no rows,
# so the first result row is the uncontaminated baseline.
outliers = [0,0.05,0.1,0.15,0.2]
# One result row per fraction, with R2 / MSE / RMSE of the refitted model.
result = simulation_outliers(outliers)
result

Unnamed: 0,Outliers,R2,MSE,RMSE
0,0.0,0.100587,42620.38,206.447046
1,0.05,0.003631,3677484.0,1917.676671
2,0.1,0.004471,7011231.0,2647.873014
3,0.15,0.004858,10004360.0,3162.966495
4,0.2,0.00747,13009010.0,3606.800801
