# Linear Regression

## Imports

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics as mt
from sklearn import linear_model as lm
import numpy as np
from matplotlib import pyplot as plt

## Load datasets

In [3]:
# Read the pre-split feature matrices (train / test / validation) from the
# working directory; map() reads the files lazily in the listed order.
x_train, x_test, x_val = map(
    pd.read_csv,
    ('./X_training.csv', './X_test.csv', './X_validation.csv'),
)

# Read the matching target files, in the same train / test / validation order.
y_train, y_test, y_val = map(
    pd.read_csv,
    ('./y_training.csv', './y_test.csv', './y_val.csv'),
)


## Model training

### Training dataset

In [29]:
# Fit every linear model on the training split and predict on that same split
# (in-sample predictions, used by the training-metrics cells below).

# Ordinary least squares
lr = lm.LinearRegression()

# Lasso (L1 penalty) - tends to drive coefficients to exactly zero
lasso = lm.Lasso(alpha=1)

# Ridge (L2 penalty) - tends to shrink / smooth the coefficients
# NOTE(review): alpha is 30 here but 20 in the validation and test cells - confirm
# which value is intended.
ridge = lm.Ridge(alpha=30)

# ElasticNet - combines the L1 and L2 penalties
elastic = lm.ElasticNet(alpha=20)

# RANSAC draws random subsets of the data while fitting; pin the seed so the
# fitted model (and every metric derived from it) is reproducible across runs.
ransac = lm.RANSACRegressor(random_state=42)

# fit
lr.fit(x_train, y_train)
lasso.fit(x_train, y_train)
ridge.fit(x_train, y_train)
elastic.fit(x_train, y_train)
ransac.fit(x_train, y_train)

# predict
yhat_linear_model = lr.predict(x_train)
yhat_lasso = lasso.predict(x_train)
yhat_ridge = ridge.predict(x_train)
yhat_elastic = elastic.predict(x_train)
yhat_ransac = ransac.predict(x_train)


In [5]:
# Note how, as alpha increases in the Lasso, the columns that contribute the least tend to be zeroed out.
lasso.coef_

array([-0.        , -0.        ,  0.        , -0.        , -0.        ,
       -0.        , -0.        ,  1.13019385, -0.        ,  0.        ,
       -0.        ,  0.        , -0.        ])

In [6]:
# Display the coefficients of the fitted Ridge model.
ridge.coef_

array([[-0.21226959, -3.48247681, 12.47457694, -9.38095792, -9.57520963,
        -0.88408915, -4.67450296,  2.45434703, -0.24556234, -1.11341609,
        -0.43673057,  4.88886295, -8.97049054]])

In [7]:
# Display the coefficients of the estimator held by the fitted RANSAC wrapper.
ransac.estimator_.coef_

array([[  3.54265312,   3.06874915, -30.82303977, -59.62005817,
        -81.83011999, -28.23217877, -10.47091678,   2.44835918,
        -14.60016036, 115.08191324,   6.06896222,   0.93945061,
         14.82669403]])

In [8]:
# Training-set metrics - linear regression

mae_lm = mt.mean_absolute_error(y_train, yhat_linear_model)
mape_lm = mt.mean_absolute_percentage_error(y_train, yhat_linear_model)
mse_lm = mt.mean_squared_error(y_train, yhat_linear_model)
rmse_lm = np.sqrt(mse_lm)  # RMSE is derived from the MSE computed above
r2_lm = mt.r2_score(y_train, yhat_linear_model)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_lm:.3f}',
    f'mse: {mse_lm:.3f}',
    f'rmse: {rmse_lm:.3f}',
    f'mae: {mae_lm:.3f}',
    f'mape: {mape_lm:.3f}',
    sep='\n',
)

r2: 0.046
mse: 455.996
rmse: 21.354
mae: 16.998
mape: 8.653


In [9]:
# Training-set metrics - Lasso

mae_lasso = mt.mean_absolute_error(y_train, yhat_lasso)
mape_lasso = mt.mean_absolute_percentage_error(y_train, yhat_lasso)
mse_lasso = mt.mean_squared_error(y_train, yhat_lasso)
rmse_lasso = np.sqrt(mse_lasso)  # RMSE is derived from the MSE computed above
r2_lasso = mt.r2_score(y_train, yhat_lasso)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_lasso:.3f}',
    f'mse: {mse_lasso:.3f}',
    f'rmse: {rmse_lasso:.3f}',
    f'mae: {mae_lasso:.3f}',
    f'mape: {mape_lasso:.3f}',
    sep='\n',
)

r2: 0.007
mse: 474.475
rmse: 21.782
mae: 17.305
mape: 8.737


In [30]:
# Training-set metrics - Ridge

mae_ridge = mt.mean_absolute_error(y_train, yhat_ridge)
mape_ridge = mt.mean_absolute_percentage_error(y_train, yhat_ridge)
mse_ridge = mt.mean_squared_error(y_train, yhat_ridge)
rmse_ridge = np.sqrt(mse_ridge)  # RMSE is derived from the MSE computed above
r2_ridge = mt.r2_score(y_train, yhat_ridge)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_ridge:.3f}',
    f'mse: {mse_ridge:.3f}',
    f'rmse: {rmse_ridge:.3f}',
    f'mae: {mae_ridge:.3f}',
    f'mape: {mape_ridge:.3f}',
    sep='\n',
)

r2: 0.046
mse: 456.154
rmse: 21.358
mae: 17.002
mape: 8.658


In [31]:
# Training-set metrics - ElasticNet

mae_en = mt.mean_absolute_error(y_train, yhat_elastic)
mape_en = mt.mean_absolute_percentage_error(y_train, yhat_elastic)
mse_en = mt.mean_squared_error(y_train, yhat_elastic)
rmse_en = np.sqrt(mse_en)  # RMSE is derived from the MSE computed above
r2_en = mt.r2_score(y_train, yhat_elastic)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_en:.3f}',
    f'mse: {mse_en:.3f}',
    f'rmse: {rmse_en:.3f}',
    f'mae: {mae_en:.3f}',
    f'mape: {mape_en:.3f}',
    sep='\n',
)

r2: 0.000
mse: 478.013
rmse: 21.863
mae: 17.365
mape: 8.742


In [12]:
# Training-set metrics - RANSAC

mae_ransac = mt.mean_absolute_error(y_train, yhat_ransac)
mape_ransac = mt.mean_absolute_percentage_error(y_train, yhat_ransac)
mse_ransac = mt.mean_squared_error(y_train, yhat_ransac)
rmse_ransac = np.sqrt(mse_ransac)  # RMSE is derived from the MSE computed above
r2_ransac = mt.r2_score(y_train, yhat_ransac)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_ransac:.3f}',
    f'mse: {mse_ransac:.3f}',
    f'rmse: {rmse_ransac:.3f}',
    f'mae: {mae_ransac:.3f}',
    f'mape: {mape_ransac:.3f}',
    sep='\n',
)

r2: -1.265
mse: 1082.860
rmse: 32.907
mae: 25.999
mape: 9.014


### Validation dataset

In [32]:
# Re-create every model, fit it on the training split only, and predict on the
# validation split (out-of-sample predictions for the validation-metrics cells).

# Ordinary least squares
lr = lm.LinearRegression()

# Lasso (L1 penalty) - tends to drive coefficients to exactly zero
lasso = lm.Lasso(alpha=1)

# Ridge (L2 penalty) - tends to shrink / smooth the coefficients
ridge = lm.Ridge(alpha=20)

# ElasticNet - combines the L1 and L2 penalties
elastic = lm.ElasticNet(alpha=20)

# RANSAC draws random subsets of the data while fitting; pin the seed so the
# fitted model (and every metric derived from it) is reproducible across runs.
ransac = lm.RANSACRegressor(random_state=42)

# fit
lr.fit(x_train, y_train)
lasso.fit(x_train, y_train)
ridge.fit(x_train, y_train)
elastic.fit(x_train, y_train)
ransac.fit(x_train, y_train)

# predict
yhat_linear_model = lr.predict(x_val)
yhat_lasso = lasso.predict(x_val)
yhat_ridge = ridge.predict(x_val)
yhat_elastic = elastic.predict(x_val)
yhat_ransac = ransac.predict(x_val)


In [14]:
# Validation-set metrics - linear regression

mae_lm = mt.mean_absolute_error(y_val, yhat_linear_model)
mape_lm = mt.mean_absolute_percentage_error(y_val, yhat_linear_model)
mse_lm = mt.mean_squared_error(y_val, yhat_linear_model)
rmse_lm = np.sqrt(mse_lm)  # RMSE is derived from the MSE computed above
r2_lm = mt.r2_score(y_val, yhat_linear_model)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_lm:.3f}',
    f'mse: {mse_lm:.3f}',
    f'rmse: {rmse_lm:.3f}',
    f'mae: {mae_lm:.3f}',
    f'mape: {mape_lm:.3f}',
    sep='\n',
)

r2: 0.040
mse: 458.447
rmse: 21.411
mae: 17.040
mape: 8.683


In [33]:
# Validation-set metrics - Lasso

mae_lasso = mt.mean_absolute_error(y_val, yhat_lasso)
mape_lasso = mt.mean_absolute_percentage_error(y_val, yhat_lasso)
mse_lasso = mt.mean_squared_error(y_val, yhat_lasso)
rmse_lasso = np.sqrt(mse_lasso)  # RMSE is derived from the MSE computed above
r2_lasso = mt.r2_score(y_val, yhat_lasso)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_lasso:.3f}',
    f'mse: {mse_lasso:.3f}',
    f'rmse: {rmse_lasso:.3f}',
    f'mae: {mae_lasso:.3f}',
    f'mape: {mape_lasso:.3f}',
    sep='\n',
)

r2: 0.008
mse: 473.747
rmse: 21.766
mae: 17.265
mape: 8.696


In [16]:
# Validation-set metrics - Ridge

mae_ridge = mt.mean_absolute_error(y_val, yhat_ridge)
mape_ridge = mt.mean_absolute_percentage_error(y_val, yhat_ridge)
mse_ridge = mt.mean_squared_error(y_val, yhat_ridge)
rmse_ridge = np.sqrt(mse_ridge)  # RMSE is derived from the MSE computed above
r2_ridge = mt.r2_score(y_val, yhat_ridge)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_ridge:.3f}',
    f'mse: {mse_ridge:.3f}',
    f'rmse: {rmse_ridge:.3f}',
    f'mae: {mae_ridge:.3f}',
    f'mape: {mape_ridge:.3f}',
    sep='\n',
)

r2: 0.040
mse: 458.453
rmse: 21.412
mae: 17.036
mape: 8.680


In [17]:
# Validation-set metrics - ElasticNet

mae_en = mt.mean_absolute_error(y_val, yhat_elastic)
mape_en = mt.mean_absolute_percentage_error(y_val, yhat_elastic)
mse_en = mt.mean_squared_error(y_val, yhat_elastic)
rmse_en = np.sqrt(mse_en)  # RMSE is derived from the MSE computed above
r2_en = mt.r2_score(y_val, yhat_elastic)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_en:.3f}',
    f'mse: {mse_en:.3f}',
    f'rmse: {rmse_en:.3f}',
    f'mae: {mae_en:.3f}',
    f'mape: {mape_en:.3f}',
    sep='\n',
)

r2: -0.000
mse: 477.512
rmse: 21.852
mae: 17.353
mape: 8.679


In [18]:
# Validation-set metrics - RANSAC

mae_ransac = mt.mean_absolute_error(y_val, yhat_ransac)
mape_ransac = mt.mean_absolute_percentage_error(y_val, yhat_ransac)
mse_ransac = mt.mean_squared_error(y_val, yhat_ransac)
rmse_ransac = np.sqrt(mse_ransac)  # RMSE is derived from the MSE computed above
r2_ransac = mt.r2_score(y_val, yhat_ransac)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_ransac:.3f}',
    f'mse: {mse_ransac:.3f}',
    f'rmse: {rmse_ransac:.3f}',
    f'mae: {mae_ransac:.3f}',
    f'mape: {mape_ransac:.3f}',
    sep='\n',
)

r2: -0.371
mse: 654.878
rmse: 25.591
mae: 19.969
mape: 9.483


### Test dataset

In [19]:
# Final evaluation: re-create every model, fit it on train + validation
# combined, and predict on the held-out test split.

# Ordinary least squares
lr = lm.LinearRegression()

# Lasso (L1 penalty) - tends to drive coefficients to exactly zero
# NOTE(review): alpha is 20 here but 1 in the training and validation cells, so
# the configuration scored on the test set is not the one that was validated -
# confirm which value is intended.
lasso = lm.Lasso(alpha=20)

# Ridge (L2 penalty) - tends to shrink / smooth the coefficients
ridge = lm.Ridge(alpha=20)

# ElasticNet - combines the L1 and L2 penalties
elastic = lm.ElasticNet(alpha=20)

# RANSAC draws random subsets of the data while fitting; pin the seed so the
# fitted model (and every metric derived from it) is reproducible across runs.
ransac = lm.RANSACRegressor(random_state=42)

# Stack train + validation once instead of rebuilding the arrays for each model.
# pd.concat keeps the feature column names, so predicting on the x_test
# DataFrame no longer triggers scikit-learn's "fitted without feature names"
# warning. Assumes x_train and x_val share the same columns - TODO confirm.
x_train_val = pd.concat([x_train, x_val], ignore_index=True)
# Targets carry no feature names; keep the original positional stacking.
y_train_val = np.concatenate((y_train, y_val))

# fit
lr.fit(x_train_val, y_train_val)
lasso.fit(x_train_val, y_train_val)
ridge.fit(x_train_val, y_train_val)
elastic.fit(x_train_val, y_train_val)
ransac.fit(x_train_val, y_train_val)

# predict
yhat_linear_model = lr.predict(x_test)
yhat_lasso = lasso.predict(x_test)
yhat_ridge = ridge.predict(x_test)
yhat_elastic = elastic.predict(x_test)
yhat_ransac = ransac.predict(x_test)



In [20]:
# Test-set metrics - linear regression

mae_lm = mt.mean_absolute_error(y_test, yhat_linear_model)
mape_lm = mt.mean_absolute_percentage_error(y_test, yhat_linear_model)
mse_lm = mt.mean_squared_error(y_test, yhat_linear_model)
rmse_lm = np.sqrt(mse_lm)  # RMSE is derived from the MSE computed above
r2_lm = mt.r2_score(y_test, yhat_linear_model)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_lm:.3f}',
    f'mse: {mse_lm:.3f}',
    f'rmse: {rmse_lm:.3f}',
    f'mae: {mae_lm:.3f}',
    f'mape: {mape_lm:.3f}',
    sep='\n',
)

r2: 0.051
mse: 461.988
rmse: 21.494
mae: 17.144
mape: 8.531


In [21]:
# Test-set metrics - Lasso

mae_lasso = mt.mean_absolute_error(y_test, yhat_lasso)
mape_lasso = mt.mean_absolute_percentage_error(y_test, yhat_lasso)
mse_lasso = mt.mean_squared_error(y_test, yhat_lasso)
rmse_lasso = np.sqrt(mse_lasso)  # RMSE is derived from the MSE computed above
r2_lasso = mt.r2_score(y_test, yhat_lasso)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_lasso:.3f}',
    f'mse: {mse_lasso:.3f}',
    f'rmse: {rmse_lasso:.3f}',
    f'mae: {mae_lasso:.3f}',
    f'mape: {mape_lasso:.3f}',
    sep='\n',
)

r2: -0.000
mse: 486.959
rmse: 22.067
mae: 17.552
mape: 8.714


In [22]:
# Test-set metrics - Ridge

mae_ridge = mt.mean_absolute_error(y_test, yhat_ridge)
mape_ridge = mt.mean_absolute_percentage_error(y_test, yhat_ridge)
mse_ridge = mt.mean_squared_error(y_test, yhat_ridge)
rmse_ridge = np.sqrt(mse_ridge)  # RMSE is derived from the MSE computed above
r2_ridge = mt.r2_score(y_test, yhat_ridge)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_ridge:.3f}',
    f'mse: {mse_ridge:.3f}',
    f'rmse: {rmse_ridge:.3f}',
    f'mae: {mae_ridge:.3f}',
    f'mape: {mape_ridge:.3f}',
    sep='\n',
)

r2: 0.051
mse: 462.064
rmse: 21.496
mae: 17.142
mape: 8.543


In [23]:
# Test-set metrics - ElasticNet

mae_en = mt.mean_absolute_error(y_test, yhat_elastic)
mape_en = mt.mean_absolute_percentage_error(y_test, yhat_elastic)
mse_en = mt.mean_squared_error(y_test, yhat_elastic)
rmse_en = np.sqrt(mse_en)  # RMSE is derived from the MSE computed above
r2_en = mt.r2_score(y_test, yhat_elastic)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_en:.3f}',
    f'mse: {mse_en:.3f}',
    f'rmse: {rmse_en:.3f}',
    f'mae: {mae_en:.3f}',
    f'mape: {mape_en:.3f}',
    sep='\n',
)

r2: -0.000
mse: 486.959
rmse: 22.067
mae: 17.552
mape: 8.714


In [24]:
# Test-set metrics - RANSAC

mae_ransac = mt.mean_absolute_error(y_test, yhat_ransac)
mape_ransac = mt.mean_absolute_percentage_error(y_test, yhat_ransac)
mse_ransac = mt.mean_squared_error(y_test, yhat_ransac)
rmse_ransac = np.sqrt(mse_ransac)  # RMSE is derived from the MSE computed above
r2_ransac = mt.r2_score(y_test, yhat_ransac)

# One print call; sep='\n' puts each metric on its own line (3 decimals).
print(
    f'r2: {r2_ransac:.3f}',
    f'mse: {mse_ransac:.3f}',
    f'rmse: {rmse_ransac:.3f}',
    f'mae: {mae_ransac:.3f}',
    f'mape: {mape_ransac:.3f}',
    sep='\n',
)

r2: -0.871
mse: 910.976
rmse: 30.182
mae: 23.936
mape: 8.904
