In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import TimeSeriesSplit, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import mean_squared_error
import math

from itertools import tee

## Carregamento dos dados

In [None]:
# Directory holding the input datasets. The trailing slash matters: file names
# are appended to this value with plain string concatenation when the CSVs are read.
kaggle_data_path = "../input/aa2-covid-19-ade/"

In [None]:
# Build each dataset location from the shared input directory, then load both
# files into DataFrames.
confirmed_path = kaggle_data_path + "dataset_for_confirmed"
deaths_path = kaggle_data_path + "dataset_for_deaths"

confirmados_data = pd.read_csv(confirmed_path)
obitos_data = pd.read_csv(deaths_path)

## **Modelo de Regressão Linear**

In [None]:
def data_split(data, target):
    """Separate a DataFrame into its feature columns and its target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the features together with the target column.
    target : str
        Label of the column to predict.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``data`` without the ``target`` column and
        ``y`` is the ``target`` column itself.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    labels = data[target]
    features = data.drop(columns=[target])
    return features, labels


def split_for_model(data, train_count):
    """Cut a sequence into a leading training part and a trailing test part.

    Parameters
    ----------
    data : sliceable
        Any object supporting ``data[a:b]`` slicing.
    train_count : int
        Number of leading elements that go to the training part.

    Returns
    -------
    tuple
        ``(train, test)``: the first ``train_count`` elements and the rest,
        in their original order (no shuffling).
    """
    return data[:train_count], data[train_count:]
    

def plot_prediction(data, prediction, y_train, target, train_size):
    """Plot predicted versus real target values around the train/test boundary.

    The training targets are extended with the predictions (indexed from
    ``train_size`` up to the number of rows in ``data``) and drawn next to the
    real ``data[target]`` column. Only the test window plus the 5 rows before
    it are shown, and a dotted vertical line marks the last training position.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the real ``target`` column.
    prediction : array-like
        Predicted values for the rows from ``train_size`` onwards.
    y_train : pandas.Series
        Target values used for training.
    target : str
        Name of the target column in ``data``.
    train_size : int
        Number of rows that belong to the training part.
    """
    n_rows = data.shape[0]

    # Give the predictions the integer positions that follow the training rows.
    forecast = pd.Series(prediction, index=list(range(train_size, n_rows)))
    predicted_curve = pd.concat([y_train.copy(), forecast])

    comparison = pd.concat(
        [predicted_curve, data[target]],
        keys=['previsão', 'real'],
        axis=1,
    )

    # Keep the test window and a few training rows for visual context.
    visible_rows = n_rows - train_size + 5
    comparison.tail(visible_rows).plot(figsize=(20, 5))

    plt.axvline(x=train_size - 1, linewidth=2, ls=':', color='grey', alpha=0.5)

# Experiências

In [None]:
# Print every column of the confirmed-cases frame next to its positional index.
for position, column_name in enumerate(confirmados_data.columns):
    print(position, " -> ", column_name)


# NOTE(review): presumably column indices from an earlier selection run — confirm.
#[ 1 3 7 8 23 24 25 26 27 28]

In [None]:
# Re-read the "dataset_for_confirmed" file from disk. A later cell rebinds
# `confirmados_data` to a reduced set of columns, so reloading here restores
# the full frame before that cell runs (again).
confirmados_data = pd.read_csv(kaggle_data_path + "dataset_for_confirmed")

In [None]:
# Hold out the last 7 rows as the test window; everything before is training data.
train_size = confirmados_data.shape[0] - 7
target = 'confirmados'
x, y = data_split(confirmados_data, target)

# Scaling returns a bare ndarray, so remember the feature labels beforehand.
feature_names = x.columns

x_train, x_test = split_for_model(x, train_size)
y_train, y_test = split_for_model(y, train_size)

# Fit the scaler on the training rows only, so the held-out rows do not
# influence the min/max used for scaling (avoids train/test leakage).
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

selector = SelectKBest(f_regression, k=4)
new_x = selector.fit_transform(x_train, y_train)

# These indices are positions inside the feature matrix, which does NOT contain
# the target column. Applying them positionally to the full frame would pick
# shifted columns and usually drop the target, so map them to names instead.
cols = selector.get_support(indices=True)
selected_features = list(feature_names[cols])

# Keep the selected features plus the target, which later cells still need.
confirmados_data = confirmados_data[selected_features + [target]]

print(cols)
print(selected_features)

In [None]:
x, y = data_split(confirmados_data, target)

# Split chronologically first, then fit the scaler on the training rows only so
# the held-out rows do not leak into the scaling parameters.
x_train, x_test = split_for_model(x, train_size)
y_train, y_test = split_for_model(y, train_size)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# One splitter object can be shared by every search: GridSearchCV asks it for
# fresh splits on each fit, so no generator copying is needed.
tss = TimeSeriesSplit(n_splits=5)

# NOTE(review): the original grids listed 0.001 twice; the duplicate was dropped.
# If 0.0001 was intended as the last value, add it here.
lasso_alphas = [1, 0.5, 0.3, 0.1, 0.01, 0.02, 0.05, 0.001]
ridge_alphas = [500, 250, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0]

lasso_params = {'alpha': lasso_alphas}
ridge_params = {'alpha': ridge_alphas}

# Inside a Pipeline the regressor is the step named 'fit', hence the prefix.
lasso_params_poly = {'fit__alpha': lasso_alphas}
ridge_params_poly = {'fit__alpha': ridge_alphas}

ols = linear_model.LinearRegression()
lasso = linear_model.Lasso(max_iter=10000000)
ridge = linear_model.Ridge(max_iter=10000000)

pipe1 = Pipeline([('poly', PolynomialFeatures()),
                  ('fit', ols)])
pipe2 = Pipeline([('poly', PolynomialFeatures()),
                  ('fit', lasso)])
pipe3 = Pipeline([('poly', PolynomialFeatures()),
                  ('fit', ridge)])


def tune(estimator, param_grid):
    """Grid-search ``estimator`` over ``param_grid`` and return the best refit model.

    Uses the time-series splits in ``tss`` on ``x_train``/``y_train`` and ranks
    candidates by explained variance.
    """
    search = GridSearchCV(estimator, param_grid=param_grid, cv=tss,
                          scoring='explained_variance')
    return search.fit(x_train, y_train).best_estimator_


# Every entry is a fitted estimator; the OLS variants have no hyper-parameters
# to tune, so they are fitted directly on the training data.
models = {'OLS': ols.fit(x_train, y_train),
          'Lasso': tune(lasso, lasso_params),
          'Ridge': tune(ridge, ridge_params),
          'OLSPoly': pipe1.fit(x_train, y_train),
          'LassoPoly': tune(pipe2, lasso_params_poly),
          'RidgePoly': tune(pipe3, ridge_params_poly)}

In [None]:
# Show the configuration of each candidate model, plain variants before their
# polynomial counterparts within each printed pair.
for model_name in ('OLS', 'OLSPoly', 'Ridge', 'Lasso', 'RidgePoly', 'LassoPoly'):
    print(models[model_name])

In [None]:
# Fit the polynomial Lasso model on the training rows, predict the held-out
# rows, and compare predictions with the real values numerically and visually.
lasso_poly = models['LassoPoly']
lasso_poly.fit(x_train, y_train)

prediction = lasso_poly.predict(x_test)
print(prediction, y_test)

plot_prediction(confirmados_data, prediction, y_train, target, train_size)

In [None]:
# Root mean squared error of the predictions on the held-out rows.
test_mse = mean_squared_error(y_test, prediction)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))