## Importação de bibliotecas

In [1]:
import pandas
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np

## Leitura de datasets

In [2]:
# Read the four raw CSV inputs, in order, into df_1 .. df_4.
df_1, df_2, df_3, df_4 = (
    pandas.read_csv(csv_path)
    for csv_path in (
        'input/data_1__.csv',
        'input/data_2__.csv',
        'input/data_3__.csv',
        'input/data_4__.csv',
    )
)

## Definição de funções úteis

In [3]:
def convertDateToYearMonth(date):
    """Collapse a dash-separated date string into a year+month key.

    Parameters
    ----------
    date : str
        Date string whose first two ``-``-separated fields are the year and
        the month, e.g. ``"2018-10-05"``. Any further fields are ignored.

    Returns
    -------
    str
        The year field followed by the month field, with the month
        zero-padded to two digits (``"2018-10-05"`` -> ``"201810"``).

    Raises
    ------
    IndexError
        If ``date`` contains no ``-`` separator.
    """
    parts = date.split("-")
    year, month = parts[0], parts[1]
    # Pad the month so "2018-1-05" yields "201801" rather than "20181";
    # without padding the key mis-sorts and mis-groups once it is cast to int.
    return year + month.zfill(2)

In [4]:
def preprocessorDataFrame(df):
    """Aggregate a frame of dated values into one total per year-month.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``data`` column of dash-separated date strings (as
        accepted by ``convertDateToYearMonth``) and a ``valor`` column.
        The frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with two columns: ``YearMonth`` (int) and ``valor``
        (sum of ``valor`` over all rows sharing that year-month), one row per
        distinct year-month.
    """
    # assign() works on a copy, so the caller's frame does not silently gain a
    # 'YearMonth' column (the previous version wrote it into `df` in place).
    keyed = df.assign(YearMonth=df['data'].apply(convertDateToYearMonth))
    monthly = keyed.groupby(['YearMonth'])['valor'].sum().reset_index()
    # The key is built as a string; downstream code uses it as a numeric feature.
    monthly['YearMonth'] = monthly['YearMonth'].apply(int)
    return monthly

## Preprocessamento df_4

In [5]:
# Count the rows for each distinct 'Product id' value in df_4.
print(df_4.groupby('Product id').size())
# The recorded output lists a single Product id (pd_1), so this column
# carries no information and is not used in the aggregation that follows.

Product id
pd_1    60
dtype: int64


In [6]:
# df_4 already has a YearMonth column, so only the value column needs renaming
# ("Volume" -> 'valor') before summing per month; the result has the same
# YearMonth / valor layout that preprocessorDataFrame returns.
df_4_replaced = (
    df_4
    .rename(columns={"Volume": 'valor'})
    .groupby(['YearMonth'])['valor']
    .sum()
    .reset_index()
)
df_4_replaced['YearMonth'] = df_4_replaced['YearMonth'].apply(int)
df_4_replaced.tail()

Unnamed: 0,YearMonth,valor
55,201708,27.3492
56,201709,25.2192
57,201710,23.9412
58,201711,28.2012
59,201712,22.0668


## Preprocessamento df_3

In [7]:
# Aggregate df_3 into per-month 'valor' totals and preview the last rows.
df_3_replaced = preprocessorDataFrame(df_3)
df_3_replaced.tail()

Unnamed: 0,YearMonth,valor
0,201808,83050.67
1,201809,296706.28
2,201810,275052.14


## Preprocessamento df_2

In [8]:
# Aggregate df_2 into per-month 'valor' totals and preview the last rows.
df_2_replaced = preprocessorDataFrame(df_2)
df_2_replaced.tail()

Unnamed: 0,YearMonth,valor
47,201806,379383.08
48,201807,482195.78
49,201808,434308.16
50,201809,438753.21
51,201810,457025.59


## Preprocessamento df_1

In [9]:
# Aggregate df_1 into per-month 'valor' totals and preview the last rows.
df_1_replaced = preprocessorDataFrame(df_1)
df_1_replaced.tail()

Unnamed: 0,YearMonth,valor
22,201806,759047.88
23,201807,711501.72
24,201808,700150.67
25,201809,747653.22
26,201810,635530.09


## Predições

In [10]:
def linearRegresionPredict(df):
    """Fit a linear regression on all rows but the last and score it on that row.

    The final row of ``df`` is held out as a one-sample test set. A
    ``LinearRegression`` of ``valor`` on ``YearMonth`` is fitted on the
    remaining rows and used to predict the held-out ``valor``. The test point,
    the prediction, the held-out value and the MAE / MSE / RMSE are printed;
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``YearMonth`` and ``valor`` columns and at least two
        rows. Rows are used in their existing order, so the last row is the
        one being predicted. ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``df`` has fewer than two rows (one to train on, one to hold out).
    """
    if len(df) < 2:
        raise ValueError(
            "linearRegresionPredict needs at least two rows: "
            "one to train on and one to hold out"
        )

    model = LinearRegression()

    # NOTE(review): if YearMonth holds YYYYMM integers (as the recorded outputs
    # suggest), consecutive months are not evenly spaced across a year boundary
    # (201712 -> 201801), which distorts a linear fit. Confirm whether a
    # sequential month index would be more appropriate.
    last_valor = df['valor'].values[-1]
    last_yearMonth = df['YearMonth'].values[-1]

    # Drop the hold-out row by position; dropping by index label would remove
    # additional rows if the label happened to be repeated.
    train = df.iloc[:-1]

    x_test = [[last_yearMonth]]
    y_test = [[last_valor]]

    print("X test: ", x_test, "y test: ", y_test)

    # Train on plain 2-D arrays: fitting on a DataFrame and then predicting on
    # a bare list makes scikit-learn warn about missing feature names.
    X_train = train[['YearMonth']].values
    y_train = train[['valor']].values

    model.fit(X_train, y_train)

    y_pred = model.predict(x_test)

    # With a single test sample MAE and RMSE coincide; MSE is computed once
    # and reused for the RMSE.
    mse = metrics.mean_squared_error(y_test, y_pred)

    print('predição: ', y_pred)
    print('valor original: ', y_test)
    print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))

### Predições

#### DF_1

In [11]:
# Hold out the last row of df_1_replaced and print the prediction and errors for it.
linearRegresionPredict(df_1_replaced)

X test:  [[201810]] y test:  [[635530.0900000001]]
predição:  [[716782.45822836]]
valor original:  [[635530.0900000001]]
MAE: 81252.3682283645
MSE: 6601947342.717737
RMSE: 81252.3682283645


#### DF_2

In [12]:
# Hold out the last row of df_2_replaced and print the prediction and errors for it.
linearRegresionPredict(df_2_replaced)

X test:  [[201810]] y test:  [[457025.5900000001]]
predição:  [[461926.1480768]]
valor original:  [[457025.5900000001]]
MAE: 4900.558076802408
MSE: 24015469.464113317
RMSE: 4900.558076802408


#### DF_3

In [13]:
# Hold out the last row of df_3_replaced and print the prediction and errors for it.
# NOTE(review): the recorded tail() output shows only three monthly rows for
# df_3_replaced, so the model is fitted on just two points - treat the result with care.
linearRegresionPredict(df_3_replaced)

X test:  [[201810]] y test:  [[275052.13999999996]]
predição:  [[510361.88999939]]
valor original:  [[275052.13999999996]]
MAE: 235309.7499993897
MSE: 55370678444.775276
RMSE: 235309.7499993897


#### DF_4

In [14]:
# Hold out the last row of df_4_replaced and print the prediction and errors for it.
linearRegresionPredict(df_4_replaced)

X test:  [[201712]] y test:  [[22.0668]]
predição:  [[2503.97503045]]
valor original:  [[22.0668]]
MAE: 2481.9082304542976
MSE: 6159868.464396783
RMSE: 2481.9082304542976


### Recarregando dados

In [15]:
# df_4_replaced = df_4
# df_4_replaced = df_4_replaced.rename(columns= {"Volume": 'valor'})
# df_4_replaced = df_4_replaced.groupby(['YearMonth'])['valor'].sum().reset_index()
# df_4_replaced['YearMonth'] = df_4_replaced['YearMonth'].apply(lambda ym: int(ym))

# df_3_replaced = preprocessorDataFrame(df_3)
# df_2_replaced = preprocessorDataFrame(df_2)
# df_1_replaced = preprocessorDataFrame(df_1)

### Predições com xgb regressor

In [16]:
# modelo_xgb = xgb.sklearn.XGBRegressor(random_state=1, max_depth=5, learning_rate=0.01, n_estimators=1000)
# df_1_replaced['valor'] = df_1_replaced['valor'].apply(lambda ym: int(ym))
# last_valor = df_1_replaced['valor'].values[-1]
# last_yearMonth = df_1_replaced['YearMonth'].values[-1]

# eval_set_valor = df_1_replaced['valor'].values[-1]
# eval_set_yearMonth = df_1_replaced['YearMonth'].values[-1]

# df_1_replaced = df_1_replaced.drop(df_1_replaced.index[-2])

# X = df_1_replaced.values[:, :-1]
# y = df_1_replaced.values[:, -1]

# eval_set = [([eval_set_valor], [eval_set_yearMonth])]

# modelo_xgb.fit(X, y, early_stopping_rounds=100, eval_set=eval_set, verbose=False)

# # 635775
# # modelo_xgb.fit(X, y, early_stopping_rounds=100, eval_set=eval_set, verbose=False)

# resposta = modelo_xgb.predict(last_yearMonth)
# print('resposta: ', resposta)

In [None]:
# print("MSE: ")
# print(metrics.mean_squared_error(last_valor, resposta))
# print("RMSE: ")
# print(math.sqrt(metrics.mean_squared_error(last_valor, resposta)))
# print("MAE: ")
# print(metrics.mean_absolute_error([[last_valor]], [[int(resposta)]]))