# Deploy de um modelo de Machine Learning, usando Streamlit e Docker

Data Scientist: Dr. Eddy Giusepe Chirinos Isidro

In [22]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np


In [23]:
# Load the dataset from the CSV file (path is relative to the notebook's working directory)
dados = pd.read_csv(filepath_or_buffer="../data/itens.csv")
# Preview the first five rows
dados.head(n=5)

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,200.0,1999,3,300000
4,100.0,2010,1,140000


In [24]:
# Dataset dimensions as (rows, columns)
dados.shape

(1478, 4)

In [25]:
# Count the missing values in each column
dados.isna().sum()

tamanho    0
ano        0
garagem    0
preco      0
dtype: int64

In [26]:
# Inspect the data type of each column
dados.dtypes

tamanho    float64
ano          int64
garagem      int64
preco        int64
dtype: object

In [27]:
# Separate the DataFrame into predictors (every column except 'preco') and the target
x = dados.drop(columns='preco')
y = dados['preco']

In [28]:
# Display the feature matrix (the frame left after dropping the 'preco' target column)
x

Unnamed: 0,tamanho,ano,garagem
0,159.0,2003,2
1,117.0,1976,2
2,166.0,2001,2
3,200.0,1999,3
4,100.0,2010,1
...,...,...,...
1473,117.0,1976,2
1474,166.0,2001,2
1475,159.0,2003,2
1476,117.0,1976,2


In [29]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the displayed feature rows repeat (the last rows shown match the first
# ones), so identical rows may end up in both splits and inflate the test scores --
# confirm, and consider dropping duplicates before splitting.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=1234,
)

In [30]:
# Shape of the training features as (rows, columns)
x_train.shape

(1034, 3)

In [31]:
# Shape of the test features as (rows, columns)
x_test.shape

(444, 3)

In [32]:
# Shape of the training target
y_train.shape

(1034,)

In [33]:
# Shape of the test target
y_test.shape

(444,)

In [37]:
# Instantiate the candidate regressors; seeds are fixed so runs are repeatable
lm = LinearRegression()
dt = DecisionTreeRegressor(
    max_depth=10,
    random_state=1234,
)
rf = RandomForestRegressor(
    n_estimators=190,
    max_depth=8,
    random_state=1234,
)
gb = GradientBoostingRegressor(random_state=1234)


In [38]:
def metrics(y_true, y_pred):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'rmse'``, ``'mape'``, ``'r2'`` and ``'mae'``. ``'mape'`` is a
        fraction (0.05 means 5%), not a percentage, and is not finite when
        ``y_true`` contains zeros.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # Compute MAPE on plain arrays so values are paired by position (as the
    # scikit-learn metrics above do) rather than by pandas index label, and so
    # that lists are accepted as well as Series/ndarrays.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    # Divide by |y_true| so negative targets cannot yield negative (cancelling) terms.
    mape = np.mean(np.abs(pred_arr - true_arr) / np.abs(true_arr))

    r2 = r2_score(y_true, y_pred)
    return {'rmse': rmse, 'mape': mape, 'r2': r2, 'mae': mae}
    

In [39]:
# Train every candidate model on the training split
lm.fit(X=x_train, y=y_train)
rf.fit(X=x_train, y=y_train)
gb.fit(X=x_train, y=y_train)
dt.fit(X=x_train, y=y_train)

In [40]:
# Generate test-set predictions with each fitted model
y_pred_lm = lm.predict(X=x_test)
y_pred_rf = rf.predict(X=x_test)
y_pred_gb = gb.predict(X=x_test)
y_pred_dt = dt.predict(X=x_test)

In [41]:
# Test-set metrics for the Gradient Boosting model
metrics(y_true=y_test, y_pred=y_pred_gb)

{'rmse': 1.731421055948538,
 'mape': 5.74875432993357e-06,
 'r2': 0.9999999992750928,
 'mae': 1.2988450270623855}

In [42]:
# Test-set metrics for the Random Forest model
metrics(y_true=y_test, y_pred=y_pred_rf)

{'rmse': 0.0, 'mape': 0.0, 'r2': 1.0, 'mae': 0.0}

In [43]:
# Test-set metrics for the Linear Regression model
metrics(y_true=y_test, y_pred=y_pred_lm)

{'rmse': 14955.018325097868,
 'mape': 0.05658830550772031,
 'r2': 0.9459182216523392,
 'mae': 12598.085719257811}

In [44]:
# Test-set metrics for the Decision Tree model
metrics(y_true=y_test, y_pred=y_pred_dt)

{'rmse': 0.0, 'mape': 0.0, 'r2': 1.0, 'mae': 0.0}

In [45]:
# Save the model chosen as best (`rf`) to disk with pickle

import pickle
from pathlib import Path

caminho_modelo = Path("../models/model_rf.pkl")
# Create the output folder if it is missing; opening the file for writing would
# otherwise raise FileNotFoundError on a fresh checkout.
caminho_modelo.parent.mkdir(parents=True, exist_ok=True)

with caminho_modelo.open("wb") as arquivo_model:
    pickle.dump(rf, arquivo_model)