In [1]:
# Data handling
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Plotting
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling and evaluation
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Progress bar for long-running loops
# ------------------------------------------------------------------------------
from tqdm import tqdm

# Warnings configuration: show each distinct warning only the first time it occurs
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

In [2]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics (test row first, then train).

    Parameters
    ----------
    y_test, y_train : true target values of the test and training splits.
    y_test_pred, y_train_pred : model predictions for those same splits.
    tipo_modelo : label written to the ``modelo`` column of both rows.

    Returns
    -------
    pandas.DataFrame with columns MAE, MSE, RMSE, R2, set and modelo.
    """
    # (true values, predictions) pairs, in the same order as the "set" labels.
    pares = [(y_test, y_test_pred), (y_train, y_train_pred)]

    # MSE is computed once per split and reused for the RMSE column.
    mse_por_set = [mean_squared_error(real, pred) for real, pred in pares]

    tabla = pd.DataFrame({
        'MAE': [mean_absolute_error(real, pred) for real, pred in pares],
        'MSE': mse_por_set,
        'RMSE': [np.sqrt(valor) for valor in mse_por_set],
        'R2': [r2_score(real, pred) for real, pred in pares],
        "set": ["test", "train"],
    })
    tabla["modelo"] = tipo_modelo
    return tabla

In [3]:
# Load the dataset (the CSV's first column becomes the index) and preview the first rows.
df = pd.read_csv("./datos/estandarizados2.csv", index_col=0)
df.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,0,1,1,1,0,2,-0.778781,-0.632082,1.232734,-0.391485,-0.77184,654,985
1,2,0,0,1,0,2,1,2,-0.67424,-0.691755,0.469324,0.731665,-1.155857,670,801
2,3,0,0,1,0,3,1,1,-1.580704,-1.692801,-1.331685,0.729762,-1.176978,1229,1349
3,4,0,0,1,0,4,1,1,-1.560855,-1.553969,-0.266251,-0.393388,-1.200019,1454,1562
4,5,0,0,1,0,5,1,1,-1.413968,-1.450455,-1.333773,-0.05454,-1.249941,1518,1600


In [4]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 686 entries, 0 to 685
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     686 non-null    int64  
 1   season      686 non-null    int64  
 2   yr          686 non-null    int64  
 3   mnth        686 non-null    int64  
 4   holiday     686 non-null    int64  
 5   weekday     686 non-null    int64  
 6   workingday  686 non-null    int64  
 7   weathersit  686 non-null    int64  
 8   temp        686 non-null    float64
 9   atemp       686 non-null    float64
 10  hum         686 non-null    float64
 11  windspeed   686 non-null    float64
 12  casual      686 non-null    float64
 13  registered  686 non-null    int64  
 14  cnt         686 non-null    int64  
dtypes: float64(5), int64(10)
memory usage: 85.8 KB


In [5]:
# Number of rows per season code.
df["season"].value_counts()

0    179
3    172
1    171
2    164
Name: season, dtype: int64

In [6]:
# NOTE(review): this cell re-defines `metricas` exactly as the earlier cell In [2] does;
# the later definition simply replaces the earlier one. Consider keeping only one.
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics (test row first, then train).

    Parameters
    ----------
    y_test, y_train : true target values of the test and training splits.
    y_test_pred, y_train_pred : model predictions for those same splits.
    tipo_modelo : label written to the ``modelo`` column of both rows.

    Returns
    -------
    pandas.DataFrame with columns MAE, MSE, RMSE, R2, set and modelo.
    """
    # (true values, predictions) pairs, in the same order as the "set" labels.
    pares = [(y_test, y_test_pred), (y_train, y_train_pred)]

    # MSE is computed once per split and reused for the RMSE column.
    mse_por_set = [mean_squared_error(real, pred) for real, pred in pares]

    tabla = pd.DataFrame({
        'MAE': [mean_absolute_error(real, pred) for real, pred in pares],
        'MSE': mse_por_set,
        'RMSE': [np.sqrt(valor) for valor in mse_por_set],
        'R2': [r2_score(real, pred) for real, pred in pares],
        "set": ["test", "train"],
    })
    tabla["modelo"] = tipo_modelo
    return tabla

In [7]:
# Hyperparameter grid for the grid search: every combination of the values below
# is fitted and cross-validated.

param = {
    # Keep the trees shallow: the previous model (depth 17) overfitted clearly.
    "max_depth": [2, 4, 6],
    # Number of features considered at each split. The feature matrix built further
    # down (season, workingday, cnt) has only 3 columns, so the former upper value
    # of 4 was out of range for this data; the grid now stops at 3.
    "max_features": [1, 2, 3],
    # These two are harder to choose a priori; the values below are commonly used.
    "min_samples_split": [10, 50, 100],
    "min_samples_leaf": [10, 50, 100],
    # Single fixed seed so every fit is reproducible.
    "random_state": [0],
}

In [8]:
# With the grid defined, set up the grid search for a single decision tree.

gs = GridSearchCV(
    estimator=DecisionTreeRegressor(),  # model family being tuned
    param_grid=param,                   # hyperparameter combinations to evaluate
    cv=10,                              # 10-fold cross-validation
    # Silent run. 0 is the lowest valid verbosity; the previous -1 is not a valid
    # value and is rejected by scikit-learn versions that validate parameters.
    verbose=0,
    return_train_score=True,            # also record scores on the training folds
    scoring="neg_mean_squared_error",   # candidates are ranked by negated MSE
)

In [9]:
# df3 keeps only season, workingday, registered and cnt; the columns listed below are dropped.
df3 = df.drop(
    columns=[
        'instant', 'yr', 'mnth', 'holiday', 'weekday',
        'temp', 'atemp', 'hum', 'windspeed', 'casual',
        'weathersit',
    ]
)

In [10]:
# Preview the reduced frame.
df3.head()

Unnamed: 0,season,workingday,registered,cnt
0,0,0,654,985
1,0,1,670,801
2,0,1,1229,1349
3,0,1,1454,1562
4,0,1,1518,1600


In [11]:
# Recoding of the season codes: {current code: new code}.
season_cambio = {1:3, 0:2, 2:1, 3:0} 

In [12]:
# Recode season from the untouched source frame `df` (same index as df3) instead of
# from df3 itself: re-running this cell then always gives the same column, whereas
# mapping df3's own column would apply the permutation again on every re-run.
df3["season"] = df["season"].map(season_cambio)

In [13]:
# Inspect dtypes and non-null counts after recoding season.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 686 entries, 0 to 685
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      686 non-null    int64
 1   workingday  686 non-null    int64
 2   registered  686 non-null    int64
 3   cnt         686 non-null    int64
dtypes: int64(4)
memory usage: 26.8 KB


In [14]:
# Split the target (y = registered) from the predictors (X = every other column of df3).
# NOTE(review): X keeps `cnt`; presumably cnt is the total count and already contains
# `registered` — confirm it is legitimately available at prediction time.

y = df3["registered"]
X = df3.drop(columns="registered")

In [15]:
# Hold out 20% of the rows as the test split; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [16]:
# Run the decision-tree grid search on the training split.
gs.fit(x_train, y_train)

In [17]:
# Best decision tree found by the grid search.
mejor_modelo1 = gs.best_estimator_
mejor_modelo1

In [18]:
# Predict on both splits so test and train errors can be compared.
y_pred_test_dt4 = mejor_modelo1.predict(x_test)
y_pred_train_dt4 = mejor_modelo1.predict(x_train)

In [19]:
# Metrics table (test and train rows) for the tuned decision tree.
dt_results4 = metricas(y_test, y_train, y_pred_test_dt4, y_pred_train_dt4, "Decision tree IV")
dt_results4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,282.316428,149401.850423,386.525355,0.948264,test,Decision tree IV
1,251.611646,113783.760018,337.318485,0.952452,train,Decision tree IV


In [20]:
# Grid search over a random forest, reusing the same hyperparameter grid `param`.
rf = GridSearchCV(
    estimator=RandomForestRegressor(),  # model family being tuned
    param_grid=param,                   # hyperparameter combinations to evaluate
    cv=10,                              # 10-fold cross-validation
    # Silent run. 0 is the lowest valid verbosity; the previous -1 is not a valid
    # value and is rejected by scikit-learn versions that validate parameters.
    verbose=0,
    return_train_score=True,            # also record scores on the training folds
    scoring="neg_mean_squared_error",   # candidates are ranked by negated MSE
)

In [21]:
# Run the random-forest grid search on the training split.
rf.fit(x_train, y_train)

In [22]:
# Best random forest found by the grid search.
mejor_modelo2 = rf.best_estimator_
mejor_modelo2

In [23]:
# Predict on both splits with the tuned random forest
# (the `dt5` suffix continues the numbering used for the decision-tree variables).
y_pred_test_dt5 = mejor_modelo2.predict(x_test)
y_pred_train_dt5 = mejor_modelo2.predict(x_train)

In [24]:
# Metrics table (test and train rows) for the tuned random forest.
dt_results5 = metricas(y_test, y_train, y_pred_test_dt5, y_pred_train_dt5, "Random Forrest I")
dt_results5

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,284.786616,145316.311932,381.203767,0.949678,test,Random Forrest I
1,254.248679,115596.712312,339.995165,0.951694,train,Random Forrest I


In [25]:

# Stack both metrics tables for comparison; each table keeps its own 0/1 row labels,
# so the index values repeat in the result.
df_decision_results = pd.concat([dt_results4, dt_results5], axis = 0)
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,282.316428,149401.850423,386.525355,0.948264,test,Decision tree IV
1,251.611646,113783.760018,337.318485,0.952452,train,Decision tree IV
0,284.786616,145316.311932,381.203767,0.949678,test,Random Forrest I
1,254.248679,115596.712312,339.995165,0.951694,train,Random Forrest I


VAMOS A HACER LA PREDICCIÓN PARA EL DÍA DE MAÑANA

In [26]:
# Load the 2020 prediction file (first CSV column becomes the index).
# NOTE(review): df_predict is not used by any of the cells visible here.
df_predict = pd.read_csv("./datos/2020_predict.csv", index_col=0)

In [27]:
# Feature values for the day to predict (19 Oct 2020).
# NOTE(review): the stored output of the next cell shows cnt=5424, not 8678 — that output is stale.
dia_19102020 = {"season":3, "workingday":1, "cnt":8678}

In [28]:
# One-row frame with the same columns as the training features.
df_dia = pd.DataFrame(dia_19102020, index = [0])
df_dia

Unnamed: 0,season,workingday,cnt
0,3,1,5424


In [29]:
# Predict the registered count for that day with the tuned random forest.
prediccion = mejor_modelo2.predict(df_dia)
prediccion

array([4699.2377084])

In [32]:
# Same prediction steps as the four cells above, gathered into a single cell.
# NOTE(review): the stored outputs of the two runs differ (≈4699 above vs ≈6612 here)
# although the code is identical, so at least one output predates the current inputs.
df_predict = pd.read_csv("./datos/2020_predict.csv", index_col=0)  # not used below
dia_19102020 = {"season": 3, "workingday": 1, "cnt": 8678}
df_dia = pd.DataFrame(dia_19102020, index=[0])
# (A bare `df_dia` expression used to sit here; mid-cell it displayed nothing, so it was removed.)
prediccion = mejor_modelo2.predict(df_dia)
prediccion

array([6611.79336731])

Nuestro modelo predice que, para el día 19 de octubre de 2020, tenemos que prever unas 4700 bicicletas (redondeando) para los usuarios registrados.  

In [31]:
# Preview the original (unmodified) frame again.
df.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,0,1,1,1,0,2,-0.778781,-0.632082,1.232734,-0.391485,-0.77184,654,985
1,2,0,0,1,0,2,1,2,-0.67424,-0.691755,0.469324,0.731665,-1.155857,670,801
2,3,0,0,1,0,3,1,1,-1.580704,-1.692801,-1.331685,0.729762,-1.176978,1229,1349
3,4,0,0,1,0,4,1,1,-1.560855,-1.553969,-0.266251,-0.393388,-1.200019,1454,1562
4,5,0,0,1,0,5,1,1,-1.413968,-1.450455,-1.333773,-0.05454,-1.249941,1518,1600
