In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder


import warnings
warnings.filterwarnings('ignore')

from scipy.stats import skew
# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm



In [2]:
# Load the pre-processed dataset (first CSV column is the index) and discard
# the `casual_BOX` column in the same expression.
df = (
    pd.read_csv("datos/bikes_EDA_casual.csv", index_col=0)
      .drop(columns="casual_BOX")
)
df.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,casual
0,2018-01-01,winter,0,1,1,Monday,0,2,14.110847,80.5833,10.749882,331
1,2018-01-02,winter,0,1,0,Tuesday,1,2,14.902598,69.6087,16.652113,131
2,2018-01-03,winter,0,1,0,Wednesday,1,1,8.050924,43.7273,16.636703,120
3,2018-01-04,winter,0,1,0,Thursday,1,1,8.2,59.0435,10.739832,108
4,2018-01-05,winter,0,1,0,Friday,1,1,9.305237,43.6957,12.5223,82


In [3]:
# Summary statistics of the numeric columns, one row per column.
descript = df.describe().transpose()
descript

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
yr,730.0,0.5,0.500343,0.0,0.0,0.5,1.0,1.0
mnth,730.0,6.526027,3.450215,1.0,4.0,7.0,10.0,12.0
holiday,730.0,0.030137,0.171081,0.0,0.0,0.0,0.0,1.0
workingday,730.0,0.687671,0.463761,0.0,0.0,1.0,1.0,1.0
weathersit,730.0,1.394521,0.544807,1.0,1.0,1.0,2.0,3.0
temp,730.0,20.319259,7.506729,2.424346,13.811885,20.465826,26.880615,35.328347
hum,730.0,62.765175,14.237589,0.0,52.0,62.625,72.989575,97.25
windspeed,730.0,12.76362,5.195841,1.500244,9.04165,12.125325,15.625589,34.000021
casual,730.0,849.249315,686.479875,2.0,316.25,717.0,1096.5,3410.0


In [4]:
# Global median of the target; used below as the denominator of every ratio.
mediana_casual = descript.at["casual", "50%"]
mediana_casual

717.0

In [5]:
# Categorical predictors of the dataset. `casual_BOX` is not listed: it is
# removed from the frame right after loading, so indexing `df` with it would
# raise a KeyError.
lista_categoricas = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

In [6]:
# Median of `casual` per season, ordered from lowest to highest.
df_season = (
    df.groupby("season")["casual"]
      .median()
      .reset_index()
      .sort_values("casual")
)
df_season

Unnamed: 0,season,casual
3,winter,219.5
0,autumn,544.5
1,spring,867.0
2,summer,1050.5


In [7]:
# Ratio of each season's median to the global median.
# Each pair is (label, row label of that season in df_season).
for etiqueta, fila in (("winter", 3), ("autumn", 0), ("spring", 1), ("summer", 2)):
    print(f'{etiqueta}: {round(df_season.loc[fila, "casual"]/mediana_casual, 3)}')

winter: 0.306
autumn: 0.759
spring: 1.209
summer: 1.465


In [8]:
# Median of `casual` per year, ordered from lowest to highest.
df_yr = (
    df.groupby("yr")["casual"]
      .median()
      .reset_index()
      .sort_values("casual")
)
df_yr

Unnamed: 0,yr,casual
0,0,614.0
1,1,905.0


In [9]:
# Ratio of each year's median to the global median; the row label of df_yr
# coincides with the `yr` value printed as the tag.
for anio in (0, 1):
    print(f'{anio}: {round(df_yr.loc[anio, "casual"]/mediana_casual, 3)}')

0: 0.856
1: 1.262


In [10]:
# Median of `casual` per month, ordered from lowest to highest.
df_mnth = (
    df.groupby("mnth")["casual"]
      .median()
      .reset_index()
      .sort_values("casual")
)
df_mnth

Unnamed: 0,mnth,casual
0,1,126.5
1,2,202.5
11,12,292.5
10,11,449.0
2,3,508.5
9,10,771.5
3,4,832.5
4,5,887.0
5,6,968.5
8,9,970.0


In [11]:
# Ratio of each month's median to the global median. The row for the month
# printed as `mes` is read from row label `mes - 1` of df_mnth.
for mes in range(1, 13):
    print(f'{mes}: {round(df_mnth.loc[mes - 1, "casual"]/mediana_casual, 3)}')

1: 0.176
2: 0.282
3: 0.709
4: 1.161
5: 1.237
6: 1.351
7: 1.457
8: 1.55
9: 1.353
10: 1.076
11: 0.626
12: 0.408


In [12]:
# Median of `casual` for non-holidays vs. holidays, lowest first.
df_holiday = (
    df.groupby("holiday")["casual"]
      .median()
      .reset_index()
      .sort_values("casual")
)
df_holiday

Unnamed: 0,holiday,casual
0,0,717.0
1,1,942.0


In [13]:
# Ratio of each holiday group's median to the global median; the row label of
# df_holiday coincides with the `holiday` value printed as the tag.
for festivo in (0, 1):
    print(f'{festivo}: {round(df_holiday.loc[festivo, "casual"]/mediana_casual, 3)}')

0: 1.0
1: 1.314


In [14]:
# Median of `casual` per day of the week, ordered from lowest to highest.
df_weekday = (
    df.groupby("weekday")["casual"]
      .median()
      .reset_index()
      .sort_values("casual")
)
df_weekday

Unnamed: 0,weekday,casual
0,Friday,537.5
2,Saturday,610.0
4,Thursday,628.5
6,Wednesday,666.5
3,Sunday,876.5
5,Tuesday,982.0
1,Monday,1434.0


In [15]:
# Ratio of each weekday's median to the global median.
# Rows are looked up by weekday name instead of by a hard-coded row label, so
# a label can never be paired with another day's value, and the stray tab that
# was embedded in the "Sunday" tag is gone.
ratio_weekday = df_weekday.set_index("weekday")["casual"] / mediana_casual
for dia in ("Monday", "Tuesday", "Sunday", "Wednesday", "Saturday", "Friday", "Thursday"):
    print(f'{dia}: {round(ratio_weekday[dia], 3)}')

Monday: 2.0
Tuesday: 1.37
Sunday	: 1.222
Wednesday: 0.93
Saturday: 0.851
Friday: 0.75
Thursday: 0.877


In [16]:
# Median of `casual` per weather situation code, lowest first.
df_weathersit = (
    df.groupby("weathersit")["casual"]
      .median()
      .reset_index()
      .sort_values("casual")
)
df_weathersit

Unnamed: 0,weathersit,casual
2,3,126.0
1,2,535.0
0,1,829.0


In [17]:
# Ratio of each weather code's median to the global median. The row for the
# code printed as `situacion` is read from row label `situacion - 1`.
for situacion in (3, 2, 1):
    print(f'{situacion}: {round(df_weathersit.loc[situacion - 1, "casual"]/mediana_casual, 3)}')

3: 0.176
2: 0.746
1: 1.156


In [18]:
# Median of `casual` for non-working vs. working days, lowest first.
df_workingday = (
    df.groupby("workingday")["casual"]
      .median()
      .reset_index()
      .sort_values("casual")
)
df_workingday

Unnamed: 0,workingday,casual
1,1,717.0
0,0,722.0


In [19]:
# Ratio of each working-day group's median to the global median; the row label
# of df_workingday coincides with the `workingday` value printed as the tag.
for laborable in (1, 0):
    print(f'{laborable}: {round(df_workingday.loc[laborable, "casual"]/mediana_casual, 3)}')

1: 1.0
0: 1.007


In [20]:
# Target-ratio encoding: each category is replaced by the ratio between the
# median of `casual` inside that category and the global median
# (`mediana_casual`), rounded to three decimals.
#
# NOTE(review): the per-category medians are computed from `casual` over the
# whole frame, and the train/test split happens afterwards, so held-out rows
# contribute to their own features. Confirm this leakage is acceptable or
# derive the weights from the training rows only.

# NOTE(review): season is the only feature with hand-typed weights, and spring
# and summer share 1.3 although their computed ratios differ (1.209 vs 1.465).
# The values are kept exactly as they were; confirm the grouping is intended.
df["season_encoding"] = df["season"].map(
    {"winter": 0.3, "autumn": 0.8, "spring": 1.3, "summer": 1.3}
)

# For the remaining features the weight is looked up by the category value
# itself rather than by the positional row label of the grouped frame, so a
# missing or reordered category cannot silently shift the mapping.
# Dict order fixes the order in which the new columns are appended.
medianas_por_columna = {
    "yr": df_yr,
    "mnth": df_mnth,
    "holiday": df_holiday,
    "weekday": df_weekday,
    "weathersit": df_weathersit,
}
for columna, medianas in medianas_por_columna.items():
    pesos = medianas.set_index(columna)["casual"].div(mediana_casual).round(3)
    df[f"{columna}_encoding"] = df[columna].map(pesos)

# `workingday` gets no ratio encoding here; it is one-hot encoded separately.

In [21]:
# One-hot encode `workingday` and append the resulting indicator columns.
oh = OneHotEncoder()
transformados_wd = oh.fit_transform(df[["workingday"]])
oh_df = pd.DataFrame(
    transformados_wd.toarray(),
    columns=oh.get_feature_names_out(),
    # Reuse df's own row labels: concat(axis=1) aligns on the index, so a
    # default RangeIndex here would produce NaN-padded, misaligned rows
    # whenever df is not labelled 0..n-1.
    index=df.index,
)
df = pd.concat([df, oh_df], axis=1)

In [22]:
# Remove the date and the raw categorical columns, which now have encoded
# counterparts in the frame.
df = df.drop(
    columns=[
        'dteday', 'season', 'yr', 'mnth',
        'holiday', 'weekday', 'workingday', 'weathersit',
    ]
)

In [23]:
# Target vector and predictor matrix (every column except the target).
y = df["casual"]
X = df.drop(columns="casual")

In [24]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

In [25]:
# Baseline: a decision tree with default (unconstrained) hyper-parameters.
# `fit` returns the estimator itself, so it can be bound in one expression.
arbol = DecisionTreeRegressor(random_state=2).fit(x_train, y_train)
arbol

In [26]:
# Square root of the number of predictors, as a reference value for the
# `max_features` grids. NOTE(review): the variable itself does not appear to
# be used by later cells.
max_features = np.sqrt(x_train.shape[1])
max_features

3.3166247903554

In [27]:
# Depth reached by the unconstrained tree; reference for the `max_depth` grid.
print(arbol.get_depth())

22


In [28]:
# Baseline-tree predictions on the test and train partitions.
y_pred_test_dt, y_pred_train_dt = (
    arbol.predict(particion) for particion in (x_test, x_train)
)

In [29]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row summary of regression metrics for one model.

    Parameters
    ----------
    y_test, y_train : true target values of the test and train partitions.
    y_test_pred, y_train_pred : predictions for the same partitions.
    tipo_modelo : label stored in the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``;
        the first row is the test partition, the second the train partition.
    """
    particiones = (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    )
    filas = []
    for nombre_set, reales, predichos in particiones:
        mse = mean_squared_error(reales, predichos)
        filas.append(
            {
                "MAE": mean_absolute_error(reales, predichos),
                "MSE": mse,
                "RMSE": np.sqrt(mse),
                "R2": r2_score(reales, predichos),
                "set": nombre_set,
            }
        )
    resumen = pd.DataFrame(filas)
    resumen["modelo"] = tipo_modelo
    return resumen

In [30]:
# Metrics of the baseline decision tree.
dt_results1 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt,
    y_train_pred=y_pred_train_dt,
    tipo_modelo="Decision Tree I",
)
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,277.041096,191976.205479,438.150894,0.55168,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I


In [31]:
# Search space for the first decision-tree grid search.
# NOTE(review): `random_state` is searched like a hyper-parameter, which
# selects the luckiest seed rather than a better model; consider fixing one.
param = {
    "max_depth": list(range(5, 12)),
    "max_features": list(range(2, 6)),
    "min_samples_split": [20, 30, 50],
    "min_samples_leaf": [20, 30, 50],
    "random_state": [*range(11), 23],
}

In [32]:
# Exhaustive 10-fold cross-validated search over `param`, scored with the
# negated mean squared error (higher is better).
gs = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid=param,
    cv=10,
    # 0 = silent. The previous value (-1) is outside the documented
    # non-negative range and is rejected by the parameter validation of
    # recent scikit-learn releases when `fit` is called.
    verbose=0,
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [33]:
# Run the first grid search on the training partition.
gs.fit(X=x_train, y=y_train)

In [34]:
# Estimator selected by the first grid search (`best_estimator_` of `gs`);
# leaving it as the last expression displays its parameters.
mejor_modelo = gs.best_estimator_
mejor_modelo

In [35]:
# Predictions of the first tuned tree on the test and train partitions.
y_pred_test_dt2, y_pred_train_dt2 = (
    mejor_modelo.predict(particion) for particion in (x_test, x_train)
)

In [36]:
# Metrics of the first tuned decision tree.
dt_results2 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt2,
    y_train_pred=y_pred_train_dt2,
    tipo_modelo="Decision tree II",
)
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,328.834071,258851.704084,508.774709,0.395506,test,Decision tree II
1,250.122395,134038.863895,366.113185,0.721142,train,Decision tree II


In [37]:
# Stack the metric tables of the first two trees (row-wise is the default).
df_decision_results = pd.concat([dt_results1, dt_results2])
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,277.041096,191976.205479,438.150894,0.55168,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I
0,328.834071,258851.704084,508.774709,0.395506,test,Decision tree II
1,250.122395,134038.863895,366.113185,0.721142,train,Decision tree II


In [38]:
# Search space for the second decision-tree grid search.
# NOTE(review): `random_state` is searched like a hyper-parameter, which
# selects the luckiest seed rather than a better model; consider fixing one.
param2 = {
    "max_depth": list(range(5, 12)),
    "max_features": list(range(3, 8)),
    "min_samples_split": [15, 20, 30, 50],
    "min_samples_leaf": [15, 20, 30, 50],
    "random_state": list(range(11)),
}

In [39]:
# Exhaustive 10-fold cross-validated search over `param2`, scored with the
# negated mean squared error (higher is better).
gs2 = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid=param2,
    cv=10,
    # 0 = silent. The previous value (-1) is outside the documented
    # non-negative range and is rejected by the parameter validation of
    # recent scikit-learn releases when `fit` is called.
    verbose=0,
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [40]:
# Run the second grid search on the training partition.
gs2.fit(X=x_train, y=y_train)

In [41]:
# Estimator selected by the second grid search (`best_estimator_` of `gs2`);
# leaving it as the last expression displays its parameters.
mejor_modelo2 = gs2.best_estimator_
mejor_modelo2

In [42]:
# Predictions of the second tuned tree on the test and train partitions.
y_pred_test_dt3, y_pred_train_dt3 = (
    mejor_modelo2.predict(particion) for particion in (x_test, x_train)
)

In [43]:
# Metrics of the second tuned decision tree.
dt_results3 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt3,
    y_train_pred=y_pred_train_dt3,
    tipo_modelo="Decision tree III",
)
dt_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,309.306002,251704.566714,501.701671,0.412196,test,Decision tree III
1,233.736927,128433.449877,358.376129,0.732804,train,Decision tree III


In [44]:
# Comparison table for the three decision trees. It is rebuilt from the three
# individual result frames instead of appending to itself, so re-running this
# cell does not duplicate the "Decision tree III" rows.
df_decision_results = pd.concat([dt_results1, dt_results2, dt_results3], axis=0)
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,277.041096,191976.205479,438.150894,0.55168,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I
0,328.834071,258851.704084,508.774709,0.395506,test,Decision tree II
1,250.122395,134038.863895,366.113185,0.721142,train,Decision tree II
0,309.306002,251704.566714,501.701671,0.412196,test,Decision tree III
1,233.736927,128433.449877,358.376129,0.732804,train,Decision tree III


In [45]:
# First random forest, with hand-picked hyper-parameters.
# `fit` returns the estimator itself, so it can be bound in one expression.
bosque = RandomForestRegressor(
    max_depth=6,
    max_features=5,
    min_samples_leaf=15,
    min_samples_split=15,
    random_state=1,
).fit(x_train, y_train)
bosque

In [46]:
# Predictions of the first forest on the test and train partitions.
y_pred_test_rf, y_pred_train_rf = (
    bosque.predict(particion) for particion in (x_test, x_train)
)

In [47]:
# Metrics of the first random forest.
rf_results = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf,
    y_train_pred=y_pred_train_rf,
    tipo_modelo="Random Forest",
)
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,265.018275,152958.960801,391.099681,0.642796,test,Random Forest
1,240.668147,120625.017769,347.311125,0.749049,train,Random Forest


In [52]:
# Search space for the random-forest grid search.
param_rf = {
    "n_estimators": [250, 500],
    "max_depth": list(range(5, 10)),
    "max_features": list(range(3, 7)),
    "min_samples_split": [15, 20, 50, 70],
    "min_samples_leaf": [10, 15, 20, 30],
}

In [53]:
# Exhaustive 10-fold cross-validated search over `param_rf` for a seeded
# random forest, scored with the negated mean squared error.
gs_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_grid=param_rf,
    cv=10,
    # 0 = silent. The previous value (-1) is outside the documented
    # non-negative range and is rejected by the parameter validation of
    # recent scikit-learn releases when `fit` is called.
    verbose=0,
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [54]:
# Run the random-forest grid search on the training partition.
gs_rf.fit(X=x_train, y=y_train)

In [55]:
# Forest selected by the grid search (`best_estimator_` of `gs_rf`);
# leaving it as the last expression displays its parameters.
bosque2 = gs_rf.best_estimator_
bosque2

In [56]:
# Predictions of the tuned forest on the test and train partitions.
y_pred_test_rf2, y_pred_train_rf2 = (
    bosque2.predict(particion) for particion in (x_test, x_train)
)

In [57]:
# Metrics of the tuned random forest.
dt_results_rf2 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf2,
    y_train_pred=y_pred_train_rf2,
    tipo_modelo="Random Forest II",
)
dt_results_rf2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,244.439446,142580.626173,377.598499,0.667033,test,Random Forest II
1,193.951723,83277.67458,288.578715,0.826747,train,Random Forest II


In [254]:
# Third random forest, with hand-picked hyper-parameters.
# `fit` returns the estimator itself, so it can be bound in one expression.
bosque3 = RandomForestRegressor(
    max_depth=7,
    max_features=4,
    min_samples_leaf=5,
    min_samples_split=30,
    n_estimators=500,
    random_state=0,
).fit(x_train, y_train)
bosque3

In [255]:
# Predictions of the third forest on the test and train partitions.
y_pred_test_rf3, y_pred_train_rf3 = (
    bosque3.predict(particion) for particion in (x_test, x_train)
)

In [256]:
# Metrics of the third random forest.
dt_results_rf3 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf3,
    y_train_pred=y_pred_train_rf3,
    tipo_modelo="Random Forest III",
)
dt_results_rf3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,252.826199,141793.54323,376.554834,0.668871,test,Random Forest III
1,224.058966,106023.935239,325.613168,0.779425,train,Random Forest III
