# Modelo Inicial de Machine Learning

In [22]:
import pandas as pd
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from lazypredict.Supervised import LazyRegressor

In [3]:
# Leer los datos
file_path = '../data/interim/bike_sharing_cleaned.csv'
df = pd.read_csv(file_path)

In [6]:
# Correr el nb previo para obtener funciones necesarias
%run ./bike_sharing_ml_project.ipynb

In [7]:
# Al pasar el csv al dvc no s guardaron los tipos de datos, por lo que es necesario corregirlos
df = correct_initial_data_types(df)
df = finalize_data_types(df)


Iniciando corrección de tipos de datos...
Tipos de datos corregidos de forma semántica.

Puliendo los tipos de datos finales...
Tipos de datos finalizados.


In [8]:
df.dtypes

dteday        datetime64[ns]
season              category
yr                  category
mnth                category
hr                  category
holiday             category
weekday             category
workingday          category
weathersit          category
temp                 float64
atemp                float64
hum                  float64
windspeed            float64
casual                 int64
registered             int64
cnt                    int64
dtype: object

In [20]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [14]:
# Definir columnas categoricas y numericas para su analisis exploratorio
cat_cols = df.select_dtypes(include=["category"]).columns.tolist()
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
dt_cols = df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()


In [27]:
target = 'cnt'
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [28]:
reg = LazyRegressor(ignore_warnings=True, random_state=42, verbose=0)
models_df, predictions = reg.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 824
[LightGBM] [Info] Number of data points in the train set: 12026, number of used features: 6
[LightGBM] [Info] Start training from score 189.626060


In [None]:
y_true = np.asarray(y_test).reshape(-1)
n_test = y_true.shape[0]

rows = []
skipped = []

for model_name, y_pred in predictions.items():
    y_hat = np.asarray(y_pred).reshape(-1)
    if y_hat.shape[0] != n_test:
        skipped.append((model_name, f"len={y_hat.shape[0]} vs {n_test}"))
        continue
    if not np.all(np.isfinite(y_hat)) or not np.all(np.isfinite(y_true)):
        skipped.append((model_name, "non-finite"))
        continue

    rows.append({
        "Model": model_name,
        "RMSE": rmse(y_true, y_hat),
        "MAE": mean_absolute_error(y_true, y_hat),
        "R2": r2_score(y_true, y_hat),
    })

metrics_df = pd.DataFrame(rows)

In [30]:
display(metrics_df)