In [38]:
# Manejo de datos
import numpy as np
import seaborn as sns
import datetime as dt
# Gráficas
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
# Modelos
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from keras import layers, models, optimizers
from joblib import dump, load
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
# Extras
from tqdm import tqdm

In [None]:
df_dt = tabla[['dteday']].copy()
df_dt['dteday'] = (df_dt['dteday'].map(dt.datetime.toordinal) - 734138)/365
X_dt = np.array(df_dt.values)
x_tr, x_te, y_tr, y_te = train_test_split(X_dt, Y, test_size=0.3)
# regresion_dt = LinearRegression()
regresion_dt = make_pipeline(PolynomialFeatures(degree=5, include_bias=False), LinearRegression())
regresion_dt.fit(x_tr, y_tr)
y_dt = regresion_dt.predict(X_dt)
print('RMSE:', mean_squared_error(Y, y_dt, squared=False))
y_dt_te = regresion_dt.predict(x_te)
print('RMSE:', mean_squared_error(y_te, y_dt_te, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:6000], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y_dt[5000:6000], mode='markers', name='model'))
fig.show()

RMSE: 141.78768196541597
RMSE: 142.16344366055765


In [2]:
tabla = pd.read_excel('bike_train.xlsx')

In [3]:
Y = np.array(tabla['cnt'].values)

# Entrenaremos modelos por separado para cada característica a utilizar

In [4]:
def weekday_to_isweekend(num):
    if num == 0 or num == 6:
        return 1
    else:
        return 0

## a) weekend

In [5]:
df_we = tabla[['weekday']].copy()
df_we['weekend'] = df_we['weekday'].map(weekday_to_isweekend)
df_we.drop(columns=['weekday'],inplace=True)
X_we = np.array(df_we.values)
x_tr, x_te, y_tr, y_te = train_test_split(X_we, Y, test_size=0.3)
tree_we = DecisionTreeRegressor(criterion='squared_error', max_depth = 1)
tree_we.fit(x_tr, y_tr)
y_we = tree_we.predict(X_we)
print('RMSE:', mean_squared_error(Y, y_we, squared=False))
y_we_te = tree_we.predict(x_te)
print('RMSE:', mean_squared_error(y_te, y_we_te, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[:100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y_we[:100], mode='markers', name='model'))
fig.show()

RMSE: 151.37336818123143
RMSE: 150.03335605406397


## b) hr

In [6]:
df_hr = tabla[['hr']].copy()
X_hr = np.array(df_hr.values)
x_tr, x_te, y_tr, y_te = train_test_split(X_hr, Y, test_size=0.3)
tree_hr = DecisionTreeRegressor(criterion='squared_error', max_depth = 10)
tree_hr.fit(x_tr, y_tr)
y_hr = tree_hr.predict(X_hr)
print('RMSE:', mean_squared_error(Y, y_hr, squared=False))
y_hr_te = tree_hr.predict(x_te)
print('RMSE:', mean_squared_error(y_te, y_hr_te, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y_hr[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 106.15573821505443
RMSE: 107.90481281525221


## c) weathersit

In [7]:
df_ws = tabla[['weathersit']].copy()
X_ws = np.array(df_ws.values)
x_tr, x_te, y_tr, y_te = train_test_split(X_ws, Y, test_size=0.3)
tree_ws = DecisionTreeRegressor(criterion='squared_error', max_depth = 2)
tree_ws.fit(x_tr, y_tr)
y_ws = tree_ws.predict(X_ws)
print('RMSE:', mean_squared_error(Y, y_ws, squared=False))
y_ws_te = tree_ws.predict(x_te)
print('RMSE:', mean_squared_error(y_te, y_ws_te, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y_ws[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 149.59119903827755
RMSE: 146.20077199555718


## d) mnth

In [8]:
df_mnth = tabla[['mnth']].copy()
X_mnth = np.array(df_mnth.values)
x_tr, x_te, y_tr, y_te = train_test_split(X_mnth, Y, test_size=0.3)
tree_mnth = DecisionTreeRegressor(criterion='squared_error', max_depth = 3)
tree_mnth.fit(x_tr, y_tr)
y_mnth = tree_mnth.predict(X_mnth)
print('RMSE:', mean_squared_error(Y, y_mnth, squared=False))
y_mnth_te = tree_mnth.predict(x_te)
print('RMSE:', mean_squared_error(y_te, y_mnth_te, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y_mnth[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 146.8013400978097
RMSE: 150.49121305317834


## e) atemp

In [9]:
df_atemp = tabla[['atemp']].copy()
X_atemp = np.array(df_atemp.values)
x_tr, x_te, y_tr, y_te = train_test_split(X_atemp, Y, test_size=0.3)
tree_atemp = DecisionTreeRegressor(criterion='squared_error', max_depth = 4)
tree_atemp.fit(x_tr, y_tr)
y_atemp = tree_atemp.predict(X_atemp)
print('RMSE:', mean_squared_error(Y, y_atemp, squared=False))
y_atemp_te = tree_atemp.predict(x_te)
print('RMSE:', mean_squared_error(y_te, y_atemp_te, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y_atemp[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 137.4423142510928
RMSE: 137.55903147440762


## dtetime

In [10]:
df_dt = tabla[['dteday']].copy()
df_dt['dteday'] = (df_dt['dteday'].map(dt.datetime.toordinal) - 734138)/365
X_dt = np.array(df_dt.values)
x_tr, x_te, y_tr, y_te = train_test_split(X_dt, Y, test_size=0.3)
regresion_dt = LinearRegression()
regresion_dt.fit(x_tr, y_tr)
y_dt = regresion_dt.predict(X_dt)
print('RMSE:', mean_squared_error(Y, y_dt, squared=False))
y_dt_te = regresion_dt.predict(x_te)
print('RMSE:', mean_squared_error(y_te, y_dt_te, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:6000], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y_dt[5000:6000], mode='markers', name='model'))
fig.show()

RMSE: 147.35179570478607
RMSE: 146.70180445484033


----

# Predecimos la cantidad de acuerdo a cada una de las características.
El resultado será una predicción por columna

In [12]:
# Creamos escaladores para cada uno de las cantidades a predecir 
scaler_we = StandardScaler()
scaler_hr = StandardScaler()
scaler_ws = StandardScaler()
scaler_mnth = StandardScaler()
scaler_atemp= StandardScaler()
scaler_dt= StandardScaler()
# Predecimos cantidad por cada una de las características y escalamos
scaled_we = scaler_we.fit_transform(y_we.reshape(-1,1))
scaled_hr = scaler_hr.fit_transform(y_hr.reshape(-1,1))
scaled_ws = scaler_ws.fit_transform(y_ws.reshape(-1,1))
scaled_mnth = scaler_mnth.fit_transform(y_mnth.reshape(-1,1))
scaled_atemp = scaler_atemp.fit_transform(y_atemp.reshape(-1,1))
scaled_dt = scaler_dt.fit_transform(y_dt.reshape(-1,1))
# Cremos df y matrix
s_we = pd.Series(scaled_we.reshape(-1), name='pred_we')
s_hr = pd.Series(scaled_hr.reshape(-1), name='pred_hr')
s_ws = pd.Series(scaled_ws.reshape(-1), name='pred_ws')
s_mnth = pd.Series(scaled_mnth.reshape(-1), name='pred_mnth')
s_atemp = pd.Series(scaled_atemp.reshape(-1), name='pred_atemp')
s_dt = pd.Series(scaled_dt.reshape(-1), name='pred_dt')
Final_df = pd.concat([s_we, s_hr, s_ws, s_mnth, s_atemp, s_dt], axis=1)
X = np.array(Final_df.values)

In [13]:
Final_df.head()

Unnamed: 0,pred_we,pred_hr,pred_ws,pred_mnth,pred_atemp,pred_dt
0,-1.571599,-1.055827,0.586632,-1.738879,-1.021316,-1.746591
1,-1.571599,-1.208463,0.586632,-1.738879,-1.021316,-1.746591
2,-1.571599,-1.276833,0.586632,-1.738879,-1.021316,-1.746591
3,-1.571599,-1.373932,0.586632,-1.738879,-1.021316,-1.746591
4,-1.571599,-1.422732,0.586632,-1.738879,-1.021316,-1.746591


# Usamos un modelo más para poder unir todas las anteriores

## a) Si usamos un arbol de decisión

In [36]:
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size=0.3)
tree = DecisionTreeRegressor(criterion='squared_error', max_depth = 9)
tree.fit(x_tr, y_tr)
y = tree.predict(X)
y_te_pr = tree.predict(x_te)
print('RMSE:', mean_squared_error(Y, y, squared=False))
print('RMSE:', mean_squared_error(y_te, y_te_pr, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[:100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y[:100], mode='markers', name='model'))
fig.show()

RMSE: 51.44000800731319
RMSE: 55.62106310954571


## b) Si usamos una  regresión lineal

In [21]:
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size=0.3)
# tree = DecisionTreeRegressor(criterion='squared_error', max_depth = 10)
lin_reg = LinearRegression()
lin_reg.fit(x_tr, y_tr)
y = lin_reg.predict(X)
y_te_pr = lin_reg.predict(x_te)
print('RMSE:', mean_squared_error(Y, y, squared=False))
print('RMSE:', mean_squared_error(y_te, y_te_pr, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 86.66431522371558
RMSE: 86.98650940307714


## c) Polinomial

In [30]:
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size=0.3)
poly = make_pipeline(PolynomialFeatures(degree=3, include_bias=False), LinearRegression())
poly.fit(x_tr, y_tr)
y = poly.predict(X)
y_te_pr = poly.predict(x_te)
print('RMSE:', mean_squared_error(Y, y, squared=False))
print('RMSE:', mean_squared_error(y_te, y_te_pr, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 67.23706547326441
RMSE: 68.52962635955886


La mejor opción de polinomial fue en grado 3

## d) Si usamos Forest

In [56]:
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size=0.3)
forest = RandomForestRegressor(max_depth=9)
forest.fit(x_tr, y_tr)
y = forest.predict(X)
y_te_pr = forest.predict(x_te)
print('RMSE:', mean_squared_error(Y, y, squared=False))
print('RMSE:', mean_squared_error(y_te, y_te_pr, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 44.81122793430774
RMSE: 47.23003303293814


Forest se desempeñó mejor con deep = 9

## e) SVG

In [41]:
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size=0.3)
svg = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.01)
svg.fit(x_tr, y_tr)
y = svg.predict(X)
y_te_pr = svg.predict(x_te)
print('RMSE:', mean_squared_error(Y, y, squared=False))
print('RMSE:', mean_squared_error(y_te, y_te_pr, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y.reshape(-1)[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 66.49471055801061
RMSE: 65.8498778092113


## f) Red neuronal

In [44]:
X.shape

(11999, 6)

In [54]:
red = models.Sequential()
red.add(layers.Dense(10, activation='relu', input_shape=(6,)))
red.add(layers.Dense(10, activation='relu'))
red.add(layers.Dense(10, activation='relu'))
red.add(layers.Dense(1,  activation='relu'))
red.compile(optimizer = optimizers.Adam(0.01),
            loss = 'mean_squared_logarithmic_error',
            metrics = 'mse')
x_tr, x_te, y_tr, y_te = train_test_split(X, Y.reshape(-1,1), test_size=0.3)
hist = red.fit(x_tr, y_tr,
               batch_size=x_tr.shape[0],
               epochs=100,
               verbose=0,
               validation_data=(x_te, y_te))
y = red.predict(X, verbose=0)
y_te_pr = red.predict(x_te, verbose=0)
print('RMSE:', mean_squared_error(Y, y.reshape(-1), squared=False))
print('RMSE:', mean_squared_error(y_te, y_te_pr, squared=False))
fig = go.Figure()
fig.add_trace(go.Scatter(y= Y[5000:5100], mode='lines+markers', name='test'))
fig.add_trace(go.Scatter(y= y.reshape(-1)[5000:5100], mode='markers', name='model'))
fig.show()

RMSE: 111.77291185775584
RMSE: 109.6964042771024


In [55]:
px.line(np.array(hist.history['loss']))

# Conclusión:

Tuvo un mejor desempeño **forest**

----

### Guardamos los scaladores y modelos a utilizar

In [58]:
# Escaladores
dump(scaler_we, 'models_2/scaler_we.bin', compress=True)
dump(scaler_hr, 'models_2/scaler_hr.bin', compress=True)
dump(scaler_ws, 'models_2/scaler_ws.bin', compress=True)
dump(scaler_mnth, 'models_2/scaler_mnth.bin', compress=True)
dump(scaler_atemp, 'models_2/scaler_atemp.bin', compress=True)
dump(scaler_dt, 'models_2/scaler_dt.bin', compress=True)
# Modelos
dump(tree_we, 'models_2/tree_we.pkl')
dump(tree_hr, 'models_2/tree_hr.pkl')
dump(tree_ws, 'models_2/tree_ws.pkl')
dump(tree_mnth, 'models_2/tree_mnth.pkl')
dump(tree_atemp, 'models_2/tree_atemp.pkl')
dump(regresion_dt, 'models_2/regresion_dt.pkl')
dump(forest, 'models_2/forest.pkl')

['models_2/forest.pkl']

## Creamos función que prepare los datos

In [59]:
def prediction(tabla):
    df_we = tabla[['weekday']].copy()
    df_we['weekend'] = df_we['weekday'].map(weekday_to_isweekend)
    df_we.drop(columns=['weekday'],inplace=True)
    X_we = np.array(df_we.values)
    df_hr = tabla[['hr']].copy()
    X_hr = np.array(df_hr.values)
    df_ws = tabla[['weathersit']].copy()
    X_ws = np.array(df_ws.values)
    df_mnth = tabla[['mnth']].copy()
    X_mnth = np.array(df_mnth.values)
    df_atemp = tabla[['atemp']].copy()
    X_atemp = np.array(df_atemp.values)
    df_dt = tabla[['dteday']].copy()
    df_dt['dteday'] = (df_dt['dteday'].map(dt.datetime.toordinal) - 734138)/365
    X_dt = np.array(df_dt.values)
    tree_we = load('models_2/tree_we.pkl')
    tree_hr = load('models_2/tree_hr.pkl')
    tree_ws = load('models_2/tree_ws.pkl')
    tree_mnth = load('models_2/tree_mnth.pkl')
    tree_atemp = load('models_2/tree_atemp.pkl')
    tree_dt = load('models_2/regresion_dt.pkl')   
    y_we = tree_we.predict(X_we)
    y_hr = tree_hr.predict(X_hr)
    y_ws = tree_ws.predict(X_ws)
    y_mnth = tree_mnth.predict(X_mnth)
    y_atemp = tree_atemp.predict(X_atemp)
    y_dt = tree_dt.predict(X_dt)
    scaler_we = load('models_2/scaler_we.bin')
    scaler_hr = load('models_2/scaler_hr.bin')
    scaler_ws = load('models_2/scaler_ws.bin')
    scaler_mnth = load('models_2/scaler_mnth.bin')
    scaler_atemp = load('models_2/scaler_atemp.bin')
    scaler_dt = load('models_2/scaler_dt.bin')
    scaled_we = scaler_we.transform(y_we.reshape(-1,1))
    scaled_hr = scaler_hr.transform(y_hr.reshape(-1,1))
    scaled_ws = scaler_ws.transform(y_ws.reshape(-1,1))
    scaled_mnth = scaler_mnth.transform(y_mnth.reshape(-1,1))
    scaled_atemp = scaler_atemp.transform(y_atemp.reshape(-1,1))
    scaled_dt = scaler_dt.transform(y_dt.reshape(-1,1))
    s_we = pd.Series(scaled_we.reshape(-1), name='we')
    s_hr = pd.Series(scaled_hr.reshape(-1), name='hr')
    s_ws = pd.Series(scaled_ws.reshape(-1), name='ws')
    s_mnth = pd.Series(scaled_mnth.reshape(-1), name='mnth')
    s_atemp = pd.Series(scaled_atemp.reshape(-1), name='atemp')
    s_dt = pd.Series(scaled_dt.reshape(-1), name='dt')
    x =  np.array(pd.concat([s_we, s_hr, s_ws, s_mnth, s_atemp, s_dt], axis=1).values)
    model = load('models_2/forest.pkl')
    return model.predict(X)

# Leemos y hacemos las predicciones

In [60]:
test = pd.read_excel('bike_test.xlsx')
Y_prediction = prediction(test)

In [61]:
df_pred = pd.DataFrame(Y_prediction.reshape(-1), columns=['pred'])
df_pred.to_csv('models_2/FcoCervantesRdz.csv', index=False)