In [79]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import numpy as np

# Preprocessing

In [80]:
passerbys = pd.read_csv(r"C:\Users\Lenovo\Documents\passerbys.csv")
weather = pd.read_csv(r"C:\Users\Lenovo\Documents\weather.csv")

weather.isna().sum()
passerbys.isna().sum()

weather['DATE'] = pd.to_datetime(weather['DATE'])
weather = weather.set_index('DATE', drop=False)
passerbys['Date'] = pd.to_datetime(passerbys['Date'])
passerbys = passerbys.set_index('Date', drop=False)

passerbys = passerbys.resample('D').sum()  # аггрецаия данных

passerbys['d'] = passerbys.index.dayofweek

df = pd.merge(passerbys, weather, left_index=True, right_index=True)  # датасет со всеми фичами

df = pd.merge(df, pd.get_dummies(df['d'], prefix='d'), left_index=True, right_index=True).dropna()

cal = calendar()
tmp = cal.holidays(start=df['DATE'].min(), end=df['DATE'].max())
df['hd'] = df['DATE'].isin(tmp).astype(int)

df['t'] = range(1,len(df)+1) # trend

#X1 = df[['TAVG', 'PRCP', 'd_0', 'd_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6']]
#X2 = df[['TAVG', 'PRCP', 'd_0', 'd_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'hd']]
X3 = df[['TAVG', 'PRCP', 'd_0', 'd_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'hd', 't']]
y1 = df['Fremont Bridge East Sidewalk']
y2 = df['Fremont Bridge West Sidewalk']

In [81]:
X3

Unnamed: 0,TAVG,PRCP,d_0,d_1,d_2,d_3,d_4,d_5,d_6,hd,t
2013-04-01,12.8,0.0,1,0,0,0,0,0,0,0,1
2013-04-02,11.5,0.0,0,1,0,0,0,0,0,0,2
2013-04-03,11.7,0.0,0,0,1,0,0,0,0,0,3
2013-04-04,12.0,8.4,0,0,0,1,0,0,0,0,4
2013-04-05,12.2,18.5,0,0,0,0,1,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...
2019-06-24,14.7,0.0,1,0,0,0,0,0,0,0,2276
2019-06-25,16.6,0.0,0,1,0,0,0,0,0,0,2277
2019-06-28,16.1,0.0,0,0,0,0,1,0,0,0,2278
2019-06-29,18.0,0.0,0,0,0,0,0,1,0,0,2279


# Model

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X3, y1, test_size=0.3, random_state=23)

In [83]:
ridge = Ridge()
lasso = Lasso()

In [84]:
params = {
    'alpha': np.arange(0.1, 5,0.1),
    'normalize':[True]
}

In [85]:
def regression_results(y_true, y_pred):
    # Regression metrics
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred)
    mse=metrics.mean_squared_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    return {'r2':round(r2,4),
    'MAE':round(mean_absolute_error,4),
    'MSE': round(mse,4),
    'RMSE':round(np.sqrt(mse),4)}

In [86]:
results = []
grid = GridSearchCV(ridge, params, scoring='neg_mean_squared_error', cv=5,n_jobs=-1,verbose=1)
grid.fit(X_train, y_train)
results.append(regression_results(y_test,grid.predict(X_test)))
model = grid.best_estimator_.fit(X_train, y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

In [87]:
grid.best_params_

{'alpha': 0.1, 'normalize': True}

In [88]:
model.coef_

array([ 5.04521097e+01, -2.47739921e+01,  2.17667949e+02,  2.65427447e+02,
        2.54723195e+02,  1.84819861e+02,  4.03070960e+01, -4.50096336e+02,
       -5.03719937e+02, -5.02866291e+02, -1.21785105e-01])

In [89]:
grid = GridSearchCV(lasso, params, scoring='neg_mean_squared_error', cv=5,n_jobs=-1,verbose=1)
grid.fit(X_train, y_train)
results.append(regression_results(y_test,grid.predict(X_test)))
model = grid.best_estimator_.fit(X_train, y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set para

In [90]:
model.coef_

array([ 5.44461070e+01, -2.59933810e+01,  2.43824130e+01,  7.29045058e+01,
        6.31296386e+01,  0.00000000e+00, -1.48901308e+02, -6.81919977e+02,
       -7.40095433e+02, -5.34465948e+02, -1.25348555e-01])

In [91]:
grid.best_params_

{'alpha': 0.1, 'normalize': True}

In [92]:
regr = LinearRegression(normalize=True)
regr.fit(X_train, y_train)
results.append(regression_results(y_test,regr.predict(X_test)))


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [93]:
pd.DataFrame(results)

Unnamed: 0,r2,MAE,MSE,RMSE
0,0.7481,228.0746,88877.8091,298.1238
1,0.7512,226.0883,87788.0612,296.2905
2,0.7503,226.2311,88103.4025,296.8222


# Вывод
Модели с регуляризацией показали примерно те же, что и ols. Однако на практике регуляризация помогает справиться с ошибкой переобучения.