In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [None]:
dataset = pd.read_csv('hour.csv')

In [None]:
dataset.head()

In [None]:
dataset.shape

In [None]:
dataset.info()

In [None]:
dataset['dteday'] = pd.to_datetime(dataset['dteday'])


In [None]:
dataset.isnull().sum()


In [None]:
dataset.drop(['casual', 'registered'], axis=1, inplace=True)


In [None]:
dataset['cnt'].describe()


In [None]:
sns.histplot(dataset['cnt'], bins=50, kde=True)
plt.show()


In [None]:
dataset.groupby('hr')['cnt'].mean().plot(figsize=(8,4))
plt.show()


In [None]:
sns.boxplot(x='weathersit', y='cnt', data=dataset)
plt.show()


In [None]:
sns.boxplot(x='workingday', y='cnt', data=dataset)
plt.show()


In [None]:
dataset['year'] = dataset['dteday'].dt.year
dataset['month'] = dataset['dteday'].dt.month
dataset['day'] = dataset['dteday'].dt.day
dataset['weekday'] = dataset['dteday'].dt.weekday


In [None]:
dataset.drop('dteday', axis=1, inplace=True)


In [None]:
dataset['hr_sin'] = np.sin(2*np.pi*dataset['hr']/24)
dataset['hr_cos'] = np.cos(2*np.pi*dataset['hr']/24)
dataset.drop('hr', axis=1, inplace=True)


In [None]:
dataset[['temp','atemp']].corr()


In [None]:
X = dataset.drop('cnt', axis=1)
y = dataset['cnt']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)


In [None]:
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    random_state=42
)
rf.fit(X_train, y_train)


In [None]:
pd.Series(rf.feature_importances_, index=X.columns)\
  .sort_values(ascending=False)


In [None]:

dataset[['temp', 'atemp']].corr()


     

In [None]:
dataset.drop('atemp', axis=1, inplace=True)


In [None]:
dataset['bad_weather_workday'] = (
    (dataset['weathersit'] > 2) & (dataset['workingday'] == 1)
).astype(int)


In [None]:
dataset['temp_season_interaction'] = dataset['temp'] * dataset['season']


In [None]:
sns.histplot(dataset['cnt'], kde=True)
plt.show()


In [None]:
dataset['cnt_log'] = np.log1p(dataset['cnt'])


In [None]:
dataset.info()
dataset.head()


In [None]:
# `cnt_log` is log1p(cnt), a one-to-one transform of the target. Leaving it in X
# lets every model recover `cnt` almost exactly (target leakage) and makes all
# downstream metrics meaningless, so it is dropped together with `cnt`.
X = dataset.drop(columns=['cnt', 'cnt_log'])
y = dataset['cnt']


In [None]:
X.columns


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=False
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)


In [None]:
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

r2_lr = r2_score(y_test, y_pred_lr)

rmse_lr, r2_lr


In [None]:
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)


In [None]:
print([name for name in globals() if 'rf' in name])


In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)


In [None]:
y_pred_rf = rf.predict(X_test)


In [None]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)

gb.fit(X_train, y_train)


In [None]:
y_pred_gb = gb.predict(X_test)


In [None]:
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, y_pred_gb)

rmse_gb, r2_gb


In [None]:
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_gb))


In [None]:
y_pred_rf = rf.predict(X_test)


In [None]:
print("LR:", 'y_pred_lr' in globals())
print("RF:", 'y_pred_rf' in globals())
print("GB:", 'y_pred_gb' in globals())


In [None]:
# Linear Regression
y_pred_lr = lr.predict(X_test)

# Random Forest
y_pred_rf = rf.predict(X_test)

# Gradient Boosting
y_pred_gb = gb.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

results = []

for name, y_pred in [
    ('Linear Regression', y_pred_lr),
    ('Random Forest', y_pred_rf),
    ('Gradient Boosting', y_pred_gb)
]:
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results.append({
        'Model': name,
        'RMSE': rmse,
        'R2': r2
    })

results_df = pd.DataFrame(results)
results_df


In [None]:
results_df.sort_values('RMSE')


In [None]:
print(y_pred_rf[:10])



In [None]:
X.columns


In [None]:
pd.DataFrame({
    'Actual': y_test.values[:10],
    'Predicted': y_pred_gb[:10]
})


In [None]:
pip install shap


In [None]:
import shap

In [None]:
X_train.shape, X_test.shape


In [None]:
explainer = shap.TreeExplainer(gb)
shap_values = explainer.shap_values(X_test)


In [None]:
shap.summary_plot(shap_values, X_test)


In [None]:
shap.dependence_plot('temp', shap_values, X_test)


In [None]:
shap.dependence_plot('weathersit', shap_values, X_test)
shap.dependence_plot('workingday', shap_values, X_test)


In [None]:
shap.initjs()


In [None]:
shap_values.shape, X_test.shape


In [None]:
residuals = y_test - y_pred_gb

plt.figure(figsize=(8,5))
plt.scatter(y_pred_gb, residuals, alpha=0.4)
plt.axhline(0)
plt.xlabel("Predicted Demand")
plt.ylabel("Residuals")
plt.title("Residual Analysis")
plt.show()


In [None]:
mape = np.mean(np.abs((y_test - y_pred_gb) / y_test)) * 100
mape


In [None]:
dataset.index.is_monotonic_increasing


In [None]:
for lag in [1, 2, 3, 24, 48, 168]:  # 1h, 1d, 1 week
    dataset[f'cnt_lag_{lag}'] = dataset['cnt'].shift(lag)


In [None]:
dataset['rolling_mean_24'] = dataset['cnt'].shift(1).rolling(24).mean()
dataset['rolling_std_24'] = dataset['cnt'].shift(1).rolling(24).std()


In [None]:
dataset_ts = dataset.dropna()


In [None]:
cd path/to/Bike-Demand-Prediction


In [None]:
pip install nbstripout

In [None]:
nbstripout --install