# Predicting Bike Sharing Demand

In [None]:
from statistics import linear_regression

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import mlflow
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
# from sklearn.metrics import mean_squared_error
# from sklearn.compose import ColumnTransformer
# from sk


from matplotlib.pyplot import ylabel
# from prompt_toolkit.shortcuts import set_title

# Data Exploration<br>

## Import Data

In [None]:
# Load the hourly records; the companion day.csv file is intentionally not loaded.
df_hour = pd.read_csv(filepath_or_buffer='bike+sharing+dataset/hour.csv')


## Overview of the Data

In [None]:
# Preview the first five rows to get a feel for the available columns.
df_hour.head(n=5)

## Look for missing Data / compare entries with Non-Null Count

In [None]:
# Prints dtype and non-null count per column; a non-null count below the
# number of entries would indicate missing values in that column.
df_hour.info()

## Convert `dteday` to datetime

In [None]:
# Parse the `dteday` column into pandas datetimes (needed for the date axis below).
df_hour = df_hour.assign(dteday=pd.to_datetime(df_hour['dteday']))

## Statistics

In [None]:
# Summary statistics of the numeric columns, rendered with a colour gradient.
numeric_summary = df_hour.describe(include='number')
numeric_summary.style.background_gradient(cmap='Pastel1')

## Time-based analysis

In [None]:
# Compare registered and casual ridership along two time axes:
# left panel by calendar date, right panel by hour.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(15, 5))

# (axis, x column, x label, title) - each panel gets its own title so the
# two plots can be told apart when the notebook is skimmed.
panels = [
    (ax0, 'dteday', 'Date', 'Registered vs Casual over Time'),
    (ax1, 'hr', 'Hour', 'Registered vs Casual by Hour'),
]

for ax, x_col, x_label, title in panels:
    for user_col in ('registered', 'casual'):
        sns.lineplot(data=df_hour, x=x_col, y=user_col, label=user_col.capitalize(), ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Users')
    ax.set_title(title)

fig.tight_layout()
# Explicit show() keeps the cell from echoing the Text(...) repr of the last call.
plt.show()


There are more registered users than casual users. Over one year, the number of registered users also rises more than the number of casual users.

In [None]:
# Mean users per hour, broken down by each calendar / weather attribute.
time_list = ['yr', 'season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
name_list = ['Year', 'Season', 'Month', 'Holiday', 'Weekday', 'Workingday', 'Weathersit']

fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for ax, col, name in zip(axes, time_list, name_list):
    # Explicit colours: without them both overlaid series are drawn with the
    # same default colours and cannot be distinguished.
    sns.barplot(data=df_hour, x=col, y='registered', ax=ax, color='tab:blue', label='Registered')
    sns.barplot(data=df_hour, x=col, y='casual', ax=ax, color='tab:orange', label='Casual')
    ax.set_xlabel(name)
    ax.set_ylabel('Users per Hour')
    ax.set_title(f'Users by {name}')
    # The labels above are only visible once a legend is drawn.
    ax.legend()

# The grid has eight slots but only seven attributes: hide the unused panel(s).
for ax in axes[len(time_list):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()


Year: Bigger growth in registered users.<br>
Season: Season 3 is the strongest; season 1 is the weakest.<br>
Month: The first month is the weakest; the ninth month is the strongest.<br>
Holiday: Fewer registered users on holidays but more casual users; total usage is still lower than on non-holidays.<br>
Weekday: Day 0 has the fewest users.<br>
Workingday: On working days the majority of users are registered. On non-working days it's 50/50.<br>
Weathersit: Most users ride in weather situation 1.

## Distribution of Temperature / Correlation with users

In [None]:
# Continuous weather columns examined in both figures of this cell.
lst_values = ['temp', 'atemp', 'hum', 'windspeed']

# Figure 1: kernel density estimate of each variable.
fig, axes = plt.subplots(nrows=1, ncols=len(lst_values), figsize=(20, 5))
for ax, col in zip(axes, lst_values):
    sns.kdeplot(data=df_hour[col], fill=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
plt.show()

# Figure 2: registered and casual users plotted against each variable.
fig, axes = plt.subplots(nrows=1, ncols=len(lst_values), figsize=(20, 5))
for ax, value in zip(axes, lst_values):
    for user_col, user_label in (('registered', 'Registered'), ('casual', 'Casual')):
        sns.lineplot(data=df_hour, x=value, y=user_col, ax=ax, label=user_label)
    ax.set_ylabel('Users')

The line plots show no significant correlation between these weather variables and the number of users.

## Correlation

In [None]:
# Pairwise correlations of the numeric columns, shown as a lower-triangle heatmap.
numeric_df = df_hour.select_dtypes(include=['float64', 'int64'])
corr = numeric_df.corr()

# Cells where the mask is True are hidden; k=1 selects everything strictly
# above the diagonal, so the diagonal itself stays visible.
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    corr,
    mask=mask,
    center=0,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    ax=ax,
)
fig.tight_layout()

## Clean Data

In [None]:
# Remove `instant`, `atemp` and `dteday` before modelling.
# errors='ignore' keeps this cell re-runnable: df_hour is overwritten here, so a
# second execution would otherwise raise KeyError on the already-dropped columns.
df_hour = df_hour.drop(columns=['instant', 'atemp', 'dteday'], errors='ignore')

# Replace the numeric weather codes with readable labels so the dummy columns
# created below get meaningful names. Re-running is harmless: once the values
# are strings none of the integer keys match.
df_hour['weathersit'] = df_hour['weathersit'].replace(
    {1: 'clear', 2: 'mist', 3: 'light snow', 4: 'heavy rain'}
)

# One-hot encode the categorical columns.
# TODO: decide whether `hr` should be one-hot encoded as well.
df_encoded = pd.get_dummies(
    df_hour,
    columns=['season', 'mnth', 'weekday', 'weathersit'],
    dtype=int,
)

# Correlation matrix of the encoded frame and its upper-triangle mask.
# The heatmap for it is deliberately not drawn here, so no figure is created
# either (an empty plt.figure() would only render a blank canvas).
numeric_df = df_encoded.select_dtypes(include=['float64', 'int64'])
corr = numeric_df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

# Show one random encoded row as a sanity check.
df_encoded.sample()

In [None]:
# Inspect df_hour after the column drops and the weathersit relabelling above.
df_hour

## Create features and target variable

In [None]:
# Train a regressor for `registered` and log it to the local MLflow server.
mlflow.set_tracking_uri('http://localhost:5000')

# `registered` is the target. `casual` (and `cnt`, when present) are sibling
# user counts for the same hour, so keeping them as features leaks the target.
# NOTE(review): presumably cnt = casual + registered - confirm against the dataset docs.
target_col = 'registered'
leakage_cols = [c for c in ('casual', 'cnt') if c in df_encoded.columns]
X = df_encoded.drop(columns=[target_col, *leakage_cols])
y = df_encoded[target_col]

# Two successive 80/20 splits: 64 % train, 16 % validation, 20 % test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse it for validation.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(X_train)
x_val_scaled = scaler.transform(X_val)

# The estimator must be an instance: calling fit() on the bare class raises TypeError.
# Swap in LinearRegression() here to log a baseline; all labels below follow the model.
model = RandomForestRegressor(random_state=42)
model_name = type(model).__name__

with mlflow.start_run(run_name=model_name):
    model.fit(x_train_scaled, y_train)

    y_pred = model.predict(x_val_scaled)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    # Derive the logged name from the estimator so it cannot drift from the code.
    mlflow.log_param('model_type', model_name)
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('r2', r2)

    # The signature describes what the logged model actually consumes:
    # the scaled feature matrix, not the raw validation frame.
    signature = mlflow.models.infer_signature(x_val_scaled, y_pred)
    mlflow.sklearn.log_model(
        model,
        model_name,
        signature=signature,
    )

    print(f"Logged to MLflow: MSE={mse:.2f}, R2={r2:.2f}")



In [None]:
# def tuning(model, param_grid, X, y):
