# Model Training

## Imports

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
import numpy as np


## Preprocessing

In [9]:
# Load the data
yulu_data_set = pd.read_csv('yulu_bike_sharing_dataset.csv')

# Split the 'datetime' column into 'date' and 'time' columns
yulu_data_set[['date', 'time']] = yulu_data_set['datetime'].str.split(' ', expand=True)
yulu_data_set = yulu_data_set.drop(columns=['datetime'])

# Specify the names of categorical and numerical columns
categorical_columns = ['season', 'holiday', 'workingday', 'weather']  # Update with your actual categorical columns
numerical_columns = ['temp', 'atemp', 'humidity', 'windspeed']  # Update with your actual numerical columns
target_column = 'count'  # Update if your target variable is named differently

# Separating the target variable
X = yulu_data_set.drop(columns=[target_column])
y = yulu_data_set[target_column]

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a column transformer for handling both numerical and categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])


## Linear Regression

In [10]:
# Creating a pipeline with preprocessing and a simple linear regression model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Define scoring metrics for cross-validation
scoring_metrics = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'R2': 'r2'
}

# Apply cross-validation
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring_metrics, return_train_score=True)

print("Cross-validated scores:")
for key, scores in cv_results.items():
    if "time" not in key:
        print(f"{key}: {np.mean(scores)}")

# Fit the model on training data
model.fit(X_train, y_train)

# Predictions and model evaluation on the test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Test MSE:", mse)
print("Test MAE:", mae)
print("Test R-squared:", r2)

Cross-validated scores:
test_MSE: -27796.90202885234
train_MSE: -23356.980583099423
test_MAE: -124.64762225454452
train_MAE: -114.3415240643287
test_R2: -0.05086788194556213
train_R2: 0.2823972654796753
Test MSE: 23517.898148129236
Test MAE: 114.0271227430039
Test R-squared: 0.28748617429098655


## Random Forest Regressor

In [11]:
# Creating a pipeline with preprocessing and a RandomForest regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Cross-validation to evaluate model
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print("Cross-validated MSE:", -np.mean(scores))

# Fit the model on training data
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared:", r2)

Cross-validated MSE: 32075.29105559666
Mean Squared Error (MSE): 22898.59570693373
Mean Absolute Error (MAE): 107.4550599161832
R-squared: 0.3062489714111978
