In [2]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
!unzip bike+sharing+dataset.zip
!pip install category_encoders

--2024-08-27 11:00:57--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip’

bike+sharing+datase     [<=>                 ]       0  --.-KB/s               bike+sharing+datase     [ <=>                ] 273.43K  --.-KB/s    in 0.05s   

2024-08-27 11:00:57 (5.71 MB/s) - ‘bike+sharing+dataset.zip’ saved [279992]

Archive:  bike+sharing+dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                
Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m 

In [130]:
import pandas as pd
df = pd.read_csv('hour.csv')
df['day_night'] = df['hr'].apply(lambda x: 'day' if 6 <= x <= 18 else 'night')
df.drop(['instant', 'casual', 'registered'], axis=1, inplace=True)
df['dteday'] = pd.to_datetime(df.dteday)
df['season'] = df.season.astype('category')
df['holiday'] = df.holiday.astype('category')
df['weekday'] = df.weekday.astype('category')
df['weathersit'] = df.weathersit.astype('category')
df['workingday'] = df.workingday.astype('category')
df['mnth'] = df.mnth.astype('category')
df['yr'] = df.yr.astype('category')
df['hr'] = df.hr.astype('category')
df.drop(columns=['dteday'], inplace=True)


**Q1 : Create at least two new interaction features between numerical variables (e.g.,
temp * hum). Justify your choice of features and explain how they might
improve the model's predictive performance.**

In [131]:
df['temp_hum'] = df['temp'] * df['hum']
df['temp_windspeed'] = df['atemp'] * df['windspeed']

High temprature and high humidity may decrease the bike rentals on that day and
moderate temprature and moderate humidity may increase the amount of bike rentals per day as the weather is more pleasant, more people might come out from homes, thus the feature temp_hum can affect the model performance.

A mild day might feel much colder with strong winds, potentially decreasing bike rentals. On the other hand, moderate wind could make hot days feel cooler and more pleasant for biking. This interaction can capture the compounded effect of wind and temperature on the decision to rent bikes.

These interaction terms allow the model to account for non-linear relationships between the predictors and the target variable. Instead of assuming a linear effect of temperature or windspeed independently, the model can now learn how these variables combine to influence bike rentals, thus improving the genralized performance of the model.

In [132]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


def run_pipeline(dataframe, numerical_features, categorical_features,encoding, model, custom_model = ''):
  # Separating features and target variable
  X = dataframe.drop(columns=['cnt']) # Features
  Y = dataframe['cnt'] # Target
  encoderObject = OneHotEncoder(sparse_output=False, drop='first') if encoding == 'one-hot' else TargetEncoder(return_df = True)
  modelObject = RandomForestRegressor(n_estimators=100, random_state=42) if model == 'random-forest' else LinearRegression()
  if model == 'linear_scratch':
    modelObject = custom_model()

  numerical_pipeline = Pipeline([
  ('imputer', SimpleImputer(strategy='mean')),
  ('scaler', MinMaxScaler())
  ])
  X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

  categorical_pipeline = Pipeline([
  ('imputer', SimpleImputer(strategy='most_frequent')),
  ( encoding, encoderObject)
  ])

  X_encoded = categorical_pipeline.fit_transform(X[categorical_features], Y)
  X_encoded = pd.DataFrame(X_encoded,
  columns=categorical_pipeline.named_steps[encoding].get_feature_names_out(categorical_features))
  X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

  X.columns = X.columns.astype(str)  # Convert all column names to strings

  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
  random_state=42)
  modelObject.fit(X_train, y_train)
  y_pred = modelObject.predict(X_test)

  full_pipeline = Pipeline([
      ('num_preprocess', numerical_pipeline),
      ('cat_preprocess', categorical_pipeline),
      ('model', modelObject)
  ])

  return full_pipeline, modelObject, y_test, y_pred


**Replace the OneHotEncoder with TargetEncoder for categorical variables.
Evaluate how this change impacts the model's performance compared to one-hot
encoding.**

With One Hot Encoding

In [145]:
pipeline, model, y_test, y_pred = run_pipeline(df, ['temp', 'hum', 'windspeed', 'temp_windspeed', 'temp_hum'], ['season', 'weathersit', 'day_night'], 'one-hot', 'random-forest');

# Performance

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 1859.7503947386544
R-squared: 0.9412687119400371


In [146]:
pipeline

With Target Encoding

In [147]:
pipeline, model, y_test, y_pred = run_pipeline(df, ['temp', 'hum', 'windspeed', 'temp_windspeed', 'temp_hum'], ['season', 'weathersit', 'day_night'], 'target', 'random-forest');

# Performance

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1793.641308661706
R-squared: 0.9433564500519636


In [148]:
pipeline

We can see with Targe Encoding, we are getting a less Mean Squared Error compared to One Hot Encoding, but R-sqaure in Target Encoding is a little bit higher than One Hot Encoding, overall Target Encoding seems to perform better here.

**Train LinearRegressor:
a. Using the package,
b. Write/Train it by scratch following the steps of a linear regressor.
Compare their performance using metrics like Mean Squared Error (MSE)
and R-squared.**

Training Linear Regression with package

In [137]:
pipeline, model, y_test, y_pred = run_pipeline(df, ['temp', 'hum', 'windspeed', 'temp_windspeed', 'temp_hum'], ['season', 'weathersit', 'day_night'], 'target', 'linear-regression');

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Sklearn Linear Regression - Mean Squared Error: {mse}')
print(f'Sklearn Linear Regression - R-squared: {r2}')



Sklearn Linear Regression - Mean Squared Error: 14973.691511022034
Sklearn Linear Regression - R-squared: 0.5271278382610913


In [138]:
pipeline

Training Linear Regression from Scratch

In [139]:
class LinearRegressionFromScratch:
  def __init__(self):
    self.w = None
  def fit(self, X, y):
    A = X.T @ X
    b = X.T @ y
    self.w = np.linalg.inv(A) @ b

  def predict(self, X):
    return X @ self.w




In [140]:
import numpy as np
pipeline, model, y_test, y_pred = run_pipeline(df, ['temp', 'hum', 'windspeed', 'temp_windspeed', 'temp_hum'], ['season', 'weathersit', 'day_night'], 'target', 'linear_scratch', LinearRegressionFromScratch)

# Performance

mse_test = mean_squared_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f'Scratch Linear Regression - Test Mean Squared Error: {mse_test}')
print(f'Scratch Linear Regression - Test R-squared: {r2_test}')


Scratch Linear Regression - Test Mean Squared Error: 15467.611031944612
Scratch Linear Regression - Test R-squared: 0.5115297613665739


In [141]:
pipeline