In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from pandas.tseries.holiday import USFederalHolidayCalendar

# Load and preprocess data
bicycle_weather_df = pd.read_csv('BicycleWeather.csv')
fremont_bridge_df = pd.read_csv('FremontBridge.csv')

# Convert date columns to datetime format
bicycle_weather_df['DATE'] = pd.to_datetime(bicycle_weather_df['DATE'].astype(str), format='%Y%m%d')
fremont_bridge_df['Date'] = pd.to_datetime(fremont_bridge_df['Date'], format='%m/%d/%Y %I:%M:%S %p')

# Group by date and sum only the 'Fremont Bridge East Sidewalk' and 'Fremont Bridge West Sidewalk' columns
daily_counts = fremont_bridge_df.groupby(fremont_bridge_df['Date'].dt.date)[['Fremont Bridge East Sidewalk', 'Fremont Bridge West Sidewalk']].sum()

# Calculate 'holiday'
cal = USFederalHolidayCalendar()
holidays = cal.holidays('2012', '2016')
daily_counts['holiday'] = daily_counts.index.isin(holidays)
daily_counts['holiday'] = daily_counts['holiday'].astype(int)

# Calculate 'daylight_hrs'
def hours_of_daylight(date, axis=23.44, latitude=47.61):
    days = (pd.Timestamp(date) - pd.Timestamp('2000-12-21')).days
    m = (1. - np.tan(np.radians(latitude))
         * np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25)))
    return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180.

daily_counts['daylight_hrs'] = list(map(hours_of_daylight, daily_counts.index))

# Reset index to convert the Date index to a column
daily_counts.reset_index(inplace=True)

# Rename the new column to 'Date'
daily_counts.rename(columns={'index': 'Date'}, inplace=True)

# Convert the 'Date' column to datetime format
daily_counts['Date'] = pd.to_datetime(daily_counts['Date'])

# Merge with weather data
final_df = pd.merge(daily_counts, bicycle_weather_df, left_on='Date', right_on='DATE', how='inner')

# Create feature matrix X and target vector y
X = final_df[['holiday', 'daylight_hrs', 'PRCP', 'TMAX', 'TMIN']]  # Add other relevant features
y = final_df['Fremont Bridge East Sidewalk'] + final_df['Fremont Bridge West Sidewalk']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
linear_model = LinearRegression()
lasso_model = Lasso(random_state=42)
ridge_model = Ridge(random_state=42)

# Parameter grid
param_dist = {'alpha': np.logspace(-4, 4, 50)}

# RandomizedSearchCV
random_search_lasso = RandomizedSearchCV(lasso_model, param_distributions=param_dist, n_iter=10, cv=10, random_state=42)
random_search_ridge = RandomizedSearchCV(ridge_model, param_distributions=param_dist, n_iter=10, cv=10, random_state=42)

# 10-fold CV for Linear Regression
linear_scores = cross_val_score(linear_model, X_train, y_train, cv=10)

# Fit RandomizedSearchCV and get best scores
random_search_lasso.fit(X_train, y_train)
lasso_best_score = random_search_lasso.best_score_
random_search_ridge.fit(X_train, y_train)
ridge_best_score = random_search_ridge.best_score_

lasso_alpha = random_search_lasso.best_params_
ridge_alpha = random_search_ridge.best_params_


# Print 10-fold CV scores and best alpha values for all three models
print(f'Linear Regression 10-fold CV scores: {linear_scores}\n')
print(f'Lasso 10-fold CV scores: {random_search_lasso.cv_results_["mean_test_score"]}\n')
print(f'Ridge 10-fold CV scores: {random_search_ridge.cv_results_["mean_test_score"]}\n')

# Print best 10-fold CV scores and best alpha values for all three models
print(f'Linear Regression best 10-fold CV score: {np.max(linear_scores)}')
print(f'Lasso best 10-fold CV score: {lasso_best_score} with best alpha: {lasso_alpha}')
print(f'Ridge best 10-fold CV score: {ridge_best_score} with best alpha: {ridge_alpha}')


Linear Regression 10-fold CV scores: [0.51549417 0.59194859 0.52769114 0.42321086 0.52335078 0.57886809
 0.53996163 0.40905541 0.43333419 0.53046319]

Lasso 10-fold CV scores: [0.50733788 0.49425792 0.50736728 0.48536485 0.50733815 0.47612494
 0.50734724 0.50734446 0.50736099 0.50733854]

Ridge 10-fold CV scores: [0.50733783 0.50746848 0.50735272 0.50270053 0.50733792 0.49628243
 0.5073412  0.50734014 0.50736838 0.50733805]

Linear Regression best 10-fold CV score: 0.5919485899728966
Lasso best 10-fold CV score: 0.5073672812218897 with best alpha: {'alpha': 7.9060432109076855}
Ridge best 10-fold CV score: 0.5074684833885427 with best alpha: {'alpha': 232.99518105153672}
