In [14]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [15]:
# Load the datasets using Pandas
weather_data = pd.read_csv("HourlyWeather.csv")
station_data = pd.read_csv("Availability.csv")

In [16]:
weather_data.dtypes

DateTime         object
ForecastDate     object
FeelsLike       float64
Humidity          int64
Pop             float64
Pressure          int64
Temperature     float64
UVI             float64
WeatherId         int64
WindSpeed       float64
WindGust        float64
Rain1h          float64
Snow1h          float64
dtype: object

In [17]:
station_data

Unnamed: 0,StationId,Status,MechanicalBikesAvailable,ElectricBikesAvailable,StandsAvailable,LastUpdated
0,1,OPEN,11,18,2,2024-03-05T14:01:57Z
1,1,OPEN,8,2,21,2024-03-06T08:49:11Z
2,1,OPEN,4,3,24,2024-03-06T15:54:38Z
3,1,OPEN,3,2,26,2024-03-06T16:07:46Z
4,1,OPEN,3,4,24,2024-03-06T16:35:06Z
...,...,...,...,...,...,...
995,12,OPEN,8,1,11,2024-03-06T19:53:14Z
996,12,OPEN,8,2,10,2024-03-06T20:03:22Z
997,12,OPEN,10,2,8,2024-03-06T20:15:30Z
998,12,OPEN,10,2,8,2024-03-06T20:33:39Z


In [18]:
station_data.dtypes

StationId                    int64
Status                      object
MechanicalBikesAvailable     int64
ElectricBikesAvailable       int64
StandsAvailable              int64
LastUpdated                 object
dtype: object

In [19]:
weather_data

Unnamed: 0,DateTime,ForecastDate,FeelsLike,Humidity,Pop,Pressure,Temperature,UVI,WeatherId,WindSpeed,WindGust,Rain1h,Snow1h
0,2024/3/7 7:41,2024/3/7 8:00,4.09,85,0.00,1019,7.12,,803,4.77,9.83,,
1,2024/3/7 7:41,2024/3/7 9:00,3.92,84,0.00,1019,7.19,,803,5.38,10.62,,
2,2024/3/7 7:41,2024/3/7 10:00,3.94,83,0.00,1019,7.43,,803,6.10,10.76,,
3,2024/3/7 7:41,2024/3/7 11:00,4.46,79,0.00,1020,7.89,,803,6.30,10.58,,
4,2024/3/7 7:41,2024/3/7 12:00,4.88,75,0.00,1019,8.27,,802,6.48,10.92,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2024/3/7 17:00,2024/3/9 5:00,0.70,89,0.00,999,5.73,,804,9.45,14.42,,
996,2024/3/7 17:00,2024/3/9 6:00,0.74,88,0.00,998,5.81,,804,9.70,14.93,,
997,2024/3/7 17:00,2024/3/9 7:00,0.76,88,0.12,998,5.89,,804,10.05,15.39,,
998,2024/3/7 17:00,2024/3/9 8:00,0.91,88,0.21,998,6.02,,804,10.12,14.96,,


In [20]:
# Preprocess the data
weather_data['time'] = pd.to_datetime(weather_data['ForecastDate'])
station_data['last_update'] = pd.to_datetime(station_data['LastUpdated'])

In [21]:
# Extract day and hour
weather_data['year'] = weather_data['time'].dt.year
weather_data['month'] = weather_data['time'].dt.month
weather_data['day'] = weather_data['time'].dt.day
weather_data['hour'] = weather_data['time'].dt.hour
weather_data['minute'] = weather_data['time'].dt.minute
weather_data['is_weekday'] = ((weather_data['time'].dt.weekday >= 0) & (weather_data['time'].dt.weekday <= 4)).astype(int)


station_data['year'] = station_data['last_update'].dt.year
station_data['month'] = station_data['last_update'].dt.month
station_data['day'] = station_data['last_update'].dt.day
station_data['hour'] = station_data['last_update'].dt.hour
station_data['minute'] = station_data['last_update'].dt.minute

In [8]:
weather_data

Unnamed: 0,DateTime,ForecastDate,FeelsLike,Humidity,Pop,Pressure,Temperature,UVI,WeatherId,WindSpeed,WindGust,Rain1h,Snow1h,time,year,month,day,hour,minute,is_weekday
0,2024/3/7 7:41,2024/3/7 8:00,4.09,85,0.00,1019,7.12,,803,4.77,9.83,,,2024-03-07 08:00:00,2024,3,7,8,0,1
1,2024/3/7 7:41,2024/3/7 9:00,3.92,84,0.00,1019,7.19,,803,5.38,10.62,,,2024-03-07 09:00:00,2024,3,7,9,0,1
2,2024/3/7 7:41,2024/3/7 10:00,3.94,83,0.00,1019,7.43,,803,6.10,10.76,,,2024-03-07 10:00:00,2024,3,7,10,0,1
3,2024/3/7 7:41,2024/3/7 11:00,4.46,79,0.00,1020,7.89,,803,6.30,10.58,,,2024-03-07 11:00:00,2024,3,7,11,0,1
4,2024/3/7 7:41,2024/3/7 12:00,4.88,75,0.00,1019,8.27,,802,6.48,10.92,,,2024-03-07 12:00:00,2024,3,7,12,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2024/3/7 17:00,2024/3/9 5:00,0.70,89,0.00,999,5.73,,804,9.45,14.42,,,2024-03-09 05:00:00,2024,3,9,5,0,0
996,2024/3/7 17:00,2024/3/9 6:00,0.74,88,0.00,998,5.81,,804,9.70,14.93,,,2024-03-09 06:00:00,2024,3,9,6,0,0
997,2024/3/7 17:00,2024/3/9 7:00,0.76,88,0.12,998,5.89,,804,10.05,15.39,,,2024-03-09 07:00:00,2024,3,9,7,0,0
998,2024/3/7 17:00,2024/3/9 8:00,0.91,88,0.21,998,6.02,,804,10.12,14.96,,,2024-03-09 08:00:00,2024,3,9,8,0,0


In [60]:
weather_data.columns

Index(['DateTime', 'ForecastDate', 'FeelsLike', 'Humidity', 'Pop', 'Pressure',
       'Temperature', 'UVI', 'WeatherId', 'WindSpeed', 'WindGust', 'Rain1h',
       'Snow1h', 'time', 'year', 'month', 'day', 'hour', 'minute',
       'is_weekday'],
      dtype='object')

In [22]:
# Merge the datasets
merged_data = pd.merge(station_data, weather_data, on = ['year','month','day','hour'])

In [23]:
# Drop every redundent columns
merged_data = merged_data.drop(columns=['Status','LastUpdated', 'DateTime','FeelsLike','Pop','UVI','WindGust','Rain1h','Snow1h',])
merged_data = merged_data.drop(columns=['last_update'])
merged_data = merged_data.drop(columns=['ForecastDate','time'])

In [25]:
merged_data

Unnamed: 0,StationId,MechanicalBikesAvailable,ElectricBikesAvailable,StandsAvailable,year,month,day,hour,minute_x,Humidity,Pressure,Temperature,WeatherId,WindSpeed,minute_y,is_weekday
0,1,4,1,26,2024,3,7,14,43,72,1019,8.89,800,6.32,0,1
1,1,4,1,26,2024,3,7,14,43,72,1019,8.89,800,6.32,0,1
2,1,4,1,26,2024,3,7,14,43,72,1019,8.89,800,6.32,0,1
3,1,4,1,26,2024,3,7,14,43,74,1019,8.70,801,6.32,0,1
4,1,4,1,26,2024,3,7,14,43,75,1019,8.68,802,6.32,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,11,15,9,6,2024,3,7,14,42,72,1019,8.89,800,6.32,0,1
73,11,15,9,6,2024,3,7,14,42,74,1019,8.70,801,6.32,0,1
74,11,15,9,6,2024,3,7,14,42,75,1019,8.68,802,6.32,0,1
75,11,15,9,6,2024,3,7,14,42,75,1019,8.67,802,6.32,0,1


In [24]:
merged_data.dtypes

StationId                     int64
MechanicalBikesAvailable      int64
ElectricBikesAvailable        int64
StandsAvailable               int64
year                          int64
month                         int64
day                           int64
hour                          int64
minute_x                      int64
Humidity                      int64
Pressure                      int64
Temperature                 float64
WeatherId                     int64
WindSpeed                   float64
minute_y                      int64
is_weekday                    int32
dtype: object

In [27]:
# Engineer is_busy_hours feature
merged_data['is_busy_hours'] = ((merged_data['hour'] >= 7) & (merged_data['hour'] <= 10)) | ((merged_data['hour'] >= 16) & (merged_data['hour'] <= 19)).astype(int)

In [29]:
merged_data['cold_weather'] = (merged_data['Temperature'] < 5).astype(float)
merged_data['windy_weather'] = (merged_data['WindSpeed'] > 8).astype(float)

In [30]:
merged_data

Unnamed: 0,StationId,MechanicalBikesAvailable,ElectricBikesAvailable,StandsAvailable,year,month,day,hour,minute_x,Humidity,Pressure,Temperature,WeatherId,WindSpeed,minute_y,is_weekday,is_busy_hours,cold_weather,windy_weather
0,1,4,1,26,2024,3,7,14,43,72,1019,8.89,800,6.32,0,1,False,0.0,0.0
1,1,4,1,26,2024,3,7,14,43,72,1019,8.89,800,6.32,0,1,False,0.0,0.0
2,1,4,1,26,2024,3,7,14,43,72,1019,8.89,800,6.32,0,1,False,0.0,0.0
3,1,4,1,26,2024,3,7,14,43,74,1019,8.70,801,6.32,0,1,False,0.0,0.0
4,1,4,1,26,2024,3,7,14,43,75,1019,8.68,802,6.32,0,1,False,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,11,15,9,6,2024,3,7,14,42,72,1019,8.89,800,6.32,0,1,False,0.0,0.0
73,11,15,9,6,2024,3,7,14,42,74,1019,8.70,801,6.32,0,1,False,0.0,0.0
74,11,15,9,6,2024,3,7,14,42,75,1019,8.68,802,6.32,0,1,False,0.0,0.0
75,11,15,9,6,2024,3,7,14,42,75,1019,8.67,802,6.32,0,1,False,0.0,0.0


In [31]:
merged_data.dtypes

StationId                     int64
MechanicalBikesAvailable      int64
ElectricBikesAvailable        int64
StandsAvailable               int64
year                          int64
month                         int64
day                           int64
hour                          int64
minute_x                      int64
Humidity                      int64
Pressure                      int64
Temperature                 float64
WeatherId                     int64
WindSpeed                   float64
minute_y                      int64
is_weekday                    int32
is_busy_hours                  bool
cold_weather                float64
windy_weather               float64
dtype: object

In [15]:
unique_stations = list(merged_data['StationId'].unique())
unique_stations.sort()

In [16]:
for station_id in unique_stations:
    # station_id=int(station_id)
    station_data = merged_data[merged_data['StationId'] == station_id]

    # Split the data into training and testing sets
    X = station_data.drop(columns=['MechanicalBikesAvailable', 'ElectricBikesAvailable', 'StandsAvailable'])
    y = station_data[['MechanicalBikesAvailable', 'ElectricBikesAvailable', 'StandsAvailable']]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a machine learning model
    model = LinearRegression()
    # model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    # Serialize the trained model into a file called model.pkl
    with open(f'model_{station_id}.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)

    # Deserialize the model.pkl file into an object called model
    with open(f'model_{station_id}.pkl', 'rb') as handle:
        model = pickle.load(handle)

    # Evaluate the model
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error for station {station_id}:", mse)

Mean Squared Error for station 1: 0.0
Mean Squared Error for station 2: 0.0
Mean Squared Error for station 3: 0.0
Mean Squared Error for station 4: 0.0
Mean Squared Error for station 5: 0.0
Mean Squared Error for station 6: 0.0
Mean Squared Error for station 7: 0.0
Mean Squared Error for station 8: 0.0
Mean Squared Error for station 9: 0.0
Mean Squared Error for station 10: 0.0
Mean Squared Error for station 11: 0.0
