In [1]:
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import dbManager  # Assuming dbManager is properly set up for using SQLAlchemy

Failed to establish connection: (mysql.connector.errors.ProgrammingError) 1045 (28000): Access denied for user 'admin'@'137.43.122.134' (using password: YES)
(Background on this error at: https://sqlalche.me/e/20/f405)


In [2]:
load_dotenv('../../db.env')

DB_PASSWORD = os.getenv("DB_PASSWORD")
URI = 'dublinbikes.clw8uqmac8qf.eu-west-1.rds.amazonaws.com'
PORT = 3306
USER = 'admin'
DB = 'dbikes'

# Connect to the db
connection_string = f"mysql+mysqlconnector://{USER}:{DB_PASSWORD}@{URI}:{PORT}/{DB}"
engine = create_engine(connection_string) #, echo=True

# Testing
try:
    connection = engine.connect()
    print("Connection established successfully.")

except Exception as e:
    print("Failed to establish connection:", e)

Connection established successfully.


In [3]:
import pandas as pd

def get_station_data(number):
    query = f"SELECT * FROM availability WHERE number = {number}"
    # Execute the query and fetch results directly into a DataFrame
    df = pd.read_sql(query, engine)
    return df

In [4]:
def get_weather_data():
    query = "SELECT * FROM currentweather"
    df = pd.read_sql(query, engine)
    return df

In [5]:
def create_weather_df():
    weather_df = get_weather_data()
    weather_df['timestamp'] = pd.to_datetime(weather_df['timestamp'], unit='s')
    # Create a merge key that includes up to the minute
    weather_df['merge_key'] = weather_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M')
    return weather_df

def availability_df(station_number):
    availability_df = get_station_data(station_number)
    availability_df['timestamp'] = pd.to_datetime(availability_df['timestamp'], unit='s')
    # Create a merge key that includes up to the minute
    availability_df['merge_key'] = availability_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M')
    return availability_df


def merge_dfs(weather_df, station_number):
    availability = availability_df(station_number)
    # Merge on the new merge_key
    merged_df = pd.merge(weather_df, availability, on='merge_key', how='inner')
    # Optionally, convert merge_key back to datetime for further time-based analysis
    merged_df['timestamp'] = pd.to_datetime(merged_df['merge_key'])
    return merged_df

In [6]:
def count_stations():
    query = "SELECT COUNT(DISTINCT number) as Total FROM availability"
    df = pd.read_sql(query, engine)
    return int(df['Total'][0])

In [7]:
def minimised_df(station_number,target,mode=0,dummies=False):
    """Creates a dataframe with only the essential features for predictions"""
    minimised_df = merge_dfs(create_weather_df(), station_number)

    #Create new features
    minimised_df['day_of_week'] = minimised_df['timestamp'].dt.dayofweek+1
    minimised_df['hour'] = minimised_df['timestamp'].dt.hour
    minimised_df['minute'] = minimised_df['timestamp'].dt.minute

    #Drop features redundant for predictions
    unnecessary_features = ['id','timestamp','timestamp_y','lastUpdate','electricalRemovableBatteryBikes','electricalInternalBatteryBikes',
                              'electricalBikes','mechanicalBikes','status','timestamp_x','merge_key','number','availability_id','description']

    #If predicting no. of bikes, we don't need to know no. of stands and vice versa
    if target == 'bikes':
        if mode==1:
            minimised_df.loc[list(minimised_df[minimised_df['bikes']>0].index),'bikes'] = 1
        unnecessary_features.append('stands')
    else:
        if mode==1:
            minimised_df.loc[list(minimised_df[minimised_df['stands']>0].index),'stands'] = 1
        unnecessary_features.append('bikes')

    #If categorical features necessary
    if dummies==True:
        dummie_vals = pd.get_dummies(minimised_df['description'])
        minimised_df = pd.concat([minimised_df, dummie_vals], axis=1)
        minimised_df = minimised_df.replace({True: 1, False: 0})
        
    minimised_df = minimised_df.drop(labels=unnecessary_features, axis=1)

    #Change to correct datatypes
    categorical_columns = ['day_of_week','hour','minute']

    if mode==1:
        categorical_columns.append(target)

    if dummies==True:
        categorical_columns.append(dummie_vals.columns)
    
    for column in categorical_columns:
        minimised_df[column] = minimised_df[column].astype('category') 

    #Try drop duplicates
    if minimised_df.duplicated().sum() > 0:
        minimised_df = minimised_df.drop(minimised_df[minimised_df.duplicated()].index)

    #Return the finalised dataset
    return minimised_df

In [24]:
df_1 = minimised_df(1,'bikes',1)
df_0 = minimised_df(1,'bikes')
df_dummies = minimised_df(1,'bikes',0,True)

In [25]:
df_1

Unnamed: 0,temperature,wind_speed,rainfall,bikes,day_of_week,hour,minute
0,8.81,3.09,0.0,1,3,17,25
1,10.78,3.09,0.0,1,3,17,30
2,10.68,3.09,0.0,1,3,17,35
3,11.32,3.09,0.0,1,3,17,40
4,11.32,3.09,0.0,1,3,17,45
...,...,...,...,...,...,...,...
11729,11.93,2.06,0.0,1,3,15,25
11730,11.60,2.06,0.0,1,3,15,30
11731,12.21,2.06,0.0,1,3,15,35
11732,12.17,2.06,0.0,1,3,15,40


In [26]:
df_0

Unnamed: 0,temperature,wind_speed,rainfall,bikes,day_of_week,hour,minute
0,8.81,3.09,0.0,19,3,17,25
1,10.78,3.09,0.0,18,3,17,30
2,10.68,3.09,0.0,20,3,17,35
3,11.32,3.09,0.0,20,3,17,40
4,11.32,3.09,0.0,20,3,17,45
...,...,...,...,...,...,...,...
11729,11.93,2.06,0.0,1,3,15,25
11730,11.60,2.06,0.0,3,3,15,30
11731,12.21,2.06,0.0,3,3,15,35
11732,12.17,2.06,0.0,2,3,15,40


In [27]:
df_dummies

Unnamed: 0,temperature,wind_speed,rainfall,bikes,day_of_week,hour,minute,broken clouds,clear sky,few clouds,fog,haze,heavy intensity rain,light rain,mist,moderate rain,overcast clouds,scattered clouds,thunderstorm with light rain
0,8.81,3.09,0.0,19,3,17,25,0,1,0,0,0,0,0,0,0,0,0,0
1,10.78,3.09,0.0,18,3,17,30,0,1,0,0,0,0,0,0,0,0,0,0
2,10.68,3.09,0.0,20,3,17,35,0,1,0,0,0,0,0,0,0,0,0,0
3,11.32,3.09,0.0,20,3,17,40,0,1,0,0,0,0,0,0,0,0,0,0
4,11.32,3.09,0.0,20,3,17,45,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11729,11.93,2.06,0.0,1,3,15,25,0,1,0,0,0,0,0,0,0,0,0,0
11730,11.60,2.06,0.0,3,3,15,30,0,1,0,0,0,0,0,0,0,0,0,0
11731,12.21,2.06,0.0,3,3,15,35,0,1,0,0,0,0,0,0,0,0,0,0
11732,12.17,2.06,0.0,2,3,15,40,0,1,0,0,0,0,0,0,0,0,0,0


#### IMPORTANT
- IMPLEMENT SOME KIND OF ERROR HANDLING FOR PREDICTIONS BETWEEN HOUR = 0 AND 5
- IS THE MODEL BETTER WHEN WE INCLUDE 'description'

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, ElasticNet, LassoCV, ElasticNetCV, Ridge
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [9]:
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [184]:
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# Support Vector Machines
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

In [221]:
def basic_classification_model_test(station_number,target,dummies=False):
    df = minimised_df(station_number,target,1,dummies)

    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    accuracy, precision, recall = {}, {}, {}

    for key in models.keys():
        
        # Fit the classifier
        models[key].fit(X_train, y_train)
        
        # Make predictions
        predictions = models[key].predict(X_test)
        
        # Calculate metrics
        accuracy[key] = metrics.accuracy_score(predictions, y_test)
        precision[key] = metrics.precision_score(predictions, y_test)
        recall[key] = metrics.recall_score(predictions, y_test)

    df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall'])
    df_model['Accuracy'] = accuracy.values()
    df_model['Precision'] = precision.values()
    df_model['Recall'] = recall.values()
    
    return df_model

In [207]:
basic_classification_model_test(1,'bikes')



Unnamed: 0,Accuracy,Precision,Recall
Logistic Regression,0.926124,0.999039,0.926884
Support Vector Machines,0.926569,1.0,0.926536
Decision Trees,0.970628,0.980778,0.987421
Random Forest,0.979528,0.992792,0.985217
Naive Bayes,0.910547,0.98174,0.926111
K-Nearest Neighbor,0.923008,0.984623,0.935616


In [222]:
basic_classification_model_test(1,'bikes',True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


AttributeError: 'Flags' object has no attribute 'c_contiguous'

In [263]:
def basic_classification_model(station_number,target):
    df = minimised_df(station_number,target,1)

    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    #scaler = MinMaxScaler()
    #X_train_normalized = scaler.fit_transform(X_train)
    #X_test_normalized = scaler.transform(X_test)

    accuracy, precision, recall = {}, {}, {}
        
    # Fit the classifier

    model = RandomForestClassifier()
    
    model.fit(X_train, y_train)
        
    # Make predictions
    predictions = model.predict(X_test)

    actual_vs_predicted = pd.concat([y_test, pd.DataFrame(predictions, columns=['Predicted'],index=y_test.index)], axis=1)

    printMetrics(y_test, predictions)
    
    return actual_vs_predicted

In [264]:
basic_classification_model(1,'bikes')


MAE:  0.020462633451957295
RMSE:  0.14304766146972586
R2:  0.688671718585824


Unnamed: 0,bikes,Predicted
599,1,1
223,1,1
6649,1,1
4124,1,1
8362,1,1
...,...,...
4145,1,1
4701,1,1
5086,0,0
170,1,1


In [227]:
models = {}

from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, ElasticNet, LassoCV, ElasticNetCV, Ridge
models['Linear Regression'] = LinearRegression()
models['Ridge CV'] = RidgeCV()
models['Lasso'] = Lasso()
models['Elastic Net'] = ElasticNet()
models['Lasso CV'] = LassoCV()
models['Elastic Net CV'] = ElasticNetCV()
models['Ridge'] = Ridge()

In [228]:
def basic_prediction_model_test(station_number,target,dummies=False):
    df = minimised_df(station_number,target,0,dummies)

    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    scaler = MinMaxScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)

    mae, rmse, r2 = {}, {}, {}

    for key in models.keys():
        
        # Fit the classifier
        models[key].fit(X_train_normalized, y_train)
        
        # Make predictions
        predictions = models[key].predict(X_test_normalized)
        
        # Calculate metrics
        mae[key] = metrics.mean_absolute_error(predictions, y_test)
        rmse[key] = metrics.mean_squared_error(predictions, y_test)**0.5
        r2[key] = metrics.r2_score(predictions, y_test)


    df_model = pd.DataFrame(index=models.keys(), columns=['MAE', 'RMSE', 'R2'])
    df_model['MAE'] = mae.values()
    df_model['RMSE'] = rmse.values()
    df_model['R2'] = r2.values()
    
    return df_model

In [229]:
basic_prediction_model_test(1,'bikes')

Unnamed: 0,MAE,RMSE,R2
Linear Regression,8.066629,9.236401,-11.92897
Ridge CV,8.067073,9.236456,-12.01414
Lasso,8.337409,9.562094,-2.8976480000000004e+31
Elastic Net,8.337409,9.562094,-2.8976480000000004e+31
Lasso CV,8.066787,9.236412,-11.97038
Elastic Net CV,8.068324,9.236898,-12.25788
Ridge,8.067073,9.236456,-12.01414


In [230]:
basic_prediction_model_test(1,'bikes',True)

Unnamed: 0,MAE,RMSE,R2
Linear Regression,8.00408,9.202537,-10.08723
Ridge CV,8.00462,9.202801,-10.1155
Lasso,8.337409,9.562094,-2.8976480000000004e+31
Elastic Net,8.337409,9.562094,-2.8976480000000004e+31
Lasso CV,8.006114,9.203124,-10.20264
Elastic Net CV,8.016763,9.209254,-10.76018
Ridge,8.008853,9.204837,-10.31482


In [254]:
def basic_prediction_model(station_number,target,dummies=False):
    df = minimised_df(station_number,target,0,dummies)

    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    scaler = MinMaxScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)
        
    # Fit the classifier

    model = RidgeCV()
    
    model.fit(X_train_normalized, y_train)
        
    # Make predictions
    predictions = model.predict(X_test_normalized)

    actual_vs_predicted = pd.concat([y_test, pd.DataFrame([round(x) for x in predictions], columns=['Predicted'],index=y_test.index)], axis=1)

    printMetrics(y_test, predictions)
    
    return actual_vs_predicted

In [255]:
basic_prediction_model(1,'bikes')


MAE:  7.944397068270482
RMSE:  9.101673223819871
R2:  0.08533419188745972


Unnamed: 0,bikes,Predicted
674,14,18
7150,18,15
625,14,16
4139,26,13
10292,3,12
...,...,...
5967,27,18
6457,18,13
5113,0,8
161,28,16


In [256]:
basic_prediction_model(1,'bikes',True)


MAE:  7.861135149852997
RMSE:  9.039114101953633
R2:  0.09786464403887551


Unnamed: 0,bikes,Predicted
674,14,17
7150,18,14
625,14,16
4139,26,13
10292,3,14
...,...,...
5967,27,17
6457,18,14
5113,0,7
161,28,15


In [10]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [11]:
adv_models = {}

adv_models['HistGradientBoostingRegressor'] = HistGradientBoostingRegressor()
adv_models['GradientBoostingRegressor'] = GradientBoostingRegressor()
adv_models['RandomForestRegressor'] = RandomForestRegressor()
adv_models['KNeighborsRegressor'] = KNeighborsRegressor()
adv_models['StackingRegressor'] = StackingRegressor(estimators = [('ridge', RidgeCV()),('lasso', LassoCV(random_state=42)),
                                                                  ('knr', KNeighborsRegressor(n_neighbors=20, metric='euclidean'))])
adv_models['VotingRegressor'] = VotingRegressor(estimators=[('gb', adv_models['HistGradientBoostingRegressor']),
                                                            ('rf', adv_models['GradientBoostingRegressor']),
                                                            ('lr', adv_models['RandomForestRegressor'])])

In [28]:
def adv_prediction_model_test(station_number,target,dummies=False):
    df = minimised_df(station_number,target,0,dummies)

    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    scaler = MinMaxScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)

    mae, rmse, r2 = {}, {}, {}

    for key in adv_models.keys():
        
        # Fit the classifier
        adv_models[key].fit(X_train_normalized, y_train)
        
        # Make predictions
        predictions = adv_models[key].predict(X_test_normalized)
        
        # Calculate metrics
        mae[key] = metrics.mean_absolute_error(predictions, y_test)
        rmse[key] = metrics.mean_squared_error(predictions, y_test)**0.5
        r2[key] = metrics.r2_score(predictions, y_test)


    df_model = pd.DataFrame(index=adv_models.keys(), columns=['MAE', 'RMSE', 'R2'])
    df_model['MAE'] = mae.values()
    df_model['RMSE'] = rmse.values()
    df_model['R2'] = r2.values()
    
    return df_model

In [29]:
adv_prediction_model_test(1,'bikes')

Unnamed: 0,MAE,RMSE,R2
HistGradientBoostingRegressor,3.875408,5.019191,0.514013
GradientBoostingRegressor,5.586173,6.806176,-0.350998
RandomForestRegressor,1.517412,2.784411,0.900774
KNeighborsRegressor,4.819731,6.385349,0.238976
StackingRegressor,5.388394,6.777051,-0.025784
VotingRegressor,3.540079,4.48983,0.583613


In [69]:
def advanced_prediction_model(station_number,target,dummies=False):
    df = minimised_df(station_number,target,0,dummies)

    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    # Fit the classifier

    model3 = Pipeline([ ('feature_scaler', StandardScaler()),
                  ('feature_selection', SelectFromModel(RidgeCV())), 
                   ('fpoly', PolynomialFeatures(degree=3)),
                   ('svr', RidgeCV())
                   ])

    model2 = Pipeline([ ('feature_scaler', StandardScaler()),
                  #('feature_selection', SelectFromModel(RidgeCV())), 
                   #('fpoly', PolynomialFeatures(degree=3)),
                   ('svr', SVR(kernel='rbf'))
                   ])

    model = Pipeline([ #('feature_scaler', StandardScaler()),
                   #('learner', RandomForestRegressor())
                ('learner', KNeighborsRegressor(n_neighbors=8,algorithm="ball_tree",p=3,n_jobs=-1))
                   ])
    
    model.fit(X_train, y_train)
        
    # Make predictions
    predictions = model.predict(X_test)

    actual_vs_predicted = pd.concat([y_test, pd.DataFrame([round(x) for x in predictions], columns=['Predicted'],index=y_test.index)], axis=1)

    printMetrics(y_test, predictions)
    
    return actual_vs_predicted

In [70]:
advanced_prediction_model(1,'bikes',True)


MAE:  6.733891213389121
RMSE:  8.069327288382178
R2:  0.2889218950100567


Unnamed: 0,bikes,Predicted
8031,20,9
7490,25,21
11586,0,6
8464,1,12
479,7,6
...,...,...
4837,14,6
1684,14,3
10865,2,14
5898,22,22


In [16]:
df = minimised_df(1,"Bikes",0,True)

In [17]:
def generate_models():
    number_of_stations = count_stations()

    for i in range(1,number_of_stations+1):
        advanced_prediction_model(i,'bikes',True)

In [23]:
import pickle

def generate_models(target):
    """Options for target: "bikes" or "stands"""""
    number_of_stations = count_stations()

    for i in range(1,number_of_stations+1):
        df = minimised_df(i,target,0,True)

        if not df.empty:
            features = list(df.columns)
            features.remove(target)
    
            X = df[features]
            y = df[target]
        
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
            # Fit the classifier
        
            model = Pipeline([('learner', RandomForestRegressor())])
            
            model.fit(X_train, y_train)
    
            with open(f'pickle_files/{target}_{i}.pkl', 'wb') as handle:
                pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)

    print("Models have been serialised")

In [24]:
generate_models("bikes")

Models have been serialised


In [35]:
def check_feature_significance(station_number,target):
    df = minimised_df(station_number,target,0,True)

    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    # Fit the classifier

    model = RandomForestRegressor()
    
    model.fit(X_train, y_train)
        
    # Make predictions
    predictions = model.predict(X_test)

    important_features_dict = {}
    for idx, val in enumerate(model.feature_importances_):
        important_features_dict[idx] = val
    
    important_features_list = sorted(important_features_dict,
                                     key=important_features_dict.get,
                                     reverse=True)

    print("Features ordered by significance:")
    for i in range(len(important_features_list)):
        print(features[important_features_list[i]],", corelation with output:",important_features_dict[important_features_list[i]])

In [36]:
check_feature_significance(1,"bikes")

Features ordered by significance:
temperature , corelation with output: 0.30757167733753227
day_of_week , corelation with output: 0.25658810792138426
hour , corelation with output: 0.17278403917957064
wind_speed , corelation with output: 0.14155436570477853
clear sky , corelation with output: 0.04193217365647946
minute , corelation with output: 0.02086720247857264
overcast clouds , corelation with output: 0.014496312105305892
few clouds , corelation with output: 0.013879351960026048
broken clouds , corelation with output: 0.01366036291985761
scattered clouds , corelation with output: 0.008545489936281405
rainfall , corelation with output: 0.0034357953424678754
haze , corelation with output: 0.002655123756397809
mist , corelation with output: 0.0011821059160204042
light rain , corelation with output: 0.000437695404156342
moderate rain , corelation with output: 0.00038041127678560083
heavy intensity rain , corelation with output: 1.845862237809948e-05
fog , corelation with output: 1.1257