In [39]:
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import dbManager  # Assuming dbManager is properly set up for using SQLAlchemy


In [40]:
load_dotenv('.env')

DB_PASSWORD = os.getenv("DB_PASSWORD")
URI = 'dublinbikes.clw8uqmac8qf.eu-west-1.rds.amazonaws.com'
PORT = 3306
USER = 'admin'
DB = 'dbikes'

# Connect to the db
connection_string = f"mysql+mysqlconnector://{USER}:{DB_PASSWORD}@{URI}:{PORT}/{DB}"
engine = create_engine(connection_string, echo=True)


# Testing
try:
    connection = engine.connect()
    print("Connection established successfully.")

except Exception as e:
    print("Failed to establish connection:", e)


2024-04-05 15:47:11,059 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2024-04-05 15:47:11,061 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-04-05 15:47:11,226 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2024-04-05 15:47:11,228 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-04-05 15:47:11,295 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2024-04-05 15:47:11,297 INFO sqlalchemy.engine.Engine [raw sql] {}
Connection established successfully.


In [41]:
import pandas as pd

def get_station_data():
    query = f"SELECT * FROM availability"
    df = pd.read_sql(query, engine)
    return df


In [42]:
def get_weather_data():
    query = "SELECT * FROM currentweather"
    df = pd.read_sql(query, engine)
    return df


In [43]:
def create_weather_df():
    weather_df = get_weather_data()
    weather_df['timestamp'] = pd.to_datetime(weather_df['timestamp'], unit='s')
    # Create a merge key that includes up to the minute
    weather_df['merge_key'] = weather_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M')
    return weather_df

def availability_df():
    availability_df = get_station_data()
    availability_df['timestamp'] = pd.to_datetime(availability_df['timestamp'], unit='s')
    # Create a merge key that includes up to the minute
    availability_df['merge_key'] = availability_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M')
    return availability_df


def merge_dfs(weather_df):
    availability = availability_df()
    # Merge on the new merge_key
    merged_df = pd.merge(weather_df, availability, on='merge_key', how='inner')
    # Optionally, convert merge_key back to datetime for further time-based analysis
    merged_df['timestamp'] = pd.to_datetime(merged_df['merge_key'])
    return merged_df


In [44]:
merged_df = merge_dfs(create_weather_df())

2024-04-05 15:47:11,763 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-04-05 15:47:11,764 INFO sqlalchemy.engine.Engine DESCRIBE `dbikes`.`SELECT * FROM currentweather`
2024-04-05 15:47:11,764 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-04-05 15:47:11,841 INFO sqlalchemy.engine.Engine SELECT * FROM currentweather
2024-04-05 15:47:11,842 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-04-05 15:47:12,421 INFO sqlalchemy.engine.Engine ROLLBACK
2024-04-05 15:47:12,516 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-04-05 15:47:12,516 INFO sqlalchemy.engine.Engine DESCRIBE `dbikes`.`SELECT * FROM availability`
2024-04-05 15:47:12,517 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-04-05 15:47:12,644 INFO sqlalchemy.engine.Engine SELECT * FROM availability
2024-04-05 15:47:12,644 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-04-05 15:48:33,402 INFO sqlalchemy.engine.Engine ROLLBACK


In [45]:
# Print the total number of rows in merged_df
print(f"Total number of rows in merged_df: {merged_df.shape[0]}")

# Print the number of rows for each station
rows_per_station = merged_df.groupby('number').size()
print("\nNumber of rows for each station:")

for number, count in rows_per_station.items():
    print(f"Station {number}: {count} rows")


Total number of rows in merged_df: 1199508

Number of rows for each station:
Station 1: 10522 rows
Station 2: 10522 rows
Station 3: 10522 rows
Station 4: 10522 rows
Station 5: 10522 rows
Station 6: 10522 rows
Station 7: 10522 rows
Station 8: 10522 rows
Station 9: 10522 rows
Station 10: 10522 rows
Station 11: 10522 rows
Station 12: 10522 rows
Station 13: 10522 rows
Station 14: 10522 rows
Station 15: 10522 rows
Station 16: 10522 rows
Station 17: 10522 rows
Station 18: 10522 rows
Station 19: 10522 rows
Station 20: 10522 rows
Station 21: 10522 rows
Station 22: 10522 rows
Station 23: 10522 rows
Station 24: 10522 rows
Station 25: 10522 rows
Station 26: 10522 rows
Station 27: 10522 rows
Station 28: 10522 rows
Station 29: 10522 rows
Station 30: 10522 rows
Station 31: 10522 rows
Station 32: 10522 rows
Station 33: 10522 rows
Station 34: 10522 rows
Station 35: 10522 rows
Station 36: 10522 rows
Station 37: 10522 rows
Station 38: 10522 rows
Station 39: 10522 rows
Station 40: 10522 rows
Station 41: 

In [46]:
# Feature Engineering: Add day of the week and hour as features
merged_df['day_of_week'] = merged_df['timestamp'].dt.dayofweek
merged_df['hour'] = merged_df['timestamp'].dt.hour

# Select features and target for the model
features = ['temperature', 'rainfall', 'day_of_week', 'hour']  # Example feature set
target = 'bikes'  # Target variable


In [47]:
from sklearn.model_selection import train_test_split

# Assuming no missing values or categorical variables for simplicity
X = merged_df[features]
y = merged_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
