In [None]:
import os
import pandas as pd
import urllib.request
import zipfile
from datetime import datetime, timedelta
import requests


def generate_urls(start_year, end_year):
    """Build the monthly "JC" Citi Bike trip-data archive URLs for a span of years.

    Args:
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        A list of URL strings, one for each month 01-12 of every year in the
        span, in chronological order. The list is empty when ``start_year``
        is greater than ``end_year``.
    """
    return [
        f'https://s3.amazonaws.com/tripdata/JC-{year}{month:02d}-citibike-tripdata.csv.zip'
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
    ]

def download_and_extract_data(urls, save_path):
    """Download zipped trip-data CSVs, extract them, and merge them into one frame.

    Each URL is fetched with ``requests``; the archive is stored in and
    extracted to ``save_path``, and the extracted CSV (assumed to carry the
    archive's file name minus ``.zip``) is loaded. January archives are
    extracted but left out of the merged frame so they can be treated
    separately. A failure on one URL is printed and does not stop the
    remaining downloads.

    Args:
        urls: Iterable of archive URLs whose file names look like
            ``JC-YYYYMM-citibike-tripdata.csv.zip``.
        save_path: Directory that receives the archives, the extracted CSVs
            and the merged ``annual_data.csv``; created if it does not exist.

    Returns:
        DataFrame with the rows of every successfully loaded month (empty if
        nothing was loaded). The same data is written to ``annual_data.csv``
        inside ``save_path``.
    """
    os.makedirs(save_path, exist_ok=True)

    # Collect the monthly frames and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all previous rows on every pass.
    monthly_frames = []

    for url in urls:
        file_name = url.split('/')[-1]
        # Keep the archive next to its extracted contents rather than in
        # whatever the current working directory happens to be.
        zip_path = os.path.join(save_path, file_name)
        try:
            response = requests.get(url, timeout=60)
            # Without this check an HTTP error page would be saved as a "zip".
            response.raise_for_status()
            with open(zip_path, 'wb') as zip_file:
                zip_file.write(response.content)

            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(save_path)

            # The month is the MM part of the YYYYMM token in the file name.
            # January is extracted above but deliberately not merged.
            if int(file_name.split("-")[1][4:6]) == 1:
                continue

            csv_path = os.path.join(save_path, file_name.replace(".zip", ""))
            monthly_frames.append(pd.read_csv(csv_path))
        except Exception as e:
            # Best effort: report the problem and carry on with the next URL.
            print(f'Error: {e}')

    # pd.concat raises on an empty list, so fall back to an empty frame.
    if monthly_frames:
        final_df = pd.concat(monthly_frames, axis=0, sort=False)
    else:
        final_df = pd.DataFrame()

    # os.path.join keeps the output inside save_path on every platform; a
    # hard-coded backslash only acts as a separator on Windows.
    final_df.to_csv(os.path.join(save_path, "annual_data.csv"), index=False)
    return final_df




# Year range to fetch (both ends inclusive); currently the single year 2021
start_year = end_year = 2021

# Directory handed to download_and_extract_data as its output location
save_path = r'D:\Users\Awumboro\Desktop\Projects\Coding\python\Transport Modelling'

# Build the monthly archive URLs, then download, extract and merge them
urls = generate_urls(start_year, end_year)
final_df = download_and_extract_data(urls, save_path)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

trips_path = r"D:\Users\Awumboro\Desktop\Projects\Coding\python\Transport Modelling\annual_data.csv"
coordinate_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng']

# Read the merged trip file and discard rows that lack any start/end coordinate.
# (Alternative when the download cell ran in this session: df = final_df)
df = pd.read_csv(trips_path).dropna(subset=coordinate_columns)

# Parse both trip timestamps so the .dt accessor can be used on them
for timestamp_column in ('started_at', 'ended_at'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])


In [None]:
# Lift the column display cap so the preview shows every column
pd.set_option('display.max_columns', None)

# Preview the first five rows (head() defaults to 5)
df.head()

## ANALYSES

### Spatial Analysis

In [None]:
import matplotlib.pyplot as plt

# Label every trip with the name of the weekday it started on
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day_of_week'] = df['started_at'].dt.weekday.apply(lambda x: days[x])

# Step 1: trips per calendar date and rider type.
# An average needs several observations per weekday; the daily totals are
# those observations. Averaging one per-weekday total would return the total.
trip_date = df['started_at'].dt.normalize().rename('date')
daily_counts = (
    df.groupby([trip_date, 'day_of_week', 'member_casual'])
      .size()
      .reset_index(name='count')
)

# Step 2: mean of the daily totals for each weekday and rider type.
# NOTE: dates on which a rider type made no trips at all do not appear in
# daily_counts and therefore do not pull the average down.
df_grouped = (
    daily_counts.groupby(['day_of_week', 'member_casual'])['count']
                .mean()
                .reset_index()
)

# One row per weekday in calendar order (groupby sorts the names
# alphabetically), one column per rider type.
avg_trips = (
    df_grouped.pivot(index='day_of_week', columns='member_casual', values='count')
              .reindex(index=days, columns=['member', 'casual'])
              .rename(columns={'member': 'Members', 'casual': 'Casual Riders'})
)

# Side-by-side bars: drawing both series at the same x positions would let
# one rider type hide the other.
fig, ax = plt.subplots(figsize=(10, 6))
avg_trips.plot(kind='bar', ax=ax, color=['blue', 'orange'], rot=0)

ax.set_xlabel('Day of Week')
ax.set_ylabel('Average Number of Trips')
ax.set_title('Average Number of Trips by Day of Week and Member Type')
ax.legend()

plt.show()


In [None]:
# Number of rides (non-null ride_id values) that started in each calendar month
monthly_ridership = df.groupby(df['started_at'].dt.month)['ride_id'].count()

# Bar chart with one bar per month number
fig, ax = plt.subplots(figsize=(10, 6))
monthly_ridership.plot(kind='bar', ax=ax)

ax.set_title("Ridership per Month")
ax.set_xlabel("Month")
ax.set_ylabel("Ridership Count")

plt.show()


### Temporal Analysis

In [None]:
# Station Popularity analysis

In [None]:
# Most popular start stations, ranked by the number of trips that began there
top_n = 15
top_stations = df['start_station_name'].value_counts().nlargest(top_n)

fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(top_stations.index.to_list(), top_stations.values)
ax.set_xlabel('Counts')
ax.set_ylabel('Start Station Name')
# Derive the title from top_n so it cannot drift from the number of bars shown
ax.set_title(f'Top {top_n} Popular Start Stations')

plt.show()

In [None]:
# Most popular end stations, ranked by the number of trips that finished there
top_n = 15
top_stations = df['end_station_name'].value_counts().nlargest(top_n)
station_names = top_stations.index.to_list()

# Horizontal bars, one per station
fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(station_names, top_stations.values)
ax.set(
    title="Top {} Ending Stations".format(top_n),
    xlabel="Number of Rides",
    ylabel="Station Name",
)
plt.yticks(rotation=0)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Hour of the day (0-23) in which each trip started
df['hour_of_day'] = df['started_at'].dt.hour

# Trip length in seconds
df['trip_duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds()

# Mean trip length per start hour, still in seconds
hourly_duration = df.groupby('hour_of_day')['trip_duration'].mean()

# The y-axis is labelled in minutes, so convert from seconds before plotting
hourly_duration_minutes = hourly_duration / 60

plt.figure(figsize=(10, 6))
plt.plot(hourly_duration_minutes)

plt.xlabel('Hour of the Day')
plt.ylabel('Average Trip Duration (minutes)')
plt.title('Average Trip Duration by Hour of the Day')

plt.show()


In [None]:
# The ten start stations with the most trips
top_stations = df['start_station_name'].value_counts().nlargest(10)

# Keep only trips that began at those stations, then count the non-null
# values of every column per station and calendar day
is_top_station = df['start_station_name'].isin(top_stations.index)
df_top_stations = (
    df[is_top_station]
    .groupby(['start_station_name', pd.Grouper(key='started_at', freq='D')])
    .count()
)

# Daily ride counts, indexed by (station, day)
daily_rides = df_top_stations['ride_id']

# One line per station showing its daily number of rides
fig, ax = plt.subplots(figsize=(15, 8))
for station in top_stations.index:
    ax.plot(daily_rides.loc[station], label=station)

ax.set_xlabel('Time')
ax.set_ylabel('Number of Rides')
ax.legend(loc='best')
ax.set_title('Rides over Time for Top 10 Stations')
plt.show()


In [None]:
# Make sure the start timestamp is a datetime column before using .dt
df['started_at'] = pd.to_datetime(df['started_at'])

# Number of rides that started in each hour of the day
rides_by_hour = df.groupby(df['started_at'].dt.hour).size()

hours = rides_by_hour.index
hour_labels = [f'{hour}:00' for hour in hours]

# One bar per hour, labelled "H:00"
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(hours, rides_by_hour.values)
ax.set_xticks(hours)
ax.set_xticklabels(hour_labels, rotation=45, ha='right')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Number of Rides')
ax.set_title('Rides vs Hour of the Day')
plt.show()


In [None]:
# Number of rides per weekday number (0 = Monday ... 6 = Sunday)
rides_by_day = df.groupby(df['started_at'].dt.dayofweek).size()

# Short labels matching weekday numbers 0-6
days_of_week = ['Mon', 'Tue', "Wed", 'Thurs', 'Fri', 'Sat', 'Sun']

# One bar per weekday
fig, ax = plt.subplots(figsize=(10, 6))
rides_by_day.plot(kind='bar', ax=ax)
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Number of Rides')
ax.set_title('Rides vs Day of the Week')
# Swap the numeric tick labels for the weekday names
ax.set_xticks(rides_by_day.index)
ax.set_xticklabels(days_of_week, rotation=45)
plt.show()



In [None]:
# Rides started and rides ended in each hour of the day.
# Both series are reindexed to the full 0-23 range: the stacked bars pair
# values by position, so an hour missing from either series would otherwise
# shift the stack or make the two arrays differ in length.
hours = range(24)
rides_start = df.groupby(df['started_at'].dt.hour).size().reindex(hours, fill_value=0)
rides_end = df.groupby(df['ended_at'].dt.hour).size().reindex(hours, fill_value=0)

# Stacked bar chart: ended rides sit on top of started rides for the same hour
plt.figure(figsize=(10, 6))
plt.bar(rides_start.index, rides_start.values, label='Rides Started')
plt.bar(rides_end.index, rides_end.values, bottom=rides_start.values, label='Rides Ended')

# Label every hour as "H:00"
plt.xticks(rides_start.index, [f'{hour}:00' for hour in rides_start.index], rotation=45)
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Rides')
plt.title('Rides Started and Ended by Hour of the Day')
plt.legend()
plt.show()

In [None]:
# Areas with ride activities
import folium
from folium.plugins import FastMarkerCluster

# One [lat, lng, rider type] entry per trip start. Taking the values of the
# column slice yields the same nested list as a row-by-row apply, without
# running a Python lambda for every row.
coords = df[['start_lat', 'start_lng', 'member_casual']].values.tolist()

# Map centred on the mean start position
m = folium.Map(location=[df['start_lat'].mean(), df['start_lng'].mean()], zoom_start=13)

# Clustered markers for the trip starts. `overlay` is a boolean layer flag
# (overlay layer vs. base layer), so pass True rather than a callable.
marker_cluster = FastMarkerCluster(coords, overlay=True).add_to(m)

# Display the map as the cell output
m


In [None]:
import folium
from folium.plugins import HeatMap

route_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng']

# Start and end coordinates of every trip
routes = df[route_columns]

# Number of trips for each distinct (start, end) coordinate pair
routes_counts = routes.groupby(route_columns).size().reset_index(name='counts')

# Heatmap input: [lat, lng, weight] rows placed at the start point of each
# route and weighted by that route's trip count
heatmap_data = routes_counts[['start_lat', 'start_lng', 'counts']].to_numpy().tolist()

# Map centred on the mean start position
map_center = [df['start_lat'].mean(), df['start_lng'].mean()]
m = folium.Map(location=map_center, zoom_start=13)

# Heat layer on top of the base map
HeatMap(heatmap_data, name='Routes', overlay=True).add_to(m)

# Display the map as the cell output
m


In [None]:
import folium
from folium.plugins import HeatMap


# Map centred on the mean start position
map_center = [df['start_lat'].mean(), df['start_lng'].mean()]
m = folium.Map(location=map_center, zoom_start=11)

# Number of trips for each distinct (start, end) coordinate pair
grouped_df = (
    df.groupby(['start_lat', 'start_lng', 'end_lat', 'end_lng'])
      .size()
      .reset_index(name='counts')
)

# One straight red line per route; line weight is the trip count divided by 100
for route in grouped_df.itertuples(index=False):
    start_point = [route.start_lat, route.start_lng]
    end_point = [route.end_lat, route.end_lng]
    folium.PolyLine(
        [start_point, end_point],
        color='red',
        weight=route.counts / 100,
        opacity=0.6,
    ).add_to(m)

# Display the map as the cell output
m
