In [None]:
# Install the pygeohash library, which allows encoding and decoding of geohashes (used for spatial indexing and grouping)
!pip install pygeohash

In [None]:
# Import Required Libraries
# os is used for handling file paths and directory operations
import os
# pandas is used for data manipulation and analysis using DataFrames
import pandas as pd
# numpy provides support for numerical operations and array handling
import numpy as np
# geopandas extends pandas to handle geospatial data using shapefiles, geometry columns, etc.
import geopandas as gpd
# KMeans (centroid-based) and DBSCAN (density-based) are clustering algorithms from scikit-learn
from sklearn.cluster import KMeans, DBSCAN
# MinMaxScaler is used to scale features to a specified range, typically [0, 1]
from sklearn.preprocessing import MinMaxScaler
# mean_absolute_error is used to evaluate prediction performance by calculating average absolute errors
from sklearn.metrics import mean_absolute_error
# great_circle is used to calculate the shortest distance between two points on the Earth's surface
from geopy.distance import great_circle
# pygeohash is used for encoding latitude and longitude into geohash strings for spatial grouping
import pygeohash as pgh
# folium is used for interactive map visualization in Python
import folium
# MarkerCluster allows grouping markers in folium maps for better readability
from folium.plugins import MarkerCluster
# matplotlib is a standard plotting library for creating static, animated, and interactive visualizations
import matplotlib.pyplot as plt
# seaborn is a statistical data visualization library based on matplotlib, providing enhanced plots
import seaborn as sns
# shapely.geometry.Point is used to create point objects for spatial operations
from shapely.geometry import Point
# nearest_points is used to find the nearest point between two geometries
from shapely.ops import nearest_points
# google.colab.drive is used to mount Google Drive in Colab to access files stored there
from google.colab import drive
# Calculates the Silhouette Score, which is used to evaluate the quality of clustering results. A higher score indicates better-defined clusters.
from sklearn.metrics import silhouette_score
# Splits the dataset into training and testing subsets. This is typically used in machine learning to evaluate model performance by training on one set and testing on another.
from sklearn.model_selection import train_test_split
# Import folium for interactive map visualization of spatial data
import folium

In [None]:
# Step 1: Mount Drive & Load AQ and Taxi Data
# Mount Google Drive so the dataset files stored there become readable
drive.mount('/content/drive')
# Directory on the mounted Drive that holds the input CSV files
base_path = '/content/drive/MyDrive/Datasets/'

# Collect one standardized DataFrame per AQ CSV file that is found
aq_dfs = []
# Files are expected to be named AQ_data_1.csv ... AQ_data_19.csv
for i in range(1, 20):
    file_path = os.path.join(base_path, f"AQ_data_{i}.csv")

    # Missing files are reported and skipped so a partial dataset still loads
    if not os.path.exists(file_path):
        print(f" File not found: {file_path}")
        continue

    df = pd.read_csv(file_path)

    # Standardize the source column names to the lowercase names used downstream
    df = df.rename(columns={
        'Latitude': 'latitude',
        'Longitude': 'longitude',
        'ReadingDateTimeUTC': 'timestamp',
        'PM25': 'pm25'
    })

    # Keep only the columns needed for analysis (raises KeyError if one is absent)
    df = df[['latitude', 'longitude', 'timestamp', 'pm25']]

    print(f" Loaded AQ_data_{i}.csv with {len(df)} records")
    aq_dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" error on an empty list,
# so fail early with a message that points at the actual problem
if not aq_dfs:
    raise FileNotFoundError(
        f"No AQ_data_<n>.csv files were found in {base_path}; "
        "check that Google Drive is mounted and that base_path is correct."
    )

# Combine all individual AQ DataFrames into a single DataFrame with a fresh index
aq_df = pd.concat(aq_dfs, ignore_index=True)

# Display a preview of the first few records of the combined AQ data
print(" Preview of Combined AQ Data:")
print(aq_df.head())

# Print the total number of AQ records loaded
print(f" Total AQ Records Loaded: {len(aq_df)}")

# Load All Taxi Trips Data
# Source column name -> standardized name; the keys also select which columns
# are read from disk, which keeps memory usage down
taxi_column_map = {
    'Trip Start Timestamp': 'trip_start_timestamp',
    'Trip Seconds': 'trip_seconds',
    'Trip Miles': 'trip_miles',
    'Pickup Centroid Latitude': 'pickup_latitude',
    'Pickup Centroid Longitude': 'pickup_longitude',
}

# Full path of the taxi trips file inside the dataset directory
taxi_file_path = os.path.join(base_path, 'Taxi_Trips.csv')

# Read only the mapped columns, then switch to the standardized names
taxi_df = pd.read_csv(taxi_file_path, usecols=list(taxi_column_map))
taxi_df = taxi_df.rename(columns=taxi_column_map)

# Preview the first rows and report how many trips were loaded
print("\n Preview of Taxi Trips Data:")
print(taxi_df.head())
print(f" Total Taxi Records Loaded: {len(taxi_df)}")


In [None]:
# Step 2: Convert Timestamps & Assign 5-Hour Time Blocks
# Convert AQ 'timestamp' column from string format to pandas datetime format
aq_df['timestamp'] = pd.to_datetime(aq_df['timestamp'])

# Print a few converted AQ timestamps to verify the format
print("AQ timestamps converted:")
print(aq_df[['timestamp']].head())

# Convert Taxi 'trip_start_timestamp' column from string to datetime format
# NOTE(review): the AQ timestamps come from a column named 'ReadingDateTimeUTC';
# confirm the taxi timestamps are in the same time zone, otherwise the time
# blocks of the two datasets will not line up when they are merged.
taxi_df['trip_start_timestamp'] = pd.to_datetime(taxi_df['trip_start_timestamp'])

# Print a few converted Taxi timestamps to verify the format
print("\nTaxi trip timestamps converted:")
print(taxi_df[['trip_start_timestamp']].head())

# Floor each AQ timestamp to the start of its 5-hour block.
# The lowercase 'h' alias is used because uppercase 'H' is deprecated in pandas >= 2.2.
# Blocks are multiples of 5 hours counted from the Unix epoch, so because 24 is
# not a multiple of 5 the block boundaries shift from one day to the next
# (00:00, 05:00, 10:00, 15:00, 20:00, then 01:00 on the following day, ...).
aq_df['time_block'] = aq_df['timestamp'].dt.floor('5h')

# Show the original and the floored 5-hour timestamps for AQ data
print("\nAQ data after assigning to 5-hour time blocks:")
print(aq_df[['timestamp', 'time_block']].head())

# Floor each Taxi trip timestamp to the same 5-hour grid (floor, not nearest)
taxi_df['time_block'] = taxi_df['trip_start_timestamp'].dt.floor('5h')

# Show the original and the floored 5-hour timestamps for Taxi data
print("\nTaxi data after assigning to 5-hour time blocks:")
print(taxi_df[['trip_start_timestamp', 'time_block']].head())

In [None]:
# Step 3: Geohash Grouping and Feature Aggregation
# Define a helper function to compute geohashes based on latitude and longitude
def geohash_group(df, lat_col, lon_col, precision=5):
    """
    Adds a 'geohash' column to the DataFrame based on latitude and longitude.

    The column is added to ``df`` in place and the same object is returned,
    so both ``geohash_group(df, ...)`` and ``df = geohash_group(df, ...)``
    leave ``df`` with the new column.

    Rows whose latitude or longitude is missing (NaN/None) receive a missing
    geohash (None) instead of being encoded, so they cannot be assigned to a
    meaningless cell; pandas group-bys drop such keys by default.

    Args:
        df (pd.DataFrame): Input DataFrame (modified in place)
        lat_col (str): Name of the latitude column
        lon_col (str): Name of the longitude column
        precision (int): Geohash precision level (higher = smaller area)

    Returns:
        pd.DataFrame: The input DataFrame with the 'geohash' column added
    """
    # Pull the two coordinate columns out once and iterate over them in
    # lockstep; this avoids building a Series per row as df.apply(axis=1)
    # does and also works when the DataFrame has no rows.
    latitudes = df[lat_col].tolist()
    longitudes = df[lon_col].tolist()

    df['geohash'] = [
        pgh.encode(lat, lon, precision=precision)
        if pd.notna(lat) and pd.notna(lon)
        else None
        for lat, lon in zip(latitudes, longitudes)
    ]
    return df

# Tag every AQ reading and every taxi pickup with the geohash cell of its coordinates
aq_df = geohash_group(aq_df, 'latitude', 'longitude')
taxi_df = geohash_group(taxi_df, 'pickup_latitude', 'pickup_longitude')

# Print a small sample of the geohash assignments from each dataset
geohash_samples = [
    (" Sample AQ geohashes:", aq_df, ['latitude', 'longitude', 'geohash']),
    ("\n Sample Taxi geohashes:", taxi_df, ['pickup_latitude', 'pickup_longitude', 'geohash']),
]
for sample_title, sample_frame, sample_columns in geohash_samples:
    print(sample_title)
    print(sample_frame.loc[:, sample_columns].head())

# Both datasets are aggregated and joined on the same space-time cell
cell_keys = ['geohash', 'time_block']

# Taxi density: number of trips that started in each cell
taxi_density = taxi_df.groupby(cell_keys).size().rename('taxi_density').reset_index()
print("\n Taxi density (trips per geohash and time block):")
print(taxi_density.head())

# Air quality: mean PM2.5 of the readings that fall in each cell
aq_avg = aq_df.groupby(cell_keys)['pm25'].mean().reset_index()
print("\n Average PM2.5 per geohash and time block:")
print(aq_avg.head())

# Inner join: keep only the cells that have both taxi trips and AQ readings
features_df = taxi_density.merge(aq_avg, on=cell_keys, how='inner')

# Preview the combined feature table and report its size
print("\n Merged Features DataFrame (Taxi + AQ):")
print(features_df.head())
print(f" Total Records in Feature Set: {len(features_df)}")