In [None]:
# Install the pygeohash library, which allows encoding and decoding of geohashes (used for spatial indexing and grouping)
!pip install pygeohash

In [None]:
# Import Required Libraries
# os is used for handling file paths and directory operations
import os
# pandas is used for data manipulation and analysis using DataFrames
import pandas as pd
# numpy provides support for numerical operations and array handling
import numpy as np
# geopandas extends pandas to handle geospatial data using shapefiles, geometry columns, etc.
import geopandas as gpd
# KMeans (centroid-based) and DBSCAN (density-based) are clustering algorithms from scikit-learn
from sklearn.cluster import KMeans, DBSCAN
# MinMaxScaler is used to scale features to a specified range, typically [0, 1]
from sklearn.preprocessing import MinMaxScaler
# mean_absolute_error is used to evaluate prediction performance by calculating average absolute errors
from sklearn.metrics import mean_absolute_error
# great_circle is used to calculate the shortest distance between two points on the Earth's surface
from geopy.distance import great_circle
# pygeohash is used for encoding latitude and longitude into geohash strings for spatial grouping
import pygeohash as pgh
# folium is used for interactive map visualization in Python
import folium
# MarkerCluster allows grouping markers in folium maps for better readability
from folium.plugins import MarkerCluster
# matplotlib is a standard plotting library for creating static, animated, and interactive visualizations
import matplotlib.pyplot as plt
# seaborn is a statistical data visualization library based on matplotlib, providing enhanced plots
import seaborn as sns
# shapely.geometry.Point is used to create point objects for spatial operations
from shapely.geometry import Point
# nearest_points is used to find the nearest point between two geometries
from shapely.ops import nearest_points
# google.colab.drive is used to mount Google Drive in Colab to access files stored there
from google.colab import drive
# Calculates the Silhouette Score, which is used to evaluate the quality of clustering results. A higher score indicates better-defined clusters.
from sklearn.metrics import silhouette_score
# Splits the dataset into training and testing subsets. This is typically used in machine learning to evaluate model performance by training on one set and testing on another.
from sklearn.model_selection import train_test_split
# Import folium for interactive map visualization of spatial data
import folium

In [None]:
# Step 1: Mount Drive & Load AQ and Taxi Data
# Mount Google Drive so the dataset files stored there can be read
drive.mount('/content/drive')
# Directory on the mounted Drive where the datasets are stored
base_path = '/content/drive/MyDrive/Datasets/'

# Mapping from the source column names to the standardized names used below
aq_column_map = {
    'Latitude': 'latitude',
    'Longitude': 'longitude',
    'ReadingDateTimeUTC': 'timestamp',
    'PM25': 'pm25'
}
# The only columns kept from each AQ file, in output order
aq_columns = ['latitude', 'longitude', 'timestamp', 'pm25']

# Collect one DataFrame per AQ CSV file that is actually present
aq_dfs = []
# Files are numbered AQ_data_1.csv through AQ_data_19.csv
for i in range(1, 20):
    file_path = os.path.join(base_path, f"AQ_data_{i}.csv")

    # A missing file is reported and skipped rather than treated as fatal
    if not os.path.exists(file_path):
        print(f" File not found: {file_path}")
        continue

    # Read the file and standardize its column names
    df = pd.read_csv(file_path).rename(columns=aq_column_map)

    # Fail with a message that names the offending file; a bare column
    # selection would raise a KeyError that does not say which file is malformed
    missing_columns = [col for col in aq_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{file_path} is missing expected column(s) after renaming: {missing_columns}"
        )

    # Keep only the columns needed for analysis
    df = df[aq_columns]

    # Report how many records came from this file
    print(f" Loaded AQ_data_{i}.csv with {len(df)} records")

    aq_dfs.append(df)

# pd.concat raises an uninformative "No objects to concatenate" ValueError on an
# empty list, so stop here with a message that points at the likely cause
if not aq_dfs:
    raise FileNotFoundError(
        f"No AQ_data_<n>.csv files were found in {base_path}; "
        "check that Google Drive is mounted and the dataset path is correct."
    )

# Combine all individual AQ DataFrames into a single DataFrame with a fresh index
aq_df = pd.concat(aq_dfs, ignore_index=True)

# Display a preview of the first few records of the combined AQ data
print(" Preview of Combined AQ Data:")
print(aq_df.head())

# Print the total number of AQ records loaded
print(f" Total AQ Records Loaded: {len(aq_df)}")

# Load All Taxi Trips Data
# Source column name -> standardized name; this single mapping drives both the
# column selection at read time and the rename afterwards
taxi_column_map = {
    'Trip Start Timestamp': 'trip_start_timestamp',
    'Trip Seconds': 'trip_seconds',
    'Trip Miles': 'trip_miles',
    'Pickup Centroid Latitude': 'pickup_latitude',
    'Pickup Centroid Longitude': 'pickup_longitude',
}

# Full path to the Taxi_Trips dataset inside the datasets directory
taxi_file_path = os.path.join(base_path, 'Taxi_Trips.csv')

# Read only the mapped columns so unused columns are never loaded into memory
taxi_df = pd.read_csv(taxi_file_path, usecols=list(taxi_column_map))

# Switch to the standardized names for consistency with the other datasets
taxi_df = taxi_df.rename(columns=taxi_column_map)

# Show the first few taxi records
print("\n Preview of Taxi Trips Data:")
print(taxi_df.head())

# Report how many taxi trip records were loaded
print(f" Total Taxi Records Loaded: {len(taxi_df)}")
