In [1]:
import os

import pandas as pd
import requests
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# The OpenAQ API key is read from the environment so the secret is never
# committed with the notebook. Set it before starting Jupyter, e.g.
#   export OPENAQ_API_KEY=<your key>
# NOTE(review): a key that was ever hard-coded in this cell should be treated
# as compromised and rotated.
API_KEY = os.environ.get("OPENAQ_API_KEY", "")
if not API_KEY:
    print("Warning: OPENAQ_API_KEY is not set; OpenAQ requests are expected to fail.")

# OpenAQ location whose PM2.5 sensor is queried for the live reading.
LOCATION_ID = 8833

# CSV file holding the AQI breakpoint table used to label PM2.5 values.
aqi_breakpoints_file = "aqi_breakpoints.csv"

API TEST

In [3]:
# Step 1: Fetch real-time PM2.5 data from OpenAQ API
def _get_openaq_json(url, headers, timeout):
    """Issue a GET request and return the decoded JSON body, or None on failure.

    Failure covers network errors and timeouts, any non-200 status code, and a
    response body that cannot be decoded as JSON.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request to {url} failed: {exc}")
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError as exc:  # requests' JSON decode errors derive from ValueError
        print(f"Response from {url} was not valid JSON: {exc}")
        return None


def fetch_air_quality_data(api_key, location_id, timeout=10):
    """Look up the PM2.5 sensor at an OpenAQ location and return its daily mean.

    Args:
        api_key: Value sent in the ``X-API-Key`` request header.
        location_id: OpenAQ location identifier whose sensors are listed.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The ``value`` field of the first record returned by the sensor's
        ``/days`` endpoint, or ``None`` when any step fails (request error,
        no PM2.5 sensor, no daily records). A short message describing the
        outcome is printed in every case.
    """
    headers = {"X-API-Key": api_key}

    # List every sensor attached to the location.
    sensors_data = _get_openaq_json(
        f"https://api.openaq.org/v3/locations/{location_id}/sensors", headers, timeout
    )
    if sensors_data is None:
        print("Error fetching sensor data for the location.")
        return None

    # Keep only the sensors whose measured parameter is PM2.5.
    pm25_sensors = [
        sensor
        for sensor in sensors_data.get("results") or []
        if (sensor.get("parameter") or {}).get("name") == "pm25" and "id" in sensor
    ]
    if not pm25_sensors:
        print("No PM2.5 sensors found at this location.")
        return None

    print(f"Found {len(pm25_sensors)} PM2.5 sensor(s)")
    for sensor in pm25_sensors:
        print(f"Sensor ID: {sensor['id']}, Sensor Name: {sensor.get('name')}")

    # Get the daily mean PM2.5 value for the first sensor.
    sensor_id = pm25_sensors[0]["id"]
    daily_data = _get_openaq_json(
        f"https://api.openaq.org/v3/sensors/{sensor_id}/days", headers, timeout
    )
    if daily_data is None:
        print("Error fetching daily mean data")
        return None

    # NOTE(review): the first record is used as-is; the ordering of the /days
    # results is not established here - confirm it is the day you expect.
    daily_results = daily_data.get("results") or []
    daily_mean = daily_results[0].get("value") if daily_results else None
    if daily_mean is None:
        print("No daily data available for PM2.5.")
        return None

    print(f"Daily Mean PM2.5 Value: {daily_mean} µg/m³")
    return daily_mean


In [4]:
def generate_dynamic_data_from_dataset(aqi_breakpoints_file, pm25_dataset_file, parameter="PM2.5 - Local Conditions"):
    """Label every row of a PM2.5 dataset with its AQI category.

    Args:
        aqi_breakpoints_file: CSV with the columns "Parameter",
            "Low Breakpoint", "High Breakpoint" and "AQI Category".
        pm25_dataset_file: CSV with at least the columns "PM2.5 Mean" and "Date".
        parameter: Value of the "Parameter" column selecting which breakpoint
            rows apply.

    Returns:
        The PM2.5 DataFrame with an added "RiskLevel" column. A value gets the
        category of the first breakpoint row whose inclusive [low, high] range
        contains it, or "Out of Range" when no row matches.

    Raises:
        ValueError: If either file lacks its required columns, or no
            breakpoint row exists for ``parameter``.
    """
    breakpoint_table = pd.read_csv(aqi_breakpoints_file)
    readings = pd.read_csv(pm25_dataset_file)

    # Validate the breakpoint table schema.
    expected_columns = ["Parameter", "Low Breakpoint", "High Breakpoint", "AQI Category"]
    absent = [name for name in expected_columns if name not in breakpoint_table.columns]
    if absent:
        raise ValueError(f"The AQI breakpoints CSV file must contain the following columns: {expected_columns}")

    # Validate the PM2.5 dataset schema.
    if not {"PM2.5 Mean", "Date"}.issubset(readings.columns):
        raise ValueError("The PM2.5 dataset must contain 'PM2.5 Mean' and 'Date' columns.")

    # Restrict the table to the breakpoints of the requested parameter.
    bands = breakpoint_table.loc[breakpoint_table["Parameter"] == parameter]
    if bands.empty:
        raise ValueError(f"No data found for the specified parameter: {parameter}")

    def categorize(concentration):
        """Return the AQI category whose inclusive range holds the value."""
        inside = bands["Low Breakpoint"].le(concentration) & bands["High Breakpoint"].ge(concentration)
        hits = bands[inside]
        if hits.empty:
            return "Out of Range"  # value falls outside every defined breakpoint
        return hits.iloc[0]["AQI Category"]

    readings["RiskLevel"] = [categorize(value) for value in readings["PM2.5 Mean"]]
    return readings


In [5]:
# Step 3: Train the KNN classifier
def train_classifier(data, n_neighbors=5, test_size=0.2, random_state=42):
    """Fit a k-nearest-neighbours classifier mapping PM2.5 to a risk level.

    Args:
        data: DataFrame with a "PM2.5 Mean" feature column and a "RiskLevel"
            label column.
        n_neighbors: Number of neighbours used by the classifier.
        test_size: Fraction of rows held out for the accuracy check.
        random_state: Seed for the train/test split, for reproducible runs.

    Returns:
        The fitted ``KNeighborsClassifier``. The hold-out accuracy is printed.

    Note:
        When the labels were derived from the same PM2.5 values by fixed
        thresholds, a hold-out accuracy near 1.00 only shows the model has
        recovered those thresholds, not that it has predictive skill.
    """
    feature_column = "PM2.5 Mean"
    label_column = "RiskLevel"

    # KNN cannot be fitted on missing values, so discard incomplete rows
    # instead of letting fit() raise. The caller's frame is left untouched.
    usable = data.dropna(subset=[feature_column, label_column])
    dropped = len(data) - len(usable)
    if dropped:
        print(f"Dropped {dropped} row(s) with missing values before training.")

    X = usable[[feature_column]]
    y = usable[label_column]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train a KNN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Evaluate the model
    accuracy = knn.score(X_test, y_test)
    print(f"Model Accuracy: {accuracy:.2f}")

    return knn


In [6]:
# Step 4: Predict health risk level for real-time PM2.5 data
def predict_risk_level(knn_model, pm25_value):
    """Print the risk category the model predicts for one PM2.5 reading.

    Args:
        knn_model: Fitted classifier exposing ``predict``.
        pm25_value: PM2.5 reading to classify; ``None`` or NaN means no
            reading is available.

    Returns:
        None. The prediction, or a notice that there is nothing to predict,
        is printed.
    """
    if pm25_value is None or pd.isna(pm25_value):
        print("No valid PM2.5 data to predict risk level.")
        return

    # A model fitted on a DataFrame remembers its column names; predicting
    # with a bare list then triggers a feature-name mismatch warning. Rebuild
    # a one-row frame with the same column when the model recorded one.
    feature_names = getattr(knn_model, "feature_names_in_", None)
    if feature_names is not None and len(feature_names) == 1:
        features = pd.DataFrame([[pm25_value]], columns=list(feature_names))
    else:
        features = [[pm25_value]]

    prediction = knn_model.predict(features)
    print(f"The Air Quality Index category for PM2.5 = {pm25_value}: {prediction[0]}")

In [7]:
# Fetch the daily-mean PM2.5 reading for LOCATION_ID from OpenAQ.
# The result is None when the lookup fails or no PM2.5 data is available.
real_time_pm25 = fetch_air_quality_data(API_KEY, LOCATION_ID)

# Build the training set: load the China daily-mean CSV and label each row with
# its AQI category from the breakpoint table. Despite the variable name, the
# rows are read from a file and labelled, not synthesized.
synthetic_data = generate_dynamic_data_from_dataset(aqi_breakpoints_file, "China_pm25_daily_mean_2020_2024.csv")

# Train the classifier on the labelled dataset (prints the hold-out accuracy)
knn_model = train_classifier(synthetic_data)

# Predict and print the risk level for the real-time reading
predict_risk_level(knn_model, real_time_pm25)

Found 1 PM2.5 sensor(s)
Sensor ID: 25754, Sensor Name: pm25 µg/m³
Daily Mean PM2.5 Value: 38.2 µg/m³
Model Accuracy: 1.00
The Air Quality Index category for PM2.5 = 38.2: UNHEALTHY FOR SENSITIVE




ENHANCING Data Quality and Representativeness:
- Currently, the model uses only PM2.5 values. Adding other pollutants (e.g., PM10, NO2, CO) as features would make it more representative.
- The synthetic dataset is limited in realism and variability. Fetch historical air quality and health impact data from APIs or other public datasets to train the model on real-world scenarios.



TESTING how accurate the classifier is on other datasets, such as PM2.5 daily means for San Francisco (SF):

In [8]:
# Step 1: Load the San Francisco dataset for testing
sf_dataset_file = "SF_pm25_daily_mean_2020_2024.csv"  # Replace with your actual file path
sf_data = pd.read_csv(sf_dataset_file)

# Fail early, with a dataset-specific message, if the expected columns are missing
if "PM2.5 Mean" not in sf_data.columns or "Date" not in sf_data.columns:
    raise ValueError("The San Francisco PM2.5 dataset must contain 'PM2.5 Mean' and 'Date' columns.")

# Step 2: Label the San Francisco rows with the same breakpoint-based function
# that produced the training labels, so both datasets share one ground truth
sf_data_with_risk = generate_dynamic_data_from_dataset(aqi_breakpoints_file, sf_dataset_file)

# Step 3: Score the already-trained KNN model on the San Francisco data
X_sf = sf_data_with_risk[["PM2.5 Mean"]]
y_sf_true = sf_data_with_risk["RiskLevel"]

# Predict the risk levels using the trained KNN model
y_sf_pred = knn_model.predict(X_sf)

# Evaluate the accuracy of the model on the San Francisco dataset
# (accuracy_score is imported with the other dependencies in the first cell)
sf_accuracy = accuracy_score(y_sf_true, y_sf_pred)

print(f"San Francisco Test Accuracy: {sf_accuracy:.2f}")


San Francisco Test Accuracy: 0.98
