In [1]:

import pandas as pd
import numpy as np

# 1. Data Loading and Initial Inspection
print("\n--- Data Loading and Initial Inspection ---")
df = pd.read_csv("air_quality_data.csv")
print("DataFrame head:\n", df.head())
print("\nDataFrame info:\n")
df.info()
print("\nNumber of records:", len(df))
print("Number of unique stations:", df["Station ID"].nunique())

# 2. Data Cleaning and Preprocessing
print("\n--- Data Cleaning and Preprocessing ---")
print("Missing values before cleaning:\n", df.isnull().sum())

# Handle missing values: fill numerical NaNs with median, categorical with 'Unknown'
for col in ["PM2.5", "PM10", "O3", "NO2", "Temperature", "Humidity"]:
    df[col].fillna(df[col].median(), inplace=True)

# Correct outliers (e.g., negative concentrations) - already handled during generation, but good to re-check
for col in ["PM2.5", "PM10", "O3", "NO2", "Temperature", "Humidity"]:
    df[col] = df[col].apply(lambda x: max(0, x)) # Ensure non-negative

print("Missing values after cleaning:\n", df.isnull().sum())

# Convert Timestamp to datetime objects
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# 3. Basic Statistical Analysis
print("\n--- Basic Statistical Analysis ---")
pollutants = ["PM2.5", "PM10", "O3", "NO2"]
for pollutant in pollutants:
    print(f"\nDescriptive statistics for {pollutant}:\n{df[pollutant].describe()}")

# Correlation matrix
print("\nCorrelation Matrix (Pollutants and Environmental Factors):\n")
correlation_cols = pollutants + ["Temperature", "Humidity"]
print(df[correlation_cols].corr())

# 4. Station-wise Analysis
print("\n--- Station-wise Analysis ---")
# Station with highest average PM2.5
avg_pm25_by_station = df.groupby("Station ID")["PM2.5"].mean()
highest_pm25_station = avg_pm25_by_station.idxmax()
highest_pm25_value = avg_pm25_by_station.max()
print(f"Station with highest average PM2.5: {highest_pm25_station} ({highest_pm25_value:.2f} µg/m³)")

# Station with lowest average O3
avg_o3_by_station = df.groupby("Station ID")["O3"].mean()
lowest_o3_station = avg_o3_by_station.idxmin()
lowest_o3_value = avg_o3_by_station.min()
print(f"Station with lowest average O3: {lowest_o3_station} ({lowest_o3_value:.2f} ppb)")

# Daily average PM2.5 for a specific station (e.g., S003)
specific_station_id = "S003"
s003_df = df[df["Station ID"] == specific_station_id].copy()
s003_df["Date"] = s003_df["Timestamp"].dt.date
daily_avg_pm25_s003 = s003_df.groupby("Date")["PM2.5"].mean()

threshold = 50
days_exceeding_threshold = daily_avg_pm25_s003[daily_avg_pm25_s003 > threshold]
print(f"\nDays with PM2.5 exceeding {threshold} µg/m³ for Station {specific_station_id}:\n{days_exceeding_threshold}")

# 5. Time-series Analysis (Simplified)
print("\n--- Time-series Analysis (Simplified) ---")
# Hourly PM2.5 for a chosen station (S001) over a 24-hour period (first day)
station_s001_first_day = df[(df["Station ID"] == "S001") & (df["Timestamp"].dt.date == df["Timestamp"].dt.date.min())]
hourly_pm25_s001 = station_s001_first_day.set_index("Timestamp")["PM2.5"].resample("H").mean()

peak_hour_pm25 = hourly_pm25_s001.idxmax().hour
peak_pm25_value = hourly_pm25_s001.max()
print(f"Peak PM2.5 hour for Station S001 on {hourly_pm25_s001.index.min().date()}: {peak_hour_pm25}:00 ({peak_pm25_value:.2f} µg/m³)")

# 6. Custom Function Development: AQI Calculation
print("\n--- Custom Function Development: AQI Calculation ---")
def calculate_aqi_pm25(pm25_concentration):
    # Simplified AQI calculation for PM2.5 (example ranges)
    # This is a highly simplified version for demonstration purposes.
    # Real AQI calculation is more complex and involves breakpoints.
    if pm25_concentration <= 12.0:
        return "Good"
    elif pm25_concentration <= 35.4:
        return "Moderate"
    elif pm25_concentration <= 55.4:
        return "Unhealthy for Sensitive Groups"
    elif pm25_concentration <= 150.4:
        return "Unhealthy"
    elif pm25_concentration <= 250.4:
        return "Very Unhealthy"
    else:
        return "Hazardous"

# Apply the function to PM2.5 data
df["PM2.5_AQI_Category"] = df["PM2.5"].apply(calculate_aqi_pm25)

print("\nPM2.5 AQI Categories distribution:\n", df["PM2.5_AQI_Category"].value_counts())





--- Data Loading and Initial Inspection ---
DataFrame head:
   Station ID            Timestamp      PM2.5       PM10         O3        NO2  \
0       S001  2025-07-01 00:00:00  29.967142  37.926035  35.181508  30.661209   
1       S001  2025-07-01 01:00:00  40.792128  51.511521  26.244205  23.797920   
2       S001  2025-07-01 02:00:00  27.419623  11.300796  16.200657  16.063987   
3       S001  2025-07-01 03:00:00  15.919759  18.815444  41.725190  18.419566   
4       S001  2025-07-01 04:00:00  19.556173  41.663839  20.792051  22.629886   

   Temperature   Humidity  
0        23.83  57.658630  
1        22.68  55.342702  
2        19.94  63.142473  
3        25.34  45.752518  
4        22.00  57.083063  

DataFrame info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Station ID   3600 non-null   object 
 1   Timestamp    3600 non-null  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Peak PM2.5 hour for Station S001 on 2025-07-01: 1:00 (40.79 µg/m³)

--- Custom Function Development: AQI Calculation ---

PM2.5 AQI Categories distribution:
 PM2.5_AQI_Category
Moderate                          2700
Unhealthy for Sensitive Groups     542
Good                               336
Unhealthy                           11
Very Unhealthy                      11
Name: count, dtype: int64
