# AirQuality

## Import

In [None]:
import os
import pandas as pd

In [None]:
# Folder with the raw Breathe London exports, and the folder that receives
# the per-day aggregates derived from them.
input_folder = r"D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\AirQuality\BreatheLondon"
output_folder = r"D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\AirQuality\BreatheLondon2"

# Make sure the output folder exists. exist_ok=True replaces the separate
# "exists?" check, so there is no window between checking and creating.
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Sub-folder names of the study areas (one folder per London borough).
study_areas = [
    "Camden",
    "City of London",
    "Islington",
    "Kensington and Chelsea",
    "Lambeth",
    "Southwark",
    "Westminster",
]

## Unify the time scale of each CSV file

In [None]:

# Pollutant tag searched for in a file name -> measurement column of that file.
# Insertion order matters: a name containing both tags is handled as NO2.
POLLUTANT_COLUMNS = {
    "NO2": "Nitrogen dioxide",
    "PM2.5": "PM<sub>2.5</sub> particulates",
}


def _read_csv_with_fallback(file_path, encodings=("utf-8", "latin1")):
    """Read ``file_path`` as CSV, trying each encoding in turn.

    Returns the DataFrame, or ``None`` (after printing the error) when the
    file cannot be read.
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as e:
            # Wrong encoding guess: remember the error and try the next one.
            last_error = e
        except (OSError, ValueError) as e:
            # pandas reports empty/malformed files as ValueError subclasses;
            # another encoding would not help, so stop retrying.
            last_error = e
            break
    print(f"Failed to read {file_path} with error: {last_error}")
    return None


# Process each study area: every NO2 / PM2.5 file is reduced to one row per
# day (mean, max, min) and written to the matching output sub-folder.
for area in study_areas:
    area_input_folder = os.path.join(input_folder, area)
    area_output_folder = os.path.join(output_folder, area)
    os.makedirs(area_output_folder, exist_ok=True)

    for filename in os.listdir(area_input_folder):
        # Decide from the name alone whether the file is relevant, before
        # spending time reading it.
        pollutant_type = next((tag for tag in POLLUTANT_COLUMNS if tag in filename), None)
        if pollutant_type is None:
            continue
        pollutant_column = POLLUTANT_COLUMNS[pollutant_type]

        file_path = os.path.join(area_input_folder, filename)
        df = _read_csv_with_fallback(file_path)
        if df is None:
            continue

        # 'Category' holds the timestamp of each reading.
        if 'Category' not in df.columns:
            print(f"'Category' column not found in {file_path}. Available columns: {df.columns}")
            continue

        # Skip the file instead of aborting the whole batch with a KeyError.
        if pollutant_column not in df.columns:
            print(f"'{pollutant_column}' column not found in {file_path}. Available columns: {df.columns}")
            continue

        # Drop incomplete rows, then reduce each timestamp to its calendar date.
        df = df.dropna()
        df['Date'] = pd.to_datetime(df['Category'], dayfirst=True).dt.date

        # Daily mean, max and min of the pollutant.
        daily_stats = df.groupby('Date')[pollutant_column].agg(['mean', 'max', 'min']).reset_index()
        daily_stats.columns = ['Date', f'{pollutant_type}_mean', f'{pollutant_type}_max', f'{pollutant_type}_min']

        # "Breathe London - <site> - NO2.csv" -> "<site>_NO2.csv"
        new_filename = filename.replace("Breathe London - ", "").replace(f" - {pollutant_type}.csv", f"_{pollutant_type}.csv")
        new_file_path = os.path.join(area_output_folder, new_filename)

        daily_stats.to_csv(new_file_path, index=False, encoding='utf-8')

        print(f"Processed and saved: {new_file_path}")


# AQE

## Kensington and Chelsea

In [1]:
import os
import pandas as pd

In [None]:
import os
import pandas as pd

def read_and_process_aqe_data(base_folder, region, save_folder):
    """Combine the AQE site files of one region into daily PM2.5 and PM10 means.

    Every ``.csv`` file in ``<base_folder>/<region>`` is read with its first
    line skipped. Rows whose 'End Date' does not parse as ``dd/mm/YYYY`` are
    dropped, and the 'PM25' and 'PM10' columns are averaged per calendar day.
    The per-site results are stacked, sorted by site and date, and written to
    ``<region>-PM2.5 Particulates (reference equivalent).csv`` and
    ``<region>-PM10 Particulates (reference equivalent).csv``.

    Args:
        base_folder: Folder containing one sub-folder per region.
        region: Name of the region sub-folder; also the prefix of the output
            file names.
        save_folder: Folder the result files are written to. It is created
            if it does not exist yet.

    Returns:
        None. Files that cannot be read or lack the required columns are
        reported with ``print`` and skipped. A pollutant with no data at all
        is reported and no file is written for it.
    """
    # Measurement column in the site files -> label used in the output file name.
    pollutants = {'PM25': 'PM2.5', 'PM10': 'PM10'}
    daily_frames = {column: [] for column in pollutants}

    region_folder = os.path.join(base_folder, region)
    for file_name in os.listdir(region_folder):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(region_folder, file_name)

        # The site name is the second '-'-separated part of the file name.
        name_parts = file_name.split('-')
        if len(name_parts) < 2:
            print(f"Skipping {file_path}: no '-' separated site name in file name")
            continue
        site_name = name_parts[1].replace('.csv', '')

        # Print the path for debugging
        print(f"Reading data from: {file_path}")

        try:
            # The first line of each file is a title line, not the header.
            data_df = pd.read_csv(file_path, skiprows=1)
        except ValueError as e:
            print(f"Error reading {file_path}: {e}")
            continue

        # Column names may carry surrounding whitespace.
        data_df.columns = data_df.columns.str.strip()
        if 'End Date' not in data_df.columns:
            print(f"Required columns not found in {file_path}")
            continue

        data_df['End Date'] = pd.to_datetime(data_df['End Date'], format='%d/%m/%Y', errors='coerce')

        # Drop rows with invalid dates and index the rest by date, so the
        # readings can be resampled per day.
        data_df = (
            data_df.dropna(subset=['End Date'])
            .set_index('End Date', drop=False)
            .rename_axis('ReadingDateTime')
        )

        for column in pollutants:
            if column not in data_df.columns:
                print(f"'{column}' column not found in {file_path}")
                continue
            daily = data_df[column].resample('D').mean().reset_index()
            daily['Site'] = site_name
            daily = daily.rename(columns={column: 'Value'})
            daily_frames[column].append(daily[['Site', 'ReadingDateTime', 'Value']])

    for column, label in pollutants.items():
        frames = daily_frames[column]
        if not frames:
            # pd.concat raises on an empty list, so report and move on.
            print(f"No {label} data to process")
            continue

        # Sort while the dates are still datetimes, then format them as text.
        combined = pd.concat(frames).sort_values(by=['Site', 'ReadingDateTime'])
        combined['ReadingDateTime'] = combined['ReadingDateTime'].dt.strftime('%d/%m/%Y')

        os.makedirs(save_folder, exist_ok=True)
        save_path = os.path.join(save_folder, f'{region}-{label} Particulates (reference equivalent).csv')
        combined.to_csv(save_path, index=False)

        print(f"Processed {label} data saved to {save_path}")

# Region to process, the folder with the raw AQE files, and the folder that
# receives the combined results.
region = 'Kensington and Chelsea'
base_folder = 'D:\\File_auto\\0_UCL_CASA\\OneDrive - University College London\\Xiaoyi_dissertation\\Analysis\\Data\\AirQuality\\AQE'
save_folder = 'D:\\File_auto\\0_UCL_CASA\\OneDrive - University College London\\Xiaoyi_dissertation\\Analysis\\Data\\AirQuality\\LondonAir\\Kensington and Chelsea'

# Read and process the AQE data
read_and_process_aqe_data(
    base_folder=base_folder,
    region=region,
    save_folder=save_folder,
)

## Islington

In [15]:

def read_and_process_pm25_data(base_folder, region, save_folder):
    """Stack the daily PM2.5 means of every site in a region into one CSV.

    Each ``*_PM2.5.csv`` file in ``<base_folder>/<region>`` must provide the
    columns 'Date' and 'PM2.5_mean'. The part of the file name before
    ``_PM2.5.csv`` is used as the site name. All sites are combined, sorted
    by site and date, and written to
    ``<save_folder>/<region>-PM2.5 Particulates (reference equivalent).csv``
    with the columns 'Site', 'ReadingDateTime' (``dd/mm/YYYY``) and 'Value'.

    Args:
        base_folder: Folder containing one sub-folder per region.
        region: Name of the region sub-folder; also the prefix of the output
            file name.
        save_folder: Folder the result file is written to. It is created if
            it does not exist yet.

    Returns:
        None. Unreadable files, empty files and files without the required
        columns are reported with ``print`` and skipped. Nothing is written
        when no file yields data.
    """
    suffix = '_PM2.5.csv'
    region_folder = os.path.join(base_folder, region)
    pm25_data = []

    for file_name in os.listdir(region_folder):
        if not file_name.endswith(suffix):
            continue
        site_name = file_name.replace(suffix, '')
        file_path = os.path.join(region_folder, file_name)

        # Print the path for debugging
        print(f"Reading data from: {file_path}")

        try:
            data_df = pd.read_csv(file_path)
        except ValueError as e:
            print(f"Error reading {file_path}: {e}")
            continue
        print(f"Successfully read {file_path}")

        if 'Date' not in data_df.columns or 'PM2.5_mean' not in data_df.columns:
            print(f"Required columns not found in {file_path}")
            continue

        # Dates that do not match YYYY-MM-DD become NaT; such rows are kept.
        data_df['Date'] = pd.to_datetime(data_df['Date'], format='%Y-%m-%d', errors='coerce')

        if data_df.empty:
            print(f"No valid data in {file_path} after filtering dates")
            continue

        site_df = data_df[['Date', 'PM2.5_mean']].rename(
            columns={'Date': 'ReadingDateTime', 'PM2.5_mean': 'Value'}
        )
        site_df.insert(0, 'Site', site_name)
        pm25_data.append(site_df)

    if not pm25_data:
        print("No data to process")
        return

    # Sort while the dates are still datetimes, then format them as text.
    pm25_df = pd.concat(pm25_data).sort_values(by=['Site', 'ReadingDateTime'])
    pm25_df['ReadingDateTime'] = pm25_df['ReadingDateTime'].dt.strftime('%d/%m/%Y')

    # Create the target folder so the write cannot fail on a missing directory.
    os.makedirs(save_folder, exist_ok=True)
    pm25_save_path = os.path.join(save_folder, f'{region}-PM2.5 Particulates (reference equivalent).csv')
    pm25_df.to_csv(pm25_save_path, index=False)

    print(f"Processed PM2.5 data saved to {pm25_save_path}")

# Region to process, the folder with the daily Breathe London aggregates, and
# the folder that receives the combined result.
region = 'Islington'
base_folder = 'D:\\File_auto\\0_UCL_CASA\\OneDrive - University College London\\Xiaoyi_dissertation\\Analysis\\Data\\AirQuality\\BreatheLondon2'
save_folder = 'D:\\File_auto\\0_UCL_CASA\\OneDrive - University College London\\Xiaoyi_dissertation\\Analysis\\Data\\AirQuality\\LondonAir\\Islington'

# Read and process the PM2.5 data
read_and_process_pm25_data(
    base_folder=base_folder,
    region=region,
    save_folder=save_folder,
)

Reading data from: D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\AirQuality\BreatheLondon2\Islington\Air on the Green_PM2.5.csv
Successfully read D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\AirQuality\BreatheLondon2\Islington\Air on the Green_PM2.5.csv
Reading data from: D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\AirQuality\BreatheLondon2\Islington\Canonbury Primary_PM2.5.csv
Successfully read D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\AirQuality\BreatheLondon2\Islington\Canonbury Primary_PM2.5.csv
Reading data from: D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\AirQuality\BreatheLondon2\Islington\Finsbury Park_PM2.5.csv
Successfully read D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\AirQu