In [1]:
import pandas as pd
import os


# Merge variables

In [29]:
# Input locations (all under the same Analysis folder). The long literals are
# split with implicit string concatenation purely for readability; the
# resulting values are unchanged.

# Root of the weighted pollutant folders (NO_weighted, NO2_weighted, ...)
data1_path = (
    r"D:\File_auto\0_UCL_CASA\OneDrive - University College London"
    r"\Xiaoyi_dissertation\Analysis\Data_output"
)
# Traffic-count CSVs (searched recursively for files named after a region)
data2_path = (
    r"D:\File_auto\0_UCL_CASA\OneDrive - University College London"
    r"\Xiaoyi_dissertation\Analysis\Data\dfl_traffic_count"
)
# Yearly station footfall CSVs (filtered on their 'Borough' column)
data3_path = (
    r"D:\File_auto\0_UCL_CASA\OneDrive - University College London"
    r"\Xiaoyi_dissertation\Analysis\Data\tfl_crowding_data"
)

# Region names: matched against file names and against the 'Borough' column
regions = [
    'Camden',
    'City of London',
    'Islington',
    'Kensington and Chelsea',
    'Lambeth',
    'Southwark',
    'Westminster',
]

## Merge data for each region

In [31]:
# Function to read CSV files containing region name in their names and merge them based on 'Date'
def read_and_merge_data1(folder_path, region):
    subfolders = ["NO_weighted", "NO2_weighted", "PM10_weighted", "PM25_weighted"]
    merged_df = pd.DataFrame()

    for subfolder in subfolders:
        subfolder_path = os.path.join(folder_path, subfolder)
        for root, dirs, files in os.walk(subfolder_path):
            for file in files:
                if file.endswith('.csv') and region in file:
                    file_path = os.path.join(root, file)
                    df = pd.read_csv(file_path)
                    air_type = subfolder.split('_')[0]
                    df.rename(columns={f"{air_type}_weighted_value(ug m-3)": f"{air_type}_value"}, inplace=True)
                    df['Date'] = df['Date'].str.strip()  # Remove leading/trailing spaces
                    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')  # Handle invalid dates
                    if merged_df.empty:
                        merged_df = df
                    else:
                        merged_df = pd.merge(merged_df, df[['Date', f"{air_type}_value"]], on='Date', how='outer')
    return merged_df

def read_and_merge_data2(data1_df, folder_path, region):
    """Left-join traffic counts for one region onto ``data1_df`` by 'Date'.

    Every ``.csv`` file under ``folder_path`` (searched recursively) whose
    name contains ``region`` is loaded. Its ``count_date`` column is parsed
    as ``%Y/%m/%d`` into 'Date', and the vehicle-count columns are merged
    twice: once for rows with ``road_type == 'Major'`` (suffix ``_major``)
    and once for ``road_type == 'Minor'`` (suffix ``_minor``).

    Parameters
    ----------
    data1_df : pandas.DataFrame
        Frame with a datetime 'Date' column; it is not modified.
    folder_path : str
        Directory searched for traffic-count CSV files.
    region : str
        Text that must appear in a file name for the file to be used.

    Returns
    -------
    pandas.DataFrame
        ``data1_df`` with the ``*_major`` and ``*_minor`` columns appended;
        ``data1_df`` itself when no file matches.
    """
    columns_of_interest = [
        'pedal_cycles', 'two_wheeled_motor_vehicles', 'cars_and_taxis', 'buses_and_coaches', 'lgvs',
        'hgvs_2_rigid_axle', 'hgvs_3_rigid_axle', 'hgvs_4_or_more_rigid_axle', 'hgvs_3_or_4_articulated_axle',
        'hgvs_5_articulated_axle', 'hgvs_6_articulated_axle'
    ]
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if not (file.endswith('.csv') and region in file):
                continue
            df = pd.read_csv(os.path.join(root, file))
            # Strip stray whitespace, then coerce unparseable dates to NaT.
            df['Date'] = pd.to_datetime(
                df['count_date'].astype(str).str.strip(), format='%Y/%m/%d', errors='coerce'
            )
            # pandas matches null merge keys against each other, so traffic
            # rows with an invalid date would otherwise be attached to every
            # NaT row of data1_df. Drop them before merging.
            df = df.dropna(subset=['Date'])

            # NOTE(review): a left merge repeats a data1 row once per matching
            # traffic row, so a date with several Major (or Minor) records
            # yields several output rows. Confirm whether the counts should be
            # aggregated per date first.
            for road_type, suffix in (('Major', '_major'), ('Minor', '_minor')):
                subset = df.loc[df['road_type'] == road_type, ['Date'] + columns_of_interest]
                subset = subset.add_suffix(suffix).rename(columns={f'Date{suffix}': 'Date'})
                data1_df = pd.merge(data1_df, subset, on='Date', how='left')
    return data1_df

def read_and_merge_data3(data1_df, folder_path, region):
    """Left-join daily station tap counts for one region onto ``data1_df``.

    For each year 2019-2024 the file
    ``filtered_StationFootfall_<year>_merged.csv`` in ``folder_path`` is read
    if it exists, filtered to rows whose 'Borough' equals ``region``, and its
    'TravelDate' is parsed as ``%Y%m%d``. 'EntryTapCount' and 'ExitTapCount'
    are summed per date and merged onto ``data1_df`` by matching 'TravelDate'
    to 'Date'.

    Parameters
    ----------
    data1_df : pandas.DataFrame
        Frame with a datetime 'Date' column; it is not modified.
    folder_path : str
        Directory that holds the yearly footfall CSV files.
    region : str
        Value compared for equality against the 'Borough' column.

    Returns
    -------
    pandas.DataFrame
        ``data1_df`` plus 'EntryTapCount' and 'ExitTapCount'. Both columns
        are all-NaN when there is no footfall data for the region (no file
        present, or no row for this borough).
    """
    count_columns = ['EntryTapCount', 'ExitTapCount']
    yearly_frames = []
    for year in range(2019, 2025):
        file_path = os.path.join(folder_path, f"filtered_StationFootfall_{year}_merged.csv")
        if not os.path.exists(file_path):
            continue
        df = pd.read_csv(file_path)
        # .copy() so the assignment below acts on an independent frame rather
        # than on a filtered view of the file contents.
        df = df[df['Borough'] == region].copy()
        # errors='coerce' turns unparseable values into NaT instead of
        # raising, so no ValueError fallback is needed here.
        df['TravelDate'] = pd.to_datetime(
            df['TravelDate'].astype(str).str.strip(), format='%Y%m%d', errors='coerce'
        )
        if not df.empty:
            yearly_frames.append(df[['TravelDate'] + count_columns])

    if not yearly_frames:
        # Nothing to aggregate: keep the output schema stable instead of
        # failing on groupby('TravelDate') of a column-less frame.
        return data1_df.assign(**{col: float('nan') for col in count_columns})

    # groupby drops NaT keys by default, so rows with invalid dates are
    # excluded from the daily totals.
    all_years_df = pd.concat(yearly_frames).groupby('TravelDate').sum().reset_index()
    merged = pd.merge(data1_df, all_years_df, left_on='Date', right_on='TravelDate', how='left')
    return merged.drop(columns=['TravelDate'])


# Process each region and save the merged data.
# The output folder is derived from data1_path (the Data_output folder) so the
# location is defined in one place, and it is created up front because to_csv
# does not create missing directories.
correlation_dir = os.path.join(data1_path, "Correlation")
os.makedirs(correlation_dir, exist_ok=True)

for region in regions:
    # Pollutants first, then traffic counts, then station tap counts.
    data1_merged = read_and_merge_data1(data1_path, region)
    data1_merged = read_and_merge_data2(data1_merged, data2_path, region)
    final_merged_data = read_and_merge_data3(data1_merged, data3_path, region)

    output_path = os.path.join(correlation_dir, f"{region}_corr_merged.csv")
    final_merged_data.to_csv(output_path, index=False)
    print(f"Data for {region} merged and saved successfully.")

Data for Camden merged and saved successfully.
Data for City of London merged and saved successfully.
Data for Islington merged and saved successfully.
Data for Kensington and Chelsea merged and saved successfully.
Data for Lambeth merged and saved successfully.
Data for Southwark merged and saved successfully.
Data for Westminster merged and saved successfully.


## Merge all regions into one file

In [32]:
# Concatenate all region files into one final file.
# Paths are derived from data1_path (the Data_output folder) so they stay in
# step with the per-region outputs written to its "Correlation" subfolder.
correlation_dir = os.path.join(data1_path, "Correlation")

# Collect the frames first and concatenate once, instead of growing a
# DataFrame inside the loop.
region_frames = []
missing_regions = []
for region in regions:
    file_path = os.path.join(correlation_dir, f"{region}_corr_merged.csv")
    if not os.path.exists(file_path):
        missing_regions.append(region)
        continue
    # Tag every row with the region it came from.
    region_frames.append(pd.read_csv(file_path).assign(Region=region))

# pd.concat raises on an empty list; fall back to an empty frame as before.
all_regions_df = pd.concat(region_frames) if region_frames else pd.DataFrame()

final_output_path = os.path.join(correlation_dir, "corr_merged.csv")
all_regions_df.to_csv(final_output_path, index=False)

# Report skipped regions so an incomplete merge does not go unnoticed.
if missing_regions:
    print(f"Warning: no merged file found for: {', '.join(missing_regions)}")
print("All regions data merged and saved successfully.")

All regions data merged and saved successfully.
