In [2]:
import pandas as pd
from functools import reduce
import warnings
warnings.filterwarnings("ignore")

In [4]:


def create_master_dataset(file_paths: dict, base_file_key: str, output_filepath: str) -> None:
    """
    Build the master dataset by left-joining every cleaned CSV onto a base table.

    Each CSV listed in ``file_paths`` is read and its 'date' column is parsed
    to datetime. The frame stored under ``base_file_key`` is the left-hand
    side; every other frame is left-merged onto it on 'date', following the
    iteration order of ``file_paths``. Three columns are then renamed, the
    frame is reduced to a fixed, ordered list of output columns, and the
    result is written to ``output_filepath`` as CSV without the index.

    Errors are not propagated: any exception raised along the way is caught
    and reported with ``print``, and the function returns normally.

    Args:
        file_paths (dict): Maps a descriptive key to the path of a cleaned CSV.
            Every file must contain a 'date' column.
        base_file_key (str): Key in ``file_paths`` whose frame is the left side
            of every merge, so its rows determine the rows of the output.
        output_filepath (str): Destination path for the merged CSV.

    Returns:
        None
    """
    try:
        # --- 1. Read every CSV and parse its 'date' column ---
        # Parsing to datetime makes the merge key compare by date value
        # rather than by its textual representation.
        loaded = {
            name: pd.read_csv(csv_path).assign(
                date=lambda frame: pd.to_datetime(frame['date'])
            )
            for name, csv_path in file_paths.items()
        }

        print("All cleaned datasets loaded successfully.")

        # --- 2. Left-merge each remaining frame onto the base ---
        merged = loaded[base_file_key]
        for name in file_paths:
            if name == base_file_key:
                continue
            # how='left' keeps every row of the running result; dates that are
            # absent from the right-hand frame yield NaN in its columns.
            merged = merged.merge(loaded[name], on='date', how='left')

        print("All datasets merged successfully.")

        # --- 3. Rename and select the output columns ---
        proposal_names = {
            'bev_units': 'BEV_Registrations',
            'total_cars': 'Total_Vehicle_Registrations',
            'bev_share': 'BEV_Share',
        }
        merged = merged.rename(columns=proposal_names)

        # Columns of the master dataset, in output order. Anything else the
        # merges brought in is discarded; a column missing from the merged
        # frame raises KeyError, which is reported by the handler below.
        output_columns = [
            'date',
            'BEV_Registrations',
            'Total_Vehicle_Registrations',
            'BEV_Share',
            'RHDI_per_head',
            'CPI',
            'Bank_Rate',
            'Petrol_Price',
            'Diesel_Price',
            'PiCG_Amount_GBP',
            'picg_active',
            'GT_Awareness_General',
            'GT_Awareness_Consideration',
            'GT_Infrastructure_Concern',
            'GT_Range_Anxiety',
        ]
        master = merged[output_columns]

        # --- 4. Write the result ---
        master.to_csv(output_filepath, index=False)

        print(f"\nMaster dataset created and saved to '{output_filepath}'")

    except FileNotFoundError as e:
        print(f"Error: A required cleaned file was not found: {e}")
        print("Please ensure all previous cleaning steps have been completed successfully.")
    except Exception as e:
        print(f"An error occurred during the final merge: {e}")

# --- Running the merge ---
# Every path below is relative, so the cleaned CSV files are looked up in the
# current working directory and 'master_dataset.csv' is written there too.

# Cleaned input files keyed by a short label. Insertion order is the order in
# which the non-base files are merged onto the base.
CLEANED_FILES = {
    'vehicles': 'veh1153_bev_uk_localtrend.csv',
    'rhdi': 'rhdi_cleaned_2011_2024.csv',
    'cpi': 'cpi_cleaned_2011_2024.csv',
    'bank_rate': 'bank_rate_cleaned.csv',
    'fuel': 'fuel_prices_cleaned.csv',
    'picg': 'picg_cleaned.csv',
    'gt': 'google_trends_cleaned.csv',
}

# Destination of the merged dataset
MASTER_OUTPUT_FILE = 'master_dataset.csv'

# Build the master dataset, using the vehicle registrations as the base table
create_master_dataset(
    file_paths=CLEANED_FILES,
    base_file_key='vehicles',
    output_filepath=MASTER_OUTPUT_FILE,
)

All cleaned datasets loaded successfully.
All datasets merged successfully.

Master dataset created and saved to 'master_dataset.csv'


In [3]:
# Reload the saved master dataset from disk and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific path, whereas the merge
# cell writes to the relative 'master_dataset.csv' — confirm both refer to the
# same file.
df = pd.read_csv(
    filepath_or_buffer="C:\\Users\\Ashish Siwach\\OneDrive - University of Exeter\\Dissertation_Cld\\Datasets\\master_dataset.csv"
)
df.head(n=5)

Unnamed: 0,date,BEV_Registrations,Total_Vehicle_Registrations,BEV_Share,Charging_Infrastructure,RHDI_per_head,CPI,Bank_Rate,Petrol_Price,Diesel_Price,PiCG_Amount_GBP,picg_active,GT_Awareness_General,GT_Awareness_Consideration,GT_Infrastructure_Concern,GT_Range_Anxiety
0,2011-01-31,54,424100,0.000127,0,5129.0,91.3,0.5,127.4,131.86,5000.0,1,12,7,13,0
1,2011-02-28,36,360400,0.0001,0,5088.0,92.0,0.5,129.0,134.07,5000.0,1,7,3,0,0
2,2011-03-31,320,662000,0.000483,0,5047.0,92.2,0.5,132.25,138.43,5000.0,1,10,5,10,0
3,2011-04-30,320,434400,0.000737,0,5063.3333,93.2,0.5,134.29,140.81,5000.0,1,7,3,0,0
4,2011-05-31,38,447300,8.5e-05,0,5079.6667,93.4,0.5,136.28,141.53,5000.0,1,7,4,0,0


In [None]:
import pandas as pd

def update_master_with_infra(master_filepath: str, infra_filepath: str, output_filepath: str) -> None:
    """
    Updates an existing master dataset by merging in the charging infrastructure data.

    The master CSV and the charging-infrastructure CSV are read, their 'date'
    columns are parsed to datetime, and the 'Charging_Infrastructure' column
    is left-merged onto the master on 'date'. Master dates with no matching
    infrastructure row get 0. The columns are then put into a fixed order
    (columns of that order which are absent are skipped) and the result is
    written to ``output_filepath`` as CSV without the index.

    The function is safe to run repeatedly with ``output_filepath`` equal to
    ``master_filepath``: a 'Charging_Infrastructure' column already present in
    the master is discarded and rebuilt from the infrastructure file.

    Errors are not propagated: any exception is caught and reported with
    ``print``, and the function returns normally.

    Args:
        master_filepath (str): The file path for the existing master dataset.
            Must contain a 'date' column.
        infra_filepath (str): The file path for the cleaned charging infra data.
            Must contain 'date' and 'Charging_Infrastructure' columns.
        output_filepath (str): The file path to save the updated master dataset.

    Returns:
        None
    """
    infra_column = 'Charging_Infrastructure'

    try:
        # --- 1. Load the Datasets ---
        print(f"Loading existing master dataset from '{master_filepath}'...")
        df_master = pd.read_csv(master_filepath)

        print(f"Loading charging infrastructure data from '{infra_filepath}'...")
        df_infra = pd.read_csv(infra_filepath)

        # --- 2. Ensure Date Columns are in Datetime Format ---
        df_master['date'] = pd.to_datetime(df_master['date'])
        df_infra['date'] = pd.to_datetime(df_infra['date'])

        # A master file produced by an earlier run already carries the
        # infrastructure column. Merging on top of it would create
        # '<name>_x' / '<name>_y' columns and lose the plain name, so the
        # stale copy is dropped and rebuilt from the infra file.
        if infra_column in df_master.columns:
            df_master = df_master.drop(columns=[infra_column])

        # Only the merge key and the infrastructure series are needed; taking
        # just these two prevents any other infra column from colliding with a
        # master column and being suffixed.
        df_infra = df_infra[['date', infra_column]]

        # --- 3. Perform the Merge ---
        # Left merge to ensure all rows from the master dataset are kept.
        print("Merging the datasets...")
        df_updated = pd.merge(df_master, df_infra, on='date', how='left')

        # Dates missing from the infra file come through as NaN; record them
        # as 0. The result is assigned back because a chained
        # `df[col].fillna(..., inplace=True)` does not update the frame under
        # pandas Copy-on-Write.
        df_updated[infra_column] = df_updated[infra_column].fillna(0)

        # --- 4. Reorder Columns for Final Output ---
        final_column_order = [
            'date',
            'BEV_Registrations',
            'Total_Vehicle_Registrations',
            'BEV_Share',
            'Charging_Infrastructure',  # placed next to the registration columns
            'RHDI_per_head',
            'CPI',
            'Bank_Rate',
            'Petrol_Price',
            'Diesel_Price',
            'PiCG_Amount_GBP',
            'picg_active',
            'GT_Awareness_General',
            'GT_Awareness_Consideration',
            'GT_Infrastructure_Concern',
            'GT_Range_Anxiety'
        ]

        # Keep only the listed columns that are actually present, in list order.
        final_columns_exist = [col for col in final_column_order if col in df_updated.columns]
        df_final = df_updated[final_columns_exist]

        # --- 5. Save the Updated Master Dataset ---
        df_final.to_csv(output_filepath, index=False)

        print(f"\nSuccessfully merged charging data. The master dataset has been updated and saved to '{output_filepath}'")

    except FileNotFoundError as e:
        print(f"Error: A required file was not found: {e.filename}")
        print("Please ensure both the master and charging data files are available.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Master dataset that is read and then overwritten with the updated version.
# NOTE(review): absolute path tied to one machine — confirm it points at the
# file produced by the merge step.
MASTER_FILE = "C:\\Users\\Ashish Siwach\\OneDrive - University of Exeter\\Dissertation_Cld\\Datasets\\master_dataset.csv"
# Cleaned charging-infrastructure CSV, resolved against the working directory
INFRA_FILE = "charging_infra_cleaned.csv"

# Merge the infrastructure data in and write the result back over the master file
update_master_with_infra(
    master_filepath=MASTER_FILE,
    infra_filepath=INFRA_FILE,
    output_filepath=MASTER_FILE,
)

Loading existing master dataset from 'master_dataset.csv'...
Loading charging infrastructure data from 'charging_infra_cleaned.csv'...
Merging the datasets...

Successfully merged charging data. The master dataset has been updated and saved to 'master_dataset.csv'


In [None]:
# Reload the updated master dataset and preview its first ten rows.
# MASTER_FILE is the path the update step just wrote to; reusing the constant
# (instead of repeating the literal) keeps the read and write locations in sync.
df = pd.read_csv(MASTER_FILE)
df.head(10)

Unnamed: 0,date,BEV_Registrations,Total_Vehicle_Registrations,BEV_Share,Charging_Infrastructure,RHDI_per_head,CPI,Bank_Rate,Petrol_Price,Diesel_Price,PiCG_Amount_GBP,picg_active,GT_Awareness_General,GT_Awareness_Consideration,GT_Infrastructure_Concern,GT_Range_Anxiety
0,2011-01-31,54,424100,0.000127,0,5129.0,91.3,0.5,127.4,131.86,5000.0,1,12,7,13,0
1,2011-02-28,36,360400,0.0001,0,5088.0,92.0,0.5,129.0,134.07,5000.0,1,7,3,0,0
2,2011-03-31,320,662000,0.000483,0,5047.0,92.2,0.5,132.25,138.43,5000.0,1,10,5,10,0
3,2011-04-30,320,434400,0.000737,0,5063.3333,93.2,0.5,134.29,140.81,5000.0,1,7,3,0,0
4,2011-05-31,38,447300,8.5e-05,0,5079.6667,93.4,0.5,136.28,141.53,5000.0,1,7,4,0,0
5,2011-06-30,218,479000,0.000455,0,5096.0,93.3,0.5,135.59,139.54,5000.0,1,6,3,0,0
6,2011-07-31,336,429200,0.000783,0,5093.3333,93.3,0.5,134.56,138.85,5000.0,1,6,2,0,0
7,2011-08-31,36,357300,0.000101,0,5090.6667,93.8,0.5,135.47,139.9,5000.0,1,6,3,0,0
8,2011-09-30,38,629800,6e-05,0,5088.0,94.4,0.5,135.17,139.59,5000.0,1,6,3,0,0
9,2011-10-31,218,431800,0.000505,0,5078.3333,94.5,0.5,134.67,139.9,5000.0,1,5,3,0,0
