# import

In [19]:
import os
import pandas as pd
import numpy as np
from scipy.interpolate import UnivariateSpline


# Step 1: Read and filter the original files

In [17]:

# Step 1: Read and filter the original files
dfl_file_path = 'Data/dfl_traffic_count'
years_to_filter = [2018, 2019, 2021, 2022, 2023, 2024]

# List all files in the directory
file_paths = [os.path.join(dfl_file_path, file) for file in os.listdir(dfl_file_path) if file.endswith('.csv')]

for file_path in file_paths:
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Check if 'count_date' column exists
    if 'count_date' not in df.columns:
        print(f"'count_date' column not found in {file_path}. Skipping this file.")
        continue
    
    # Convert 'count_date' column to datetime type
    df['count_date'] = pd.to_datetime(df['count_date'], errors='coerce')
    
    # Filter rows for the specified years
    filtered_df = df[df['count_date'].dt.year.isin(years_to_filter)]
    
    # Construct new file name for the filtered data
    new_file_path = file_path.replace('.csv', '_filtered.csv')
    
    # Save the filtered data to a new file
    filtered_df.to_csv(new_file_path, index=False)
    print(f"Filtered data saved to {new_file_path}")


Filtered data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_103_filtered.csv
Filtered data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_107_filtered.csv


  df = pd.read_csv(file_path)


Filtered data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_109_filtered.csv
Filtered data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_110_filtered.csv
Filtered data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_145_filtered.csv
Filtered data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_174_filtered.csv
Filtered data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_96_filtered.csv


# Step 2: Processing filtered files

In [24]:

# Step 2: Process the filtered files
filtered_file_paths = [file.replace('.csv', '_filtered.csv') for file in file_paths]

for filtered_file_path in filtered_file_paths:
    # Check if filtered file exists
    if not os.path.exists(filtered_file_path):
        print(f"Filtered file {filtered_file_path} does not exist. Skipping.")
        continue
    
    # Read the filtered CSV file
    df = pd.read_csv(filtered_file_path, encoding='ISO-8859-1')  # Adjust encoding if necessary
    
    # Convert 'count_date' to datetime and extract the date part
    df['count_date'] = pd.to_datetime(df['count_date'], errors='coerce')
    df['date'] = df['count_date'].dt.date
    
    # Group by day and calculate the mean for specific columns
    columns_to_interpolate = [
        'pedal_cycles', 'two_wheeled_motor_vehicles', 'cars_and_taxis', 
        'buses_and_coaches', 'lgvs', 'hgvs_2_rigid_axle', 'hgvs_3_rigid_axle', 
        'hgvs_4_or_more_rigid_axle', 'hgvs_3_or_4_articulated_axle', 
        'hgvs_5_articulated_axle', 'hgvs_6_articulated_axle', 'all_hgvs', 
        'all_motor_vehicles'
    ]
    daily_df = df.groupby('date')[columns_to_interpolate].mean().reset_index()
    
    # Create a complete date range
    full_date_range = pd.date_range(start=daily_df['date'].min(), end=daily_df['date'].max())
    
    # Reindex to have a continuous date range
    daily_df = daily_df.set_index('date').reindex(full_date_range).reset_index()
    daily_df.rename(columns={'index': 'date'}, inplace=True)
    
    # Interpolate missing values using spline interpolation for specified columns
    for column in columns_to_interpolate:
        mask = np.isfinite(daily_df[column])
        x = np.arange(len(daily_df))
        spline = UnivariateSpline(x[mask], daily_df.loc[mask, column], s=0)
        daily_df[column] = spline(x)
        daily_df[column] = daily_df[column].round(2)  # Round to 2 decimal places
    
    # Save the processed data to a new CSV file
    processed_file_path = filtered_file_path.replace('_filtered.csv', '_processed.csv')
    daily_df.to_csv(processed_file_path, index=False)
    
    print(f"Processed data saved to {processed_file_path}")


Processed data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_103_processed.csv
Processed data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_107_processed.csv
Processed data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_109_processed.csv
Processed data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_110_processed.csv
Processed data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_145_processed.csv
Processed data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_174_processed.csv
Processed data saved to Data/dfl_traffic_count\dft_rawcount_local_authority_id_96_processed.csv
