In [1]:
# Step 1: Load libraries
import pandas as pd
import os

In [11]:
# Step 2: Load cleaned traffic data
# Location of the cleaned, aggregated traffic CSV (holiday flags already included).
TRAFFIC_DATA_PATH = "../data/processed/cleaned_traffic_data_aggregated_with_holidays.csv"

print(f"--- Loading Cleaned Traffic Data from: {TRAFFIC_DATA_PATH} ---")

try:
    df_traffic = pd.read_csv(TRAFFIC_DATA_PATH)

    # Parse 'Datetime' using the day/month/year layout; any value that does not
    # match the format is coerced to NaT instead of raising.
    df_traffic['Datetime'] = pd.to_datetime(
        df_traffic['Datetime'],
        format='%d/%m/%Y %H:%M',
        errors='coerce',
    )

    # Remove rows whose timestamp could not be parsed and report how many were lost.
    original_rows = len(df_traffic)
    df_traffic.dropna(subset=['Datetime'], inplace=True)
    dropped_rows = original_rows - len(df_traffic)
    if dropped_rows > 0:
        print(f"Warning: Dropped {dropped_rows} rows due to invalid Datetime format.")

    print("Traffic data loaded successfully.")

    # Preview the first rows, then the column dtypes / non-null counts.
    print("\nDataFrame Head:")
    display(df_traffic.head())

    print("\nDataFrame Info:")
    df_traffic.info()

except FileNotFoundError:
    # Report the missing path, then re-raise so later cells do not run on stale state.
    print(f"Error: File not found at {TRAFFIC_DATA_PATH}")
    raise
except Exception as e:
    # Any other failure (parsing, missing column, ...) is reported and re-raised.
    print(f"An error occurred while loading the traffic data: {type(e).__name__} - {e}")
    raise

--- Loading Cleaned Traffic Data from: ../data/processed/cleaned_traffic_data_aggregated_with_holidays.csv ---
Traffic data loaded successfully.

DataFrame Head:


Unnamed: 0,Datetime,Motorcycle,Car,Truck_Lorry,Bus,Date,Is_Holiday,Day_Of_Week,Is_Weekend,Hour,Month
0,2024-01-01 00:00:00,77,291,0,0,1/1/2024,1,0,0,0,1
1,2024-01-01 01:00:00,102,291,0,0,1/1/2024,1,0,0,1,1
2,2024-01-01 02:00:00,161,383,0,0,1/1/2024,1,0,0,2,1
3,2024-01-01 03:00:00,90,287,0,0,1/1/2024,1,0,0,3,1
4,2024-01-01 04:00:00,113,301,1,0,1/1/2024,1,0,0,4,1



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8781 entries, 0 to 8780
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Datetime     8781 non-null   datetime64[ns]
 1   Motorcycle   8781 non-null   int64         
 2   Car          8781 non-null   int64         
 3   Truck_Lorry  8781 non-null   int64         
 4   Bus          8781 non-null   int64         
 5   Date         8781 non-null   object        
 6   Is_Holiday   8781 non-null   int64         
 7   Day_Of_Week  8781 non-null   int64         
 8   Is_Weekend   8781 non-null   int64         
 9   Hour         8781 non-null   int64         
 10  Month        8781 non-null   int64         
dtypes: datetime64[ns](1), int64(9), object(1)
memory usage: 754.7+ KB


In [17]:
# Step 3: Load weather data
# Path to the raw weather CSV.
WEATHER_DATA_PATH = "../data/raw/Weather 2024.csv"

print(f"\n--- Loading and Processing Weather Data from: {WEATHER_DATA_PATH} ---")


def capitalize_col_name(col_name):
    """Return `col_name` with each underscore-separated part capitalized.

    A trailing '2m' or '10m' part is kept unchanged, so for example
    'relative_humidity_2m' becomes 'Relative_Humidity_2m'.

    Args:
        col_name: Column name to convert.

    Returns:
        The converted column name, joined with underscores.
    """
    parts = col_name.split('_')
    last_index = len(parts) - 1
    return '_'.join(
        # Keep '2m' / '10m' as-is only when they are the final part.
        part if (i == last_index and part in ['2m', '10m']) else part.capitalize()
        for i, part in enumerate(parts)
    )


try:
    weather = pd.read_csv(WEATHER_DATA_PATH)
    print(f"Loaded weather data: {len(weather)} rows.")

    # --- Parse weather timestamps ---
    # Stored in a new 'Datetime' column so the name matches the traffic data's merge key.
    weather['Datetime'] = pd.to_datetime(weather['datetime'])

    # Keep only the timestamp and the weather measurements needed for the merge.
    weather_to_merge = weather[[
        'Datetime',
        'temperature_2m',
        'relative_humidity_2m',
        'weathercode',
        'windspeed_10m'
    ]].copy()  # Use .copy() to avoid SettingWithCopyWarning

    # --- Rename columns by applying capitalize_col_name to every column name ---
    weather_to_merge = weather_to_merge.rename(columns=capitalize_col_name)

    print("\nRenamed columns:", weather_to_merge.columns.tolist())

    # Display the DataFrame with new column names
    display(weather_to_merge.head())

except FileNotFoundError:
    print(f"Error: Weather file not found at {WEATHER_DATA_PATH}")
    raise
except Exception as e:
    print(f"An error occurred during weather processing: {type(e).__name__} - {e}")
    raise


--- Loading and Processing Weather Data from: ../data/raw/Weather 2024.csv ---
Loaded holiday data: 8784 rows.

Renamed columns: ['Datetime', 'Temperature_2m', 'Relative_Humidity_2m', 'Weathercode', 'Windspeed_10m']


Unnamed: 0,Datetime,Temperature_2m,Relative_Humidity_2m,Weathercode,Windspeed_10m
0,2024-01-01 00:00:00,25.3,92,3,5.9
1,2024-01-01 01:00:00,25.0,91,3,6.3
2,2024-01-01 02:00:00,24.7,93,1,7.5
3,2024-01-01 03:00:00,24.5,94,2,8.4
4,2024-01-01 04:00:00,24.3,94,2,9.7


In [19]:
# Step 4: Merge weather data with the cleaned traffic data(combined with holidays)
# Only proceed when both inputs from the previous steps exist and contain rows.
if ('df_traffic' in locals() and not df_traffic.empty and
    'weather_to_merge' in locals() and not weather_to_merge.empty):

    print("\n--- Merging Weather Data into Traffic Data ---")

    # Ensure both 'Datetime' columns are datetime objects before merging;
    # values that cannot be parsed become NaT and those rows are dropped.
    if not pd.api.types.is_datetime64_any_dtype(df_traffic['Datetime']):
        df_traffic['Datetime'] = pd.to_datetime(df_traffic['Datetime'], errors='coerce')
        df_traffic.dropna(subset=['Datetime'], inplace=True)
        print("Converted df_traffic['Datetime'] to datetime objects.")

    if not pd.api.types.is_datetime64_any_dtype(weather_to_merge['Datetime']):
        weather_to_merge['Datetime'] = pd.to_datetime(weather_to_merge['Datetime'], errors='coerce')
        weather_to_merge.dropna(subset=['Datetime'], inplace=True)
        print("Converted weather_to_merge['Datetime'] to datetime objects.")

    # Left merge on 'Datetime' keeps every traffic row.
    # validate='many_to_one' makes pandas raise MergeError if a weather timestamp
    # appears more than once, instead of silently duplicating traffic rows.
    df_merged = pd.merge(
        df_traffic,
        weather_to_merge,
        on='Datetime',            # Column to match in both DataFrames
        how='left',               # Keep all rows from df_traffic
        validate='many_to_one',   # Weather timestamps must be unique
    )

    print("Merge completed.")

    # --- Verification ---
    # The two counts should be equal for a left merge against unique weather keys.
    print(f"Original traffic rows: {len(df_traffic)}")
    print(f"Merged DataFrame rows: {len(df_merged)}")

    # Check for NaNs introduced by the merge (indicates missing weather data for some hours)
    weather_cols = ['Temperature_2m', 'Relative_Humidity_2m', 'Weathercode', 'Windspeed_10m']
    nan_counts = df_merged[weather_cols].isna().sum()
    print("\nMissing weather values after merge:")
    print(nan_counts)

    print("\nMerged DataFrame with weather data (head):")
    display(df_merged.head())
    df_merged.info()

else:
    print("\n'df_traffic' or 'weather_to_merge' DataFrame not found or empty. Run previous steps first.")


--- Merging Weather Data into Traffic Data ---
Merge completed.
Original traffic rows: 8781
Merged DataFrame rows: 8781

Missing weather values after merge:
Temperature_2m          0
Relative_Humidity_2m    0
Weathercode             0
Windspeed_10m           0
dtype: int64

Merged DataFrame with weather data (head):


Unnamed: 0,Datetime,Motorcycle,Car,Truck_Lorry,Bus,Date,Is_Holiday,Day_Of_Week,Is_Weekend,Hour,Month,Temperature_2m,Relative_Humidity_2m,Weathercode,Windspeed_10m
0,2024-01-01 00:00:00,77,291,0,0,1/1/2024,1,0,0,0,1,25.3,92,3,5.9
1,2024-01-01 01:00:00,102,291,0,0,1/1/2024,1,0,0,1,1,25.0,91,3,6.3
2,2024-01-01 02:00:00,161,383,0,0,1/1/2024,1,0,0,2,1,24.7,93,1,7.5
3,2024-01-01 03:00:00,90,287,0,0,1/1/2024,1,0,0,3,1,24.5,94,2,8.4
4,2024-01-01 04:00:00,113,301,1,0,1/1/2024,1,0,0,4,1,24.3,94,2,9.7


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8781 entries, 0 to 8780
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Datetime              8781 non-null   datetime64[ns]
 1   Motorcycle            8781 non-null   int64         
 2   Car                   8781 non-null   int64         
 3   Truck_Lorry           8781 non-null   int64         
 4   Bus                   8781 non-null   int64         
 5   Date                  8781 non-null   object        
 6   Is_Holiday            8781 non-null   int64         
 7   Day_Of_Week           8781 non-null   int64         
 8   Is_Weekend            8781 non-null   int64         
 9   Hour                  8781 non-null   int64         
 10  Month                 8781 non-null   int64         
 11  Temperature_2m        8781 non-null   float64       
 12  Relative_Humidity_2m  8781 non-null   int64         
 13  Weathercode       

In [21]:
# Step 5: Save the dataframe
# Writes the merged frame to CSV; skipped when 'df_merged' is missing or empty.
if 'df_merged' in locals() and not df_merged.empty:
    FINAL_MERGED_PATH = "../data/processed/traffic_data_with_weather_holidays.csv"
    try:
        # Select columns to save by excluding intermediate columns.
        # NOTE(review): 'DayOfWeek' does not match the merged frame's column name
        # ('Day_Of_Week'), so only 'Date' is actually excluded here. Left as-is so the
        # saved schema does not change -- confirm whether 'Day_Of_Week' should be dropped.
        cols_to_save = [col for col in df_merged.columns if col not in ['Date', 'DayOfWeek']] # Example
        df_merged_to_save = df_merged[cols_to_save]

        # index=False keeps the row index out of the CSV.
        df_merged_to_save.to_csv(FINAL_MERGED_PATH, index=False)
        print(f"\nFinal merged data saved to: {FINAL_MERGED_PATH}")
    except Exception as e:
        # Report and re-raise (as the loading cells do) so a failed save is not
        # mistaken for a successful run.
        print(f"Error saving file: {e}")
        raise
else:
    print("\nNo final merged DataFrame ('df_merged') to save.")


Final merged data saved to: ../data/processed/traffic_data_with_weather_holidays.csv
