In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the preprocessed table that the rest of this notebook cleans up.
df = pd.read_csv('prepared_data/preprocessed_full_data.csv')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23385313 entries, 0 to 23385312
Data columns (total 39 columns):
 #   Column             Dtype  
---  ------             -----  
 0   detector_id        int64  
 1   hour               int64  
 2   quality            float64
 3   veh_total          int64  
 4   speed_total        float64
 5   veh_cars           int64  
 6   speed_cars         float64
 7   veh_trucks         int64  
 8   speed_trucks       float64
 9   timestamp          object 
 10  year               int64  
 11  day_of_week        int64  
 12  is_weekend         int64  
 13  month              int64  
 14  season             object 
 15  is_holiday         int64  
 16  is_rush_hour       int64  
 17  is_school_holiday  int64  
 18  free_flow_speed    float64
 19  speed_ratio        float64
 20  congestion_index   float64
 21  road_name          object 
 22  road_position      object 
 23  direction          object 
 24  lane               object 
 25  lon             

Unnamed: 0,detector_id,hour,quality,veh_total,speed_total,veh_cars,speed_cars,veh_trucks,speed_trucks,timestamp,...,precipitation,relative_humidity,visibility,cloud_cover,condition,icon,is_rain,is_snow,is_fog,is_dark
0,100101010000369,0,1.0,138,75.1,135,75.1,3,74.0,2015-01-01 00:00:00,...,0.0,99.0,4000.0,100.0,dry,cloudy,0.0,0.0,0.0,1.0
1,100101010000874,0,1.0,45,47.7,40,49.1,5,36.8,2015-01-01 00:00:00,...,0.0,99.0,4000.0,100.0,dry,cloudy,0.0,0.0,0.0,1.0
2,100101010000975,0,1.0,33,50.5,32,50.5,1,50.0,2015-01-01 00:00:00,...,0.0,99.0,4000.0,100.0,dry,cloudy,0.0,0.0,0.0,1.0
3,100101010001076,0,1.0,55,52.5,52,53.5,3,36.0,2015-01-01 00:00:00,...,0.0,99.0,4000.0,100.0,dry,cloudy,0.0,0.0,0.0,1.0
4,100101010001177,0,1.0,58,49.2,42,56.3,16,30.5,2015-01-01 00:00:00,...,0.0,99.0,4000.0,100.0,dry,cloudy,0.0,0.0,0.0,1.0


The main problem is clearly `cloud_cover`; however, we should also interpolate the remaining weather columns to fix any NaN values.

In [5]:
# Weather-related columns whose missing values are examined in this notebook.
weather_cols = [
    'temperature',
    'relative_humidity',
    'precipitation',
    'dew_point',
    'visibility',
    'cloud_cover',
    'is_rain',
    'is_snow',
    'is_fog',
    'is_dark',
    'condition',
    'icon',
]

# NaN count per weather column, worst offender first.
df[weather_cols].isnull().sum().sort_values(ascending=False)


cloud_cover          116941
visibility            14481
condition             10240
precipitation          7651
temperature            6403
relative_humidity      6403
dew_point              5981
is_rain                2016
is_snow                2016
is_fog                 2016
is_dark                2016
icon                   2016
dtype: int64

In [7]:
# List the distinct values (NaN included) of the two categorical weather fields.
for categorical_col in ("condition", "icon"):
    print(df[categorical_col].unique())

['dry' 'rain' 'snow' 'sleet' nan]
['cloudy' 'partly-cloudy-day' 'partly-cloudy-night' 'rain' 'snow' 'sleet'
 'clear-night' 'clear-day' nan]


In [6]:
# Build a one-row-per-timestamp weather table: keep the first row seen for
# each timestamp, sort by the timestamp column and renumber rows from 0.
weather_view_cols = ['timestamp', *weather_cols]
df_weather = df.loc[:, weather_view_cols].drop_duplicates(subset='timestamp', keep='first')
df_weather = df_weather.sort_values(by='timestamp').reset_index(drop=True)

print("Original df shape:", df.shape)
print("Collapsed weather shape:", df_weather.shape)


def detect_nan_streaks(series):
    """Return the lengths of all consecutive-NaN runs in ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to scan; missing entries are detected with ``Series.isna()``.

    Returns
    -------
    list of int
        One entry per run of consecutive missing values, in order of
        appearance. Empty when the series contains no missing values.
    """
    gap_lengths = []
    run = 0

    # The trailing False acts as a sentinel so that a run reaching the very
    # end of the series is flushed by the same branch as every other run.
    for missing in [*series.isna(), False]:
        if missing:
            run += 1
        elif run:
            gap_lengths.append(run)
            run = 0

    return gap_lengths

# Gap statistics per weather column, keyed by column name.
streak_summary = {}

for col in weather_cols:
    gap_lengths = detect_nan_streaks(df_weather[col])
    gap_count = len(gap_lengths)
    has_gaps = gap_count > 0
    streak_summary[col] = {
        "num_gaps": gap_count,
        "max_gap": max(gap_lengths) if has_gaps else 0,
        "mean_gap": sum(gap_lengths) / gap_count if has_gaps else 0,
        "streaks": gap_lengths,
    }

# Print summary.
# NOTE(review): gap sizes are row counts in df_weather; the "hours" label
# assumes one row per hour - confirm the timestamps are hourly and contiguous.
for col in streak_summary:
    stats = streak_summary[col]
    print(f"\n=== {col} ===")
    print(f"Number of gaps: {stats['num_gaps']}")
    print(f"Longest gap: {stats['max_gap']} hours")
    print(f"Mean gap size: {stats['mean_gap']:.2f}")


threshold = 12  # hours; only gaps strictly longer than this are reported

# For every weather column, list the start/end timestamp of each NaN run that
# is longer than `threshold` rows of df_weather.
# NOTE(review): lengths are row counts; reporting them as hours assumes
# df_weather holds one row per hour - confirm.
for col in weather_cols:
    is_nan = df_weather[col].isna().values

    # Locate every NaN run in a single pass: padding with False on both sides
    # gives each run a rising and a falling edge, which alternate in `edges`.
    padded = np.concatenate(([False], is_nan, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    run_starts = edges[::2]
    run_ends = edges[1::2] - 1  # inclusive position of the last NaN in each run
    run_lengths = run_ends - run_starts + 1

    # A column without NaNs simply yields no runs here. The previous
    # `max(streaks) > threshold` check raised ValueError on an empty list.
    is_long = run_lengths > threshold
    if not is_long.any():
        continue

    print(f"\n>>> Long NaN gaps for {col}:")
    for start, end, length in zip(run_starts[is_long], run_ends[is_long], run_lengths[is_long]):
        print(f"Start: {df_weather.loc[int(start), 'timestamp']}, "
              f"End: {df_weather.loc[int(end), 'timestamp']}, "
              f"Length: {int(length)} hours")


Original df shape: (23385313, 39)
Collapsed weather shape: (81052, 13)

=== temperature ===
Number of gaps: 21
Longest gap: 2 hours
Mean gap size: 1.05

=== relative_humidity ===
Number of gaps: 21
Longest gap: 2 hours
Mean gap size: 1.05

=== precipitation ===
Number of gaps: 25
Longest gap: 1 hours
Mean gap size: 1.00

=== dew_point ===
Number of gaps: 20
Longest gap: 1 hours
Mean gap size: 1.00

=== visibility ===
Number of gaps: 53
Longest gap: 2 hours
Mean gap size: 1.38

=== cloud_cover ===
Number of gaps: 144
Longest gap: 60 hours
Mean gap size: 3.40

=== is_rain ===
Number of gaps: 7
Longest gap: 1 hours
Mean gap size: 1.00

=== is_snow ===
Number of gaps: 7
Longest gap: 1 hours
Mean gap size: 1.00

=== is_fog ===
Number of gaps: 7
Longest gap: 1 hours
Mean gap size: 1.00

=== is_dark ===
Number of gaps: 7
Longest gap: 1 hours
Mean gap size: 1.00

=== condition ===
Number of gaps: 28
Longest gap: 8 hours
Mean gap size: 1.57

=== icon ===
Number of gaps: 7
Longest gap: 1 hours
M

In [9]:
# Fill the missing weather values of `df` in place: interpolate the numeric
# weather columns over time, forward/backward fill the categorical ones,
# rebuild the binary flags and cloud_cover from the filled values, and drop
# the columns that are no longer wanted.
# The steps are order-dependent: the flags in step 3 read the columns filled
# in steps 1-2, and step 4 reads the `icon` column filled in step 2.

# Ensure timestamp is datetime
df = df.reset_index(drop=True)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Set timestamp as index (required for time interpolation)
df = df.set_index('timestamp')

### 1. Interpolate continuous weather variables ###
interp_cols = [
    "temperature", "precipitation", "dew_point", 
    "relative_humidity", "visibility"
]
# method='time' interpolates against the DatetimeIndex values rather than row position.
# NOTE(review): the index contains many repeated timestamps (one row per
# detector per timestamp) - confirm the rows are in timestamp order before relying on this.
df[interp_cols] = df[interp_cols].interpolate(method='time')

### 2. Fill categorical weather fields (cannot interpolate) ###
# ffill copies the previous row's value into a gap; the trailing bfill only
# matters for NaNs at the very start of the frame.
# NOTE(review): both work in row order, so this presumably relies on the frame
# being sorted by timestamp - verify.
df['condition'] = df['condition'].ffill().bfill()
df['icon']      = df['icon'].ffill().bfill()

### 3. Recompute binary weather flags ###
# The flags are rebuilt for every row (not only where they were NaN), which
# also turns them into integer columns without missing values.
# Rain: any positive precipitation, or a condition label containing "rain".
df["is_rain"] = ((df["precipitation"] > 0) | df["condition"].str.contains("rain", case=False, na=False)).astype(int)
# Snow: condition label containing "snow" or "sleet".
df["is_snow"] = df["condition"].str.contains("snow|sleet", case=False, na=False).astype(int)
# Fog: visibility below 1000 (presumably metres - TODO confirm the unit).
df["is_fog"]  = (df["visibility"] < 1000).astype(int)

### 4. Create new cloud_cover estimate from icon ###

# Simple, clean mapping from icon → cloudiness level
cloud_map = {
    'clear-day':           0.0,
    'clear-night':         0.0,
    'partly-cloudy-day':   0.5,
    'partly-cloudy-night': 0.5,
    'cloudy':              1.0,
    'rain':                1.0,
    'snow':                1.0,
    'sleet':               1.0
}

# Replaces cloud_cover on every row, including rows that had a measured value;
# icons missing from cloud_map fall back to 0.5.
# NOTE(review): the preview of the loaded file shows cloud_cover = 100.0 for
# icon 'cloudy', so the input looks like a 0-100 scale while this estimate is
# 0-1 - confirm that downstream consumers expect the new scale.
df["cloud_cover"] = df["icon"].map(cloud_map).fillna(0.5)

### 5. Drop broken / unused columns ###
# errors='ignore' lets the drop succeed when some of these columns are absent.
df = df.drop(columns=["is_dark", "any_nan", "nan_group"], errors='ignore')

# Restore timestamp as a normal column
df = df.reset_index()


In [10]:
# Schema check after the weather clean-up: column list, dtypes and non-null info.
# The former `df.drop(columns=[], ...)` dropped nothing, and a `df.head()` that
# is not the last expression of a cell is never displayed, so both were dead code.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23385313 entries, 0 to 23385312
Data columns (total 38 columns):
 #   Column             Dtype         
---  ------             -----         
 0   timestamp          datetime64[ns]
 1   detector_id        int64         
 2   hour               int64         
 3   quality            float64       
 4   veh_total          int64         
 5   speed_total        float64       
 6   veh_cars           int64         
 7   speed_cars         float64       
 8   veh_trucks         int64         
 9   speed_trucks       float64       
 10  year               int64         
 11  day_of_week        int64         
 12  is_weekend         int64         
 13  month              int64         
 14  season             object        
 15  is_holiday         int64         
 16  is_rush_hour       int64         
 17  is_school_holiday  int64         
 18  free_flow_speed    float64       
 19  speed_ratio        float64       
 20  congestion_index   flo

In [13]:
# Reduce the cleaned frame to one row per timestamp (the first row seen for
# each timestamp), sorted by timestamp and renumbered from 0, keeping all columns.
first_row_per_timestamp = ~df.duplicated(subset=['timestamp'], keep='first')
df_weather = (
    df[first_row_per_timestamp]
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

print("Original df shape:", df.shape)
print("Collapsed weather shape:", df_weather.shape)


# NOTE(review): this function is also defined in the earlier gap-analysis cell;
# consider keeping a single definition near the top of the notebook.
def detect_nan_streaks(series):
    """Return the lengths of all consecutive-NaN runs in ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to scan; missing entries are detected with ``Series.isna()``.

    Returns
    -------
    list of int
        One entry per run of consecutive missing values, in order of
        appearance. Empty when the series contains no missing values.
    """
    gap_lengths = []
    run = 0

    # The trailing False acts as a sentinel so that a run reaching the very
    # end of the series is flushed by the same branch as every other run.
    for missing in [*series.isna(), False]:
        if missing:
            run += 1
        elif run:
            gap_lengths.append(run)
            run = 0

    return gap_lengths

# Re-run the gap statistics on every column of the cleaned frame.
streak_summary = {}

for col in df.columns:
    gaps = detect_nan_streaks(df_weather[col])
    if gaps:
        longest, average = max(gaps), sum(gaps) / len(gaps)
    else:
        longest, average = 0, 0
    streak_summary[col] = {
        "num_gaps": len(gaps),
        "max_gap": longest,
        "mean_gap": average,
        "streaks": gaps,
    }

# Print summary.
# NOTE(review): gap sizes are row counts in df_weather; the "hours" label
# assumes one row per hour - confirm.
for col in streak_summary:
    stats = streak_summary[col]
    print(f"\n=== {col} ===")
    print(f"Number of gaps: {stats['num_gaps']}")
    print(f"Longest gap: {stats['max_gap']} hours")
    print(f"Mean gap size: {stats['mean_gap']:.2f}")


threshold = 12  # hours

# For each column, report the NaN runs in df_weather that are strictly longer
# than `threshold` rows; columns without such runs print nothing.
for col in df.columns:
    nan_mask = df_weather[col].isna().values
    n_rows = len(nan_mask)

    # Collect (first_row, last_row) for every sufficiently long NaN run.
    long_runs = []
    pos = 0
    while pos < n_rows:
        if not nan_mask[pos]:
            pos += 1
            continue
        run_start = pos
        while pos < n_rows and nan_mask[pos]:
            pos += 1
        if pos - run_start > threshold:
            long_runs.append((run_start, pos - 1))

    if not long_runs:
        continue

    print(f"\n>>> Long NaN gaps for {col}:")
    for run_start, run_end in long_runs:
        print(f"Start: {df_weather.loc[run_start, 'timestamp']}, "
              f"End: {df_weather.loc[run_end, 'timestamp']}, "
              f"Length: {run_end - run_start + 1} hours")


Original df shape: (23385313, 38)
Collapsed weather shape: (81052, 38)

=== timestamp ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== detector_id ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== hour ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== quality ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== veh_total ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== speed_total ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== veh_cars ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== speed_cars ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== veh_trucks ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== speed_trucks ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== year ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 0.00

=== day_of_week ===
Number of gaps: 0
Longest gap: 0 hours
Mean gap size: 

In [14]:
# Persist the cleaned frame without the positional index.
# NOTE(review): this is the same path the notebook reads at the top, so the
# input file is overwritten in place. After that, `is_dark` is gone from the
# file while `weather_cols` still lists it, so a fresh Restart & Run All
# would presumably fail on the column selection - consider a separate output path.
output_csv = 'prepared_data/preprocessed_full_data.csv'
df.to_csv(output_csv, index=False)