 """
# Abu Dhabi Traffic Flow Vehicle Behavior Classification Project

## Research Objective:
Train a CNN-LSTM hybrid model using actual vehicle labels from Aimsun data:
- HDV Aggressive → Aggressive behavior
- HDV Conventional Gipps Model → Normal behavior
- HDV Cooperative → Cooperative behavior
- CAV → Autonomous vehicle (excluded from training)

## Model Validation:
Compare the CNN-LSTM predictions with the actual vehicle labels and report accuracy and F1 scores. The data is split into 80% training and 20% testing, and is fetched from Box through the Box API.
"""
# Startup banner printed when the notebook begins executing.
print("🚗 Abu Dhabi Traffic Flow Vehicle Behavior Classification System")
print("📊 Training with Actual Vehicle Labels from Data Files")


Check the GPU

In [None]:
# Run the `nvidia-smi` shell command (IPython `!` syntax, so this cell only
# works inside a notebook) and capture its output lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in the command output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Jul 15 08:51:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   68C    P0             31W /   70W |     250MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
from psutil import virtual_memory
# NOTE: this reads the `total` field, i.e. installed RAM, even though the
# message below calls it "available". Dividing by 1e9 yields decimal gigabytes
# (assuming psutil reports bytes — confirm against the psutil docs).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this cell uses to call a runtime "high-RAM".
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


flowchart TD

    A[Raw CSV Data] --> B[Data Parsing & Cleaning]
    B --> C[Feature Extraction]
    C --> D[Data Preparation]
    D --> E["Model Training (CNN-LSTM)"]
    E --> F[Prediction & Continuous Learning]
    F --> G[Behavior Output: Aggressive/Cooperative/Normal]


In [None]:
!pip install --upgrade tensorflow




In [None]:
!pip install boxsdk



In [None]:
!pip install "boxsdk[jwt]"



In [None]:
import os, json, numpy as np, pandas as pd, warnings
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (Input, LSTM, Conv1D, Dense, Dropout, BatchNormalization, Concatenate, GlobalMaxPooling1D, Masking)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
import matplotlib.pyplot as plt
import io
import json
import hashlib
from tensorflow.keras.layers import Lambda
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from boxsdk import Client, JWTAuth
import pandas as pd
import random
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the Box client from the JWT settings stored in 'key.json'
# (the file must exist in the working directory when this cell runs).
auth = JWTAuth.from_settings_file('key.json')
client = Client(auth)





# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Seed NumPy and TensorFlow for repeatable runs. Python's own `random`
# module is not seeded here.
np.random.seed(42)
tf.random.set_seed(42)
print("✅ All libraries imported successfully")


✅ All libraries imported successfully


Data Processing functions

In [None]:
def parse_array_string(array_str):
    """Parse the textual form of a numeric list into a Python object.

    Before parsing, every ``inf`` token (including the ``inf`` inside
    ``-inf``) is replaced by ``0`` and every ``nan`` token by ``None``.

    The text is parsed with ``ast.literal_eval`` rather than ``eval`` so that
    the contents of a data file can never execute code.

    Args:
        array_str: Cell value holding a list literal such as
            ``"[1.0, nan, 3.0]"``; may be missing (``None``/NaN).

    Returns:
        The parsed literal (normally a list). ``[]`` is returned when the
        value is missing or is not a valid Python literal.
    """
    import ast  # local import keeps this block self-contained

    try:
        if pd.isna(array_str):
            return []
        # '-inf' becomes '-0' through the 'inf' replacement, which still
        # evaluates to 0, so no separate '-inf' replacement is needed.
        cleaned = str(array_str).replace('inf', '0').replace('nan', 'None')
        return ast.literal_eval(cleaned)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Malformed or non-literal text is treated as "no data".
        return []

def interpolate_missing_values(sequence):
    """Fill missing entries (``None`` / NaN) of a numeric sequence.

    Infinite entries are never modified and are never used as anchors for
    interpolation.

    Behaviour by number of finite values in the sequence:
      * none     -> missing entries become ``0.0``;
      * exactly 1 -> missing entries take that single value;
      * 2 or more -> missing entries are linearly interpolated by index
        (``np.interp``, which holds the edge values outside the known range).

    Args:
        sequence: List of numbers, possibly containing ``None`` or NaN.

    Returns:
        A list of the same length, or ``[]`` for an empty input.
    """
    if not sequence:
        return []

    values = np.array(sequence, dtype=float)  # None is converted to NaN here
    is_gap = np.isnan(values)
    is_known = np.isfinite(values)            # False for both NaN and +/-inf
    known_idx = np.where(is_known)[0]

    # No finite anchor at all: zero the gaps, keep infinities.
    if len(known_idx) == 0:
        return [v if np.isinf(v) else 0.0 for v in values]

    # A single finite anchor: propagate it into every gap.
    if len(known_idx) == 1:
        anchor = values[known_idx[0]]
        return [anchor if np.isnan(v) else v for v in values]

    # General case: interpolate the gaps between finite anchors.
    filled = np.copy(values)
    gap_idx = np.where(is_gap)[0]
    if len(gap_idx) > 0:
        filled[gap_idx] = np.interp(gap_idx, known_idx, values[is_known])
    return filled.tolist()

def parse_coordinate_string(coord_str):
    """Parse a textual list of (x, y) pairs and fill missing values.

    ``inf`` tokens are replaced by ``0`` and ``nan`` tokens by ``None``
    before parsing. The x and y series are then passed separately through
    ``interpolate_missing_values``.

    The text is parsed with ``ast.literal_eval`` rather than ``eval`` so that
    the contents of a data file can never execute code.

    Args:
        coord_str: Cell value such as ``"[(1.0, 2.0), (nan, 3.0)]"``; may be
            missing (``None``/NaN).

    Returns:
        A list of ``(x, y)`` tuples, or ``[]`` when the value is missing,
        is not a valid literal, or does not consist of indexable pairs.
    """
    import ast  # local import keeps this block self-contained

    try:
        if pd.isna(coord_str):
            return []
        # '-inf' becomes '-0' through the 'inf' replacement (still 0).
        cleaned_str = str(coord_str).replace('inf', '0').replace('nan', 'None')
        coords = ast.literal_eval(cleaned_str)
        x_coords = [c[0] for c in coords]
        y_coords = [c[1] for c in coords]
        return list(zip(
            interpolate_missing_values(x_coords),
            interpolate_missing_values(y_coords)
        ))
    except (ValueError, TypeError, SyntaxError, IndexError, KeyError,
            MemoryError, RecursionError):
        # Malformed text or badly shaped pairs are treated as "no data".
        return []

def calculate_lateral_speeds(front_coords, rear_coords):
    """Return the per-step displacement magnitude of the front coordinates.

    For each consecutive pair of points the Euclidean distance
    ``sqrt(dx**2 + dy**2)`` is computed, so the result has one entry fewer
    than ``front_coords``. The result is passed through
    ``interpolate_missing_values`` before being returned.

    NOTE(review): despite the name, this is the total step distance, not a
    lateral component, and it is not divided by a time step — confirm that
    this is the intended quantity.

    Args:
        front_coords: Sequence of ``(x, y)`` points.
        rear_coords: Accepted for interface compatibility; not used.

    Returns:
        List of step distances, or ``[]`` when fewer than two points exist.
    """
    if len(front_coords) < 2:
        return []
    lateral_speeds = []
    for i in range(1, len(front_coords)):
        try:
            dx = front_coords[i][0] - front_coords[i-1][0]
            dy = front_coords[i][1] - front_coords[i-1][1]
            lateral_speeds.append(np.sqrt(dx**2 + dy**2))
        except (TypeError, IndexError, OverflowError):
            # A malformed or non-numeric point contributes a zero step
            # instead of aborting the whole vehicle.
            lateral_speeds.append(0)
    return interpolate_missing_values(lateral_speeds)


In [None]:
def extract_vehicle_labels(df):
    """Add a ``behavior_label`` column derived from ``VehTypeName``.

    The type name is matched case-sensitively against keywords, in this
    priority order:

      * contains ``CAV``         -> ``'autonomous'`` (excluded from training)
      * contains ``Aggressive``  -> ``'aggressive'``
      * contains ``Cooperative`` -> ``'cooperative'``
      * anything else (including ``Conventional`` / ``Gipps``) -> ``'normal'``

    Args:
        df: DataFrame with a ``VehTypeName`` column. It is modified in place.

    Returns:
        The same DataFrame object, with the new column added.
    """
    # Order matters: the first keyword found in the name wins.
    keyword_labels = (
        ('CAV', 'autonomous'),
        ('Aggressive', 'aggressive'),
        ('Cooperative', 'cooperative'),
    )

    def label_for(veh_type_name):
        name = str(veh_type_name).strip()
        for keyword, label in keyword_labels:
            if keyword in name:
                return label
        # Conventional / Gipps vehicles and unrecognised types alike.
        return 'normal'

    df['behavior_label'] = df['VehTypeName'].apply(label_for)
    return df

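Feature extraction. A record is discarded when 30 or more of its values are missing or invalid (half of the 60-step window used by the model), or when it has fewer than 5 time steps.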

In [None]:
def extract_speed_position_features(vehicle_row, min_sequence_length=5):
    """Build the speed and position arrays for one vehicle record.

    Reads the ``'Speeds'`` and ``'VehFrontCoords'`` fields of the row, fills
    missing speeds, and trims both series to their common length.

    Args:
        vehicle_row: Mapping/Series with ``'Speeds'`` and ``'VehFrontCoords'``.
        min_sequence_length: Minimum number of time steps required.

    Returns:
        ``{'speeds': (N, 1) array, 'positions': (N, 2) array,
        'sequence_length': N}``, or ``None`` when the record has 30 or more
        invalid values, is shorter than ``min_sequence_length``, or any
        error occurs (the error is printed).
    """
    def n_invalid(values):
        # Counts None entries and float NaN / infinity entries.
        bad = 0
        for item in values:
            if item is None:
                bad += 1
            elif isinstance(item, float) and (np.isnan(item) or np.isinf(item)):
                bad += 1
        return bad

    try:
        raw_speeds = parse_array_string(vehicle_row['Speeds'])
        # Coordinates come back already interpolated, as (x, y) tuples.
        coord_pairs = parse_coordinate_string(vehicle_row['VehFrontCoords'])

        # NOTE(review): the elements of coord_pairs are tuples, so they are
        # neither None nor float and the coordinate half of this check looks
        # like it can never trigger — confirm whether raw coordinates were
        # meant to be inspected instead.
        if n_invalid(raw_speeds) >= 30 or n_invalid(coord_pairs) >= 30:
            return None

        filled_speeds = interpolate_missing_values(raw_speeds)

        # Both series must cover the same time steps.
        n_steps = min(len(filled_speeds), len(coord_pairs))
        if n_steps < min_sequence_length:
            return None

        return {
            'speeds': np.array(filled_speeds[:n_steps]).reshape(-1, 1),    # shape (N, 1)
            'positions': np.array(coord_pairs[:n_steps]).reshape(-1, 2),   # shape (N, 2)
            'sequence_length': n_steps
        }
    except Exception as e:
        print(f"Feature extraction error: {e}")
        return None


Data reading function

In [None]:
def read_aimsun_data(file_path, column_map=None):
    """Load an Aimsun CSV export and rename its columns to canonical names.

    Args:
        file_path: Anything ``pd.read_csv`` accepts.
        column_map: Optional ``{canonical_name: csv_column_name}`` mapping.
            When omitted or empty, the built-in default mapping is used.

    Returns:
        The DataFrame with mapped columns renamed (unmapped columns keep
        their names), or ``None`` if reading or renaming fails (the error is
        printed).
    """
    DEFAULT_MAP = {
        'vehicle_id': 'VehNr',
        'timestep': 'Timestep',
        'speed': 'Speeds',
        'acceleration': 'Accelerations',
        'front_coords': 'VehFrontCoords',
        'rear_coords': 'VehRearCoords',
        'vehicle_type': 'VehTypeName',
        'length': 'Length',
    }
    mapping = column_map if column_map else DEFAULT_MAP
    try:
        frame = pd.read_csv(file_path)
        # Invert the mapping: rename() expects {csv_column: canonical_name}.
        csv_to_canonical = dict(zip(mapping.values(), mapping.keys()))
        return frame.rename(columns=csv_to_canonical)
    except Exception as e:
        print(f"Error reading {file_path}: {str(e)}")
        return None

def extract_vehicle_table(df, vehicle_id):
    """Return the time-series rows of one vehicle with a midpoint position.

    Expects the canonical column names (``vehicle_id``, ``front_coords``,
    ``rear_coords``, ``timestep``, ``speed``, ``acceleration``).

    Args:
        df: DataFrame using the canonical column names.
        vehicle_id: Value to match against the ``vehicle_id`` column.

    Returns:
        A DataFrame with columns ``timestep``, ``speed``, ``acceleration``
        and ``position``, where ``position`` is the element-wise mean of the
        parsed front and rear coordinates.
    """
    rows = df[df['vehicle_id'] == vehicle_id].copy()

    # Turn the coordinate strings into lists of (x, y) tuples.
    for coord_col in ('front_coords', 'rear_coords'):
        rows[coord_col] = rows[coord_col].apply(parse_coordinate_string)

    def midpoint(row):
        return np.mean([row['front_coords'], row['rear_coords']], axis=0)

    rows['position'] = rows.apply(midpoint, axis=1)
    return rows[['timestep', 'speed', 'acceleration', 'position']]

# Status message marking the end of this notebook cell.
print("✅ Data reading functions ready")


✅ Data reading functions ready


In [None]:
def process_labeled_data_speed_position(csv_files, folder_path, max_files=None):
    """Extract speed/position features and labels from local CSV files.

    Each file is read, labelled via ``extract_vehicle_labels`` and scanned
    row by row. Rows labelled ``'autonomous'`` and rows without usable
    features are skipped. A file that raises is reported and skipped.

    Args:
        csv_files: File names relative to ``folder_path``.
        folder_path: Directory containing the files.
        max_files: If truthy, only the first ``max_files`` names are used.

    Returns:
        ``(features, labels, vehicle_details)`` — three parallel lists.
    """
    features_out = []
    labels_out = []
    details_out = []
    n_valid = 0

    selected = csv_files if not max_files else csv_files[:max_files]

    for filename in selected:
        try:
            print(f"📄 Processing {filename}...")
            df = pd.read_csv(os.path.join(folder_path, filename))

            # Derive behaviour labels from VehTypeName.
            df = extract_vehicle_labels(df)

            print(f"   Labels in {filename}:")
            for label, count in df['behavior_label'].value_counts().items():
                print(f"     {label}: {count}")

            for _, row in df.iterrows():
                behavior = row['behavior_label']
                if behavior == 'autonomous':
                    continue

                features = extract_speed_position_features(row)
                if features is None:
                    continue

                features_out.append(features)
                labels_out.append(behavior)
                details_out.append({
                    'VehNr': row['VehNr'],
                    'VehTypeName': row['VehTypeName'],
                    'actual_label': behavior,
                    'file': filename
                })
                n_valid += 1

            # Running total across all files handled so far.
            print(f"   ✅ Extracted {n_valid} valid vehicles so far")
        except Exception as e:
            print(f"   ❌ Error: {e}")

    return features_out, labels_out, details_out


In [None]:
def prepare_speed_position_data(features_list, max_sequence=60):
    """Stack per-vehicle sequences into zero-padded model input arrays.

    Each sequence is truncated to ``max_sequence`` steps; shorter sequences
    are padded with zeros at the end. The output arrays are allocated up
    front, so an empty ``features_list`` still yields correctly shaped
    ``(0, max_sequence, k)`` arrays.

    Args:
        features_list: Iterable of dicts with ``'speeds'`` (N, 1) and
            ``'positions'`` (N, 2) entries.
        max_sequence: Number of time steps in the output.

    Returns:
        X_speed: float array of shape ``(num_samples, max_sequence, 1)``
        X_pos: float array of shape ``(num_samples, max_sequence, 2)``
    """
    features_list = list(features_list)
    n_samples = len(features_list)

    X_speed = np.zeros((n_samples, max_sequence, 1))
    X_pos = np.zeros((n_samples, max_sequence, 2))

    for row, features in enumerate(features_list):
        speeds = features['speeds'][:max_sequence]
        positions = features['positions'][:max_sequence]
        # Use each series' own length so a mismatch between the two cannot
        # cause a broadcasting error.
        X_speed[row, :len(speeds)] = speeds
        X_pos[row, :len(positions)] = positions

    return X_speed, X_pos


In [None]:
def build_speed_position_model(sequence_length=60, num_classes=3):
    """
    Build and compile the two-input CNN + LSTM behaviour classifier.

    The model takes a speed sequence and a position sequence, derives two
    extra channels inside the graph (speed difference and step distance),
    and feeds the resulting 3-channel sequence to a Conv1D branch and an
    LSTM branch whose outputs are concatenated before the dense head.

    Args:
        sequence_length: Number of time steps of both inputs.
        num_classes: Size of the softmax output.

    Returns:
        A compiled Keras ``Model`` with inputs ``[speed, position]``
        (optimizer ``adam``, loss ``sparse_categorical_crossentropy``, so
        targets are integer class indices). The model summary is printed.
    """
    # Inputs
    speed_input = Input(shape=(sequence_length, 1), name='speed')
    position_input = Input(shape=(sequence_length, 2), name='position')

    # Compute acceleration (difference between consecutive speeds)
    # The first time step has no predecessor, so a zero is prepended to
    # keep the sequence length unchanged.
    def compute_accel(x):
        return tf.concat([tf.zeros_like(x[:, :1, :]), x[:, 1:, :] - x[:, :-1, :]], axis=1)
    accel = Lambda(compute_accel, name='acceleration')(speed_input)

    # Compute lateral movement (Euclidean distance between consecutive positions)
    # This is the norm of the (dx, dy) step, zero for the first time step.
    def compute_lateral(pos):
        delta = pos[:, 1:, :] - pos[:, :-1, :]
        lateral = tf.concat([tf.zeros_like(delta[:, :1, :]), delta], axis=1)
        return tf.norm(lateral, axis=-1, keepdims=True)
    lateral = Lambda(compute_lateral, name='lateral')(position_input)

    # NOTE(review): when inputs are zero-padded, the first padded step yields
    # a non-zero difference (0 - last real value) in both derived channels —
    # confirm this boundary artefact is acceptable.

    # Concatenate all features
    features = Concatenate(axis=-1)([speed_input, accel, lateral])  # shape: (batch, seq, 3)

    # Masking for variable-length sequences
    # NOTE(review): presumably only steps whose three channels are all 0.0
    # get masked, and Conv1D may not honour the mask — confirm against the
    # Keras Masking documentation.
    masked = Masking(mask_value=0.0)(features)

    # CNN branch
    cnn = Conv1D(64, 3, activation='relu')(masked)
    cnn = BatchNormalization()(cnn)
    cnn = Dropout(0.3)(cnn)
    cnn = Conv1D(128, 3, activation='relu')(cnn)
    cnn = BatchNormalization()(cnn)
    cnn_out = GlobalMaxPooling1D()(cnn)

    # LSTM branch
    lstm = LSTM(128, return_sequences=True, unroll=True, use_cudnn=False)(masked)
    lstm = BatchNormalization()(lstm)
    lstm_out = LSTM(64)(lstm)

    # Feature fusion
    combined = Concatenate()([cnn_out, lstm_out])
    x = Dense(256, activation='relu')(combined)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(128, activation='relu')(x)
    output = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=[speed_input, position_input], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    print("🧠 Model Architecture:")
    model.summary()
    return model

graph TD

    TS[Time Series Input<br/>60 x 3] --> Mask[Masking Layer]
    Mask --> CNN1[Conv1D 64<br/>Kernel=3]
    CNN1 --> BN1[BatchNorm]
    BN1 --> Drop1[Dropout 0.3]
    Drop1 --> CNN2[Conv1D 128<br/>Kernel=3]
    CNN2 --> BN2[BatchNorm]
    BN2 --> GMP[GlobalMaxPooling1D]
    
    Mask --> LSTM1[LSTM 128<br/>return_sequences=True]
    LSTM1 --> BN3[BatchNorm]
    BN3 --> LSTM2[LSTM 64]
    
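    %% NOTE: build_speed_position_model above takes only speed and position inputs; the static-feature branch drawn below is not part of that model.
    S[Static Features<br/>13 dims] --> Dense1[Dense 64]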
    Dense1 --> BN4[BatchNorm]
    
    GMP --> Concat[Concatenate]
    LSTM2 --> Concat
    BN4 --> Concat
    
    Concat --> Dense2[Dense 256]
    Dense2 --> BN5[BatchNorm]
    BN5 --> Drop2[Dropout 0.5]
    Drop2 --> Dense3[Dense 128]
    Dense3 --> Output[Softmax Output<br/>3 Classes]


Calculate the F1 score

In [None]:
def classify_vehicle_behavior_speed_position(vehicle_data, model, max_sequence=60):
    """Predict the behaviour class of a single vehicle record.

    Args:
        vehicle_data: Row accepted by ``extract_speed_position_features``.
        model: Object whose ``predict`` takes ``[speed, position]`` batches.
        max_sequence: Sequence length the inputs are truncated/padded to.

    Returns:
        ``'aggressive'``, ``'cooperative'`` or ``'normal'``; ``"unknown"``
        when no features could be extracted or the predicted index is not
        one of 0, 1, 2.
    """
    extracted = extract_speed_position_features(vehicle_data)
    if not extracted:
        return "unknown"

    speed_seq = extracted['speeds'][:max_sequence]
    pos_seq = extracted['positions'][:max_sequence]
    n_steps = min(len(speed_seq), max_sequence)

    # Single-sample batches, zero-padded at the end.
    speed_batch = np.zeros((1, max_sequence, 1))
    pos_batch = np.zeros((1, max_sequence, 2))
    speed_batch[0, :n_steps] = speed_seq
    pos_batch[0, :n_steps] = pos_seq

    probabilities = model.predict([speed_batch, pos_batch])

    # Index -> label mapping hard-coded in this function.
    label_by_index = {0: 'aggressive', 1: 'cooperative', 2: 'normal'}
    best_index = np.argmax(probabilities)
    return label_by_index.get(best_index, 'unknown')


In [None]:
def analyze_f1_performance(comparisons, actual_labels, predictions):
    """Print a classification report, F1 by confidence band, and error counts.

    Args:
        comparisons: List of dicts with at least the keys ``'actual'``,
            ``'predicted'``, ``'correct'`` and ``'confidence'``.
        actual_labels: Ground-truth labels, parallel to ``predictions``.
        predictions: Predicted labels.

    Returns:
        None. All results are printed.
    """
    # Classification report
    print("\n📋 Detailed Classification Report:")
    print("=" * 60)
    print(classification_report(actual_labels, predictions))

    # F1 scores by confidence ranges
    df_comp = pd.DataFrame(comparisons)
    confidence_ranges = [(0.0, 0.5), (0.5, 0.7), (0.7, 0.9), (0.9, 1.0)]
    confidence = df_comp['confidence']

    print("\n📊 F1 Score by Confidence Ranges:")
    print("-" * 50)
    last_band = len(confidence_ranges) - 1
    for band, (low, high) in enumerate(confidence_ranges):
        # Bands are half-open [low, high), except the last one, which is
        # closed so that a confidence of exactly 1.0 is not dropped.
        below_upper = confidence <= high if band == last_band else confidence < high
        subset = df_comp[(confidence >= low) & below_upper]
        if len(subset) > 0:
            subset_f1 = f1_score(subset['actual'], subset['predicted'], average='macro')
            print(f"Confidence {low:.1f}-{high:.1f}: F1={subset_f1:.3f} (n={len(subset)})")

    # Misclassified vehicles analysis
    print("\n❌ Misclassified Vehicles Analysis:")
    misclassified = df_comp[~df_comp['correct'].astype(bool)]
    if not misclassified.empty:
        print(f"Total misclassified: {len(misclassified)}")
        for behavior in ['aggressive', 'cooperative', 'normal']:
            behavior_miss = misclassified[misclassified['actual'] == behavior]
            if len(behavior_miss) > 0:
                print(f"  {behavior}: {len(behavior_miss)} vehicles")

def plot_f1_results(comparisons, actual_labels, predictions):
    """Draw a 2x2 figure summarising the evaluation.

    Panels: per-class F1 bars, confidence histogram split by correctness,
    confusion matrix, and accuracy vs. macro-F1. The per-class and confusion
    panels are only drawn when more than one class is present.

    Args:
        comparisons: List of dicts with ``'correct'`` and ``'confidence'``.
        actual_labels: List of ground-truth labels.
        predictions: List of predicted labels.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    ax_f1 = axes[0, 0]
    ax_conf = axes[0, 1]
    ax_cm = axes[1, 0]
    ax_summary = axes[1, 1]

    classes = sorted(set(actual_labels + predictions))
    has_multiple_classes = len(classes) > 1

    # Panel 1: F1 per class.
    if has_multiple_classes:
        class_f1 = f1_score(actual_labels, predictions, average=None, labels=classes)
        ax_f1.bar(classes, class_f1, color=['#ff9999','#66b3ff','#99ff99'])
        ax_f1.set_title('F1 Score by Vehicle Behavior Class')
        ax_f1.set_ylabel('F1 Score')
        ax_f1.set_ylim(0, 1.1)

    # Panel 2: confidence of correct vs. incorrect predictions.
    df_comp = pd.DataFrame(comparisons)
    hit = df_comp['correct']
    conf_correct = df_comp[hit]['confidence']
    conf_wrong = df_comp[~hit]['confidence']
    ax_conf.hist([conf_correct, conf_wrong],
                 bins=20, alpha=0.7, label=['Correct', 'Incorrect'])
    ax_conf.set_title('Prediction Confidence Distribution')
    ax_conf.set_xlabel('Confidence')
    ax_conf.legend()

    # Panel 3: confusion matrix.
    if has_multiple_classes:
        from sklearn.metrics import confusion_matrix
        import seaborn as sns
        matrix = confusion_matrix(actual_labels, predictions, labels=classes)
        sns.heatmap(matrix, annot=True, fmt='d', xticklabels=classes, yticklabels=classes, ax=ax_cm)
        ax_cm.set_title('Confusion Matrix')
        ax_cm.set_xlabel('Predicted')
        ax_cm.set_ylabel('Actual')

    # Panel 4: accuracy next to macro F1 (accuracy is reused for F1 when
    # only a single class is present).
    accuracy = df_comp['correct'].mean()
    if has_multiple_classes:
        f1_macro = f1_score(actual_labels, predictions, average='macro')
    else:
        f1_macro = accuracy

    ax_summary.bar(['Accuracy', 'F1-Macro'], [accuracy, f1_macro],
                   color=['lightblue', 'lightcoral'])
    ax_summary.set_title('Accuracy vs F1 Score')
    ax_summary.set_ylabel('Score')
    ax_summary.set_ylim(0, 1.1)

    plt.tight_layout()
    plt.show()

# Status message marking the end of this notebook cell.
print("✅ F1 analysis and visualization functions ready")


✅ F1 analysis and visualization functions ready


In [None]:
def compare_predictions_with_labels_f1_speed_position(model, features_list, vehicle_details, label_encoder, max_sequence=60):
    """
    Compare model predictions with actual vehicle labels using F1 score.

    All vehicles are predicted in one batched ``model.predict`` call. The
    first 20 comparisons and the aggregate metrics are printed.

    Args:
        model: Trained model taking ``[speed, position]`` inputs.
        features_list: Feature dicts, parallel to ``vehicle_details``.
        vehicle_details: Dicts with ``'VehNr'``, ``'actual_label'``, ``'file'``.
        label_encoder: Fitted encoder used to map class indices to labels.
        max_sequence: Sequence length used for padding.

    Returns:
        ``(comparisons, accuracy, f1_macro, f1_weighted)``. ``accuracy`` is
        ``0.0`` for an empty input; the F1 values fall back to ``0.0`` when
        the metrics cannot be computed.

    Raises:
        ValueError: If there are fewer feature entries than vehicles.
    """
    predictions = []
    actual_labels = []
    comparisons = []

    print("🔍 Comparing predictions with actual labels (F1 Score Evaluation)...")
    print("-" * 80)
    print(f"{'VehNr':<8} {'Actual Label':<15} {'Predicted':<15} {'Match':<8} {'Confidence':<12}")
    print("-" * 80)

    n_vehicles = len(vehicle_details)
    if len(features_list) < n_vehicles:
        raise ValueError(
            f"features_list has {len(features_list)} entries but "
            f"vehicle_details has {n_vehicles}"
        )

    correct_predictions = 0

    X_speed, X_pos = prepare_speed_position_data(features_list, max_sequence=max_sequence)

    # One batched call instead of one predict() per vehicle.
    if n_vehicles:
        all_probs = model.predict([X_speed[:n_vehicles], X_pos[:n_vehicles]], verbose=0)
    else:
        all_probs = []

    for i, vehicle_info in enumerate(vehicle_details):
        prediction_probs = all_probs[i]
        predicted_class_idx = np.argmax(prediction_probs)
        predicted_label = label_encoder.inverse_transform([predicted_class_idx])[0]
        confidence = np.max(prediction_probs)
        actual_label = vehicle_info['actual_label']
        is_correct = predicted_label == actual_label

        if is_correct:
            correct_predictions += 1

        predictions.append(predicted_label)
        actual_labels.append(actual_label)
        comparisons.append({
            'VehNr': vehicle_info['VehNr'],
            'actual': actual_label,
            'predicted': predicted_label,
            'correct': is_correct,
            'confidence': confidence,
            'file': vehicle_info['file']
        })

        # Only the first 20 rows are echoed to keep the output readable.
        if i < 20:
            match_symbol = "✅" if is_correct else "❌"
            print(f"{vehicle_info['VehNr']:<8} {actual_label:<15} {predicted_label:<15} {match_symbol:<8} {confidence:.3f}")

    # Computed outside the try block so it is always defined at return time.
    accuracy = correct_predictions / n_vehicles if n_vehicles else 0.0

    # Calculate F1 scores
    try:
        f1_macro = f1_score(actual_labels, predictions, average='macro')
        f1_weighted = f1_score(actual_labels, predictions, average='weighted')
        f1_per_class = f1_score(actual_labels, predictions, average=None, labels=label_encoder.classes_)
        precision_macro = precision_score(actual_labels, predictions, average='macro')
        recall_macro = recall_score(actual_labels, predictions, average='macro')

        print("-" * 80)
        print(f"🎯 Overall Accuracy: {accuracy:.3f} ({correct_predictions}/{n_vehicles})")
        print(f"📊 F1 Score (Macro): {f1_macro:.3f}")
        print(f"📊 F1 Score (Weighted): {f1_weighted:.3f}")
        print(f"📊 Precision (Macro): {precision_macro:.3f}")
        print(f"📊 Recall (Macro): {recall_macro:.3f}")

        print("\n🏷️ F1 Score per Class:")
        for class_name, class_f1 in zip(label_encoder.classes_, f1_per_class):
            print(f"  {class_name:15s}: {class_f1:.3f}")
    except Exception as e:
        print(f"⚠️ Could not calculate F1 scores: {e}")
        f1_macro = f1_weighted = 0.0

    return comparisons, accuracy, f1_macro, f1_weighted


In [None]:
import io
import json
import os
import hashlib

class ContinuousLearningSystem:
    """Incrementally train the behaviour classifier on CSV files from Box.

    The instance keeps a model, an optional scaler and a label encoder
    (each loaded from disk if present), plus a set of SHA-256 hashes of the
    CSV files already consumed, so that each file is used only once.

    Args:
        model_path: Where the Keras model is loaded from / saved to.
        scaler_path: Where the scaler is loaded from / saved to (joblib).
        le_path: Where the label encoder is loaded from / saved to (joblib).
        data_path: Stored on the instance; not used by any method here.
        processed_path: JSON file holding the list of processed file hashes.
        box_client: Box client used to open sub-folders.
        box_folder: Box folder that is searched recursively for CSV files.
    """

    # Labels that extract_vehicle_labels assigns to trainable vehicles
    # ('autonomous' rows are skipped before training).
    BEHAVIOR_CLASSES = ('aggressive', 'cooperative', 'normal')

    def __init__(self, model_path, scaler_path, le_path, data_path, processed_path, box_client=None, box_folder=None):
        self.model_path = model_path
        self.scaler_path = scaler_path
        self.le_path = le_path
        self.data_path = data_path
        self.processed_path = processed_path
        self.processed_hashes = self.load_processed_hashes()
        self.box_client = box_client
        self.box_folder = box_folder

        # Load existing model artifacts if they exist
        self.model = self.load_model()
        self.scaler = self.load_scaler()
        self.label_encoder = self.load_label_encoder()

    # ------------------------------------------------------------------
    # Loading persisted state
    # ------------------------------------------------------------------
    def load_processed_hashes(self):
        """Return the set of stored file hashes (empty on absence or error)."""
        try:
            if os.path.exists(self.processed_path):
                with open(self.processed_path, 'r') as f:
                    return set(json.load(f))
            return set()
        except Exception as e:
            print(f"Error loading processed hashes: {e}")
            return set()

    def load_model(self):
        """Return the saved Keras model, or ``None`` if absent or unreadable."""
        try:
            if os.path.exists(self.model_path):
                print(f"Loading existing model from {self.model_path}")
                # Resolves to the module-level Keras load_model function.
                return load_model(self.model_path)
            print("No existing model found. A new model will be built.")
            return None
        except Exception as e:
            print(f"Error loading model from {self.model_path}: {e}")
            return None

    @staticmethod
    def _load_joblib(path, what):
        """Load a joblib artifact from ``path``; ``None`` if absent or unreadable."""
        try:
            if os.path.exists(path):
                print(f"Loading existing {what} from {path}")
                return joblib.load(path)
            print(f"No existing {what} found.")
            return None
        except Exception as e:
            print(f"Error loading {what} from {path}: {e}")
            return None

    def load_scaler(self):
        """Return the saved scaler, or ``None``."""
        return self._load_joblib(self.scaler_path, "scaler")

    def load_label_encoder(self):
        """Return the saved label encoder, or ``None``."""
        return self._load_joblib(self.le_path, "label encoder")

    # ------------------------------------------------------------------
    # Saving state
    # ------------------------------------------------------------------
    def save_processed_hashes(self):
        """Write the processed-hash set to ``processed_path`` as a JSON list."""
        try:
            with open(self.processed_path, 'w') as f:
                json.dump(list(self.processed_hashes), f)
        except Exception as e:
            print(f"Error saving processed hashes: {e}")

    def save_model(self):
        """Save the model to ``model_path`` if one is set."""
        try:
            if self.model is not None:
                self.model.save(self.model_path)
                print(f"Model saved to {self.model_path}")
        except Exception as e:
            print(f"Error saving model to {self.model_path}: {e}")

    @staticmethod
    def _save_joblib(obj, path, what):
        """Dump ``obj`` to ``path`` with joblib; does nothing when ``obj`` is None."""
        try:
            if obj is not None:
                joblib.dump(obj, path)
                print(f"{what.capitalize()} saved to {path}")
        except Exception as e:
            print(f"Error saving {what} to {path}: {e}")

    def save_scaler(self):
        """Save the scaler to ``scaler_path`` if one is set."""
        self._save_joblib(self.scaler, self.scaler_path, "scaler")

    def save_label_encoder(self):
        """Save the label encoder to ``le_path`` if one is set."""
        self._save_joblib(self.label_encoder, self.le_path, "label encoder")

    # ------------------------------------------------------------------
    # Box access
    # ------------------------------------------------------------------
    def csv_stream_hash(self, csv_stream):
        """Return the SHA-256 hex digest of the whole text stream.

        The stream position is restored afterwards.
        """
        pos = csv_stream.tell()
        csv_stream.seek(0)
        content = csv_stream.read()
        csv_stream.seek(pos)  # Restore position
        return hashlib.sha256(content.encode('utf-8')).hexdigest()

    def stream_all_csv_files_from_box(self, folder):
        """Yield (file_name, csv_stream) for every CSV file in Box folder and subfolders."""
        # NOTE(review): confirm that limit=2000 is an accepted page size for
        # the Box API.
        for item in folder.get_items(limit=2000):
            if item.type == 'folder':
                yield from self.stream_all_csv_files_from_box(self.box_client.folder(item.id))
            elif item.type == 'file' and item.name.endswith('.csv'):
                file_content = item.content()
                csv_stream = io.StringIO(file_content.decode('utf-8'))
                yield item.name, csv_stream

    def get_new_box_files(self):
        """Return ``(file_name, csv_stream, file_hash)`` for unprocessed CSVs.

        Returns an empty list when no Box client or folder is configured.
        """
        new_files = []
        if self.box_client is None or self.box_folder is None:
            print("Box client or folder not configured.")
            return new_files

        for file_name, csv_stream in self.stream_all_csv_files_from_box(self.box_folder):
            file_hash = self.csv_stream_hash(csv_stream)
            if file_hash not in self.processed_hashes:
                new_files.append((file_name, csv_stream, file_hash))
        return new_files

    # ------------------------------------------------------------------
    # Feature extraction
    # ------------------------------------------------------------------
    @staticmethod
    def _iter_trainable_rows(df):
        """Yield ``(row, features)`` for non-autonomous rows with usable features."""
        for _, row in df.iterrows():
            if row['behavior_label'] == 'autonomous':
                continue
            features = extract_speed_position_features(row)
            if features is not None:
                yield row, features

    def process_labeled_data_streams(self, file_streams):
        """Extract features and labels from in-memory CSV streams.

        Args:
            file_streams: Iterable of tuples whose first two items are
                ``(file_name, csv_stream)``; extra items (such as the file
                hash returned by ``get_new_box_files``) are ignored.

        Returns:
            ``(features, labels, vehicle_details)`` — three parallel lists.
            A file that raises is reported and skipped.
        """
        all_features = []
        all_labels = []
        vehicle_details = []
        processed_count = 0

        for entry in file_streams:
            file_name, csv_stream = entry[0], entry[1]
            try:
                print(f"📄 Processing {file_name} from Box stream...")
                df = pd.read_csv(csv_stream)
                df = extract_vehicle_labels(df)
                print(f"   Labels in {file_name}:")
                for label, count in df['behavior_label'].value_counts().items():
                    print(f"     {label}: {count}")
                for row, features in self._iter_trainable_rows(df):
                    all_features.append(features)
                    all_labels.append(row['behavior_label'])
                    vehicle_details.append({
                        'VehNr': row['VehNr'],
                        'VehTypeName': row['VehTypeName'],
                        'actual_label': row['behavior_label'],
                        'file': file_name
                    })
                    processed_count += 1
                print(f"   ✅ Extracted {processed_count} valid vehicles so far")
            except Exception as e:
                print(f"   ❌ Error processing {file_name}: {e}")

        return all_features, all_labels, vehicle_details

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def update_model(self, max_sequence=60):
        """Train the model on Box CSV files that have not been processed yet.

        Steps: collect new files, extract speed/position features, fit a
        label encoder if none exists, build a model if none exists, train
        for 3 epochs, save all artifacts, and record the file hashes.

        The scaler is passed through unchanged: the extracted features hold
        only speed and position sequences, so there is nothing to scale.

        Args:
            max_sequence: Sequence length used for padding and, when a new
                model is built, for its input shape. It must match the input
                length of an already loaded model.

        Returns:
            ``(model, scaler, label_encoder)``. The current values are
            returned unchanged when there is nothing new to train on.

        Raises:
            ValueError: Propagated from the label encoder if an existing
                encoder meets a label it was not fitted on.
        """
        model = self.model
        scaler = self.scaler
        label_encoder = self.label_encoder

        new_files = self.get_new_box_files()
        if not new_files:
            print("No new files to process from Box")
            return model, scaler, label_encoder

        new_features, new_labels, _ = self.process_labeled_data_streams(new_files)

        if not new_features:
            print("No valid features extracted from new Box files")
            return model, scaler, label_encoder

        if label_encoder is None:
            print("Fitting new label encoder on new data")
            label_encoder = LabelEncoder()
            # Include the full known class set so the class indices and the
            # model's output size stay stable even if the first batch
            # happens to lack one behaviour.
            label_encoder.fit(sorted(set(new_labels) | set(self.BEHAVIOR_CLASSES)))
            self.label_encoder = label_encoder

        X_speed_new, X_pos_new = prepare_speed_position_data(new_features, max_sequence=max_sequence)
        y_new = label_encoder.transform(new_labels)

        if model is None:
            print("Building a new model for initial training.")
            model = build_speed_position_model(
                sequence_length=max_sequence,
                num_classes=len(label_encoder.classes_),
            )
            self.model = model

        print(f"Training model with {len(new_features)} new samples")
        self.model.fit([X_speed_new, X_pos_new], y_new, epochs=3, batch_size=32, validation_split=0.1)

        self.save_model()
        self.save_scaler()
        self.save_label_encoder()

        # Mark files as consumed only after training and saving succeeded.
        for _, _, fhash in new_files:
            self.processed_hashes.add(fhash)
        self.save_processed_hashes()

        print(f"Updated model with {len(new_features)} new samples")
        return self.model, self.scaler, self.label_encoder

    # ------------------------------------------------------------------
    # Bulk loading
    # ------------------------------------------------------------------
    def load_all_data_from_box(self, max_files=50):
        """
        Load vehicle data from unprocessed Box files into a single list.
        Each item: {'features': ..., 'label': ..., 'details': {...}}

        Args:
            max_files: Stop after this many files have been loaded;
                ``None`` loads every unprocessed file.

        Returns:
            List of records. Empty when no Box folder is configured.

        NOTE(review): files are marked as processed as soon as they are
        loaded here, before any training has happened — confirm that this
        is intended.
        """
        all_data = []
        if self.box_folder is None:
            print("Box client or folder not configured.")
            return all_data

        file_counter = 0

        for file_name, csv_stream in self.stream_all_csv_files_from_box(self.box_folder):
            fhash = self.csv_stream_hash(csv_stream)
            if fhash in self.processed_hashes:
                continue

            try:
                csv_stream.seek(0)
                df = pd.read_csv(csv_stream)
                df = extract_vehicle_labels(df)

                for row, features in self._iter_trainable_rows(df):
                    all_data.append({
                        'features': features,
                        'label': row['behavior_label'],
                        'details': {
                            'VehNr': row['VehNr'],
                            'VehTypeName': row['VehTypeName'],
                            'file': file_name
                        }
                    })
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
                continue

            self.processed_hashes.add(fhash)
            file_counter += 1
            if file_counter % 10 == 0:
                print(f"Files loaded: {file_counter}")
                self.save_processed_hashes()
            # Leave the loop rather than returning, so the final save below
            # always persists the hashes of the files loaded last.
            if max_files is not None and file_counter >= max_files:
                break

        self.save_processed_hashes()
        return all_data


Extract the data from the folder

In [None]:
# Mount Google Drive in the Colab runtime so artifacts can be written to it.
from google.colab import drive
drive.mount('/content/drive')

# Output locations on Drive; these are passed to ContinuousLearningSystem
# in the training cell further down.
DRIVE_PATH = '/content/drive/MyDrive/Test_Output'
MODEL_PATH = f'{DRIVE_PATH}/traffic_model.keras'
SCALER_PATH = f'{DRIVE_PATH}/scaler.joblib'
LE_PATH = f'{DRIVE_PATH}/label_encoder.joblib'
# NOTE(review): presumably the JSON file of already-processed file hashes — confirm.
PROCESSED_PATH = f'{DRIVE_PATH}/processed.json'

# Create the output directory if it does not exist yet.
os.makedirs(DRIVE_PATH, exist_ok=True)

print("✅ All output and data will be stored in:", DRIVE_PATH)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ All output and data will be stored in: /content/drive/MyDrive/Test_Output


In [None]:
# Resolve the Box folder that contains the project data.
main_folder_id = '326383492292'
main_folder = client.folder(main_folder_id).get()

# Take the first child folder named "Parsed Time Series Data" (None if absent).
parsed_folder = next(
    (
        client.folder(child.id)
        for child in main_folder.get_items(limit=100)
        if child.type == 'folder' and child.name == 'Parsed Time Series Data'
    ),
    None,
)

if not parsed_folder:
    raise Exception("Parsed Time Series Data folder not found.")

In [None]:
def stream_all_csv_files_from_box(folder, client):
    """
    Walk a Box folder tree depth-first and yield its CSV files.

    Sub-folders are re-opened through ``client.folder(id)`` and traversed
    recursively. Every file whose name ends in '.csv' is downloaded, decoded
    as UTF-8 and yielded as ``(file_name, io.StringIO)``; anything else is
    ignored.
    """
    for entry in folder.get_items(limit=None):
        kind = entry.type
        if kind == 'folder':
            subfolder = client.folder(entry.id)
            yield from stream_all_csv_files_from_box(subfolder, client)
            continue
        if kind != 'file' or not entry.name.endswith('.csv'):
            continue
        text = entry.content().decode('utf-8')
        yield entry.name, io.StringIO(text)


In [None]:
def split_train_test(all_data, train_ratio=0.8, seed=42):
    """
    Randomly split records into a training list and a test list.

    The shuffle runs on a copy with a private ``random.Random(seed)``, so
    neither the caller's list nor the global ``random`` state is modified.
    For a given ``seed`` the resulting order is the same one that
    ``random.seed(seed); random.shuffle(...)`` would produce.

    Args:
        all_data: Sequence of records to split.
        train_ratio: Fraction of the records placed in the training list.
        seed: Seed for the shuffle; ``None`` gives a non-reproducible split.

    Returns:
        (train_data, test_data): two new lists. The first holds the first
        ``int(len(all_data) * train_ratio)`` shuffled records, the second
        holds the remainder.
    """
    shuffled = list(all_data)
    random.Random(seed).shuffle(shuffled)
    split_idx = int(len(shuffled) * train_ratio)
    return shuffled[:split_idx], shuffled[split_idx:]

In [None]:
def unpack_data(data_list):
    """
    Split a list of record dicts into three parallel lists.

    Returns (features, labels, details), read from each record's
    'features', 'label' and 'details' keys in input order.
    """
    record_keys = ('features', 'label', 'details')
    features, labels, details = (
        [record[key] for record in data_list] for key in record_keys
    )
    return features, labels, details


In [None]:
cl_system = ContinuousLearningSystem(MODEL_PATH, SCALER_PATH, LE_PATH, "", PROCESSED_PATH, client, parsed_folder)


# 1. Load all data
all_data = cl_system.load_all_data_from_box()

# 2. Split into train and test sets
train_data, test_data = split_train_test(all_data, train_ratio=0.8)
if not train_data:
    # The loader skips files whose hash is already recorded, so a re-run can
    # come back empty. Stop here with a clear message instead of failing deep
    # inside pad_sequences / model.fit.
    raise ValueError(
        f"No training records available ({len(all_data)} record(s) loaded from Box)."
    )

# 3. Unpack features and labels
train_features, train_labels, train_details = unpack_data(train_data)
test_features, test_labels, test_details = unpack_data(test_data)


# 4. Pad every training sequence to a fixed number of timesteps
max_sequence = 60

# Extract raw sequences.
# reshape(-1) rather than squeeze(): squeeze() turns a one-sample sequence into
# a 0-d array, which pad_sequences cannot take the length of.
speed_sequences = [np.asarray(f['speeds']).reshape(-1) for f in train_features]
position_sequences = [f['positions'] for f in train_features]

# Pad speed sequences
X_speed_train = pad_sequences(speed_sequences, maxlen=max_sequence, padding='post', dtype='float32')
X_speed_train = np.expand_dims(X_speed_train, axis=-1)  # shape: (num_samples, max_sequence, 1)

# Pad position sequences (pad each coordinate separately)
X_pos_train_x = pad_sequences([p[:, 0] for p in position_sequences], maxlen=max_sequence, padding='post', dtype='float32')
X_pos_train_y = pad_sequences([p[:, 1] for p in position_sequences], maxlen=max_sequence, padding='post', dtype='float32')
X_pos_train = np.stack([X_pos_train_x, X_pos_train_y], axis=-1)  # shape: (num_samples, max_sequence, 2)


# 5. Encode the labels (fit a new encoder only if none was loaded)
if cl_system.label_encoder is None:
    print("Fitting new label encoder on training data")
    cl_system.label_encoder = LabelEncoder().fit(train_labels)

y_train = cl_system.label_encoder.transform(train_labels)

# 6. Build the model if needed
if cl_system.model is None:
    print("Building a new model for initial training.")
    sequence_length = X_speed_train.shape[1]
    num_classes = len(cl_system.label_encoder.classes_)
    cl_system.model = build_speed_position_model(
        sequence_length=sequence_length,
        num_classes=num_classes
    )

# 7. Train the model
# NOTE(review): the last recorded run of this cell stopped inside fit() with
# "You are passing a RNN mask that does not correspond to right-padded
# sequences, while using cuDNN". The padding above is already 'post', so the
# offending mask presumably comes from timesteps inside the real data that
# equal the model's mask value (e.g. a speed of exactly 0) — TODO confirm
# against build_speed_position_model. The error text itself suggests passing
# use_cudnn=False to the RNN layers created there.
print(f"Training model with {len(train_features)} samples")
cl_system.model.fit(
    [X_speed_train, X_pos_train], y_train,
    epochs=3, batch_size=32, validation_split=0.2
)


# 8. Persist the model and preprocessing artifacts
print("Saving model and artifacts to Google Drive...")
cl_system.save_model()
cl_system.save_scaler()
cl_system.save_label_encoder()
print("✅ Model, scaler, and label encoder updated and saved in:", DRIVE_PATH)

No existing model found. A new model will be built.
No existing scaler found.
No existing label encoder found.
Files loaded: 10
Files loaded: 20
Files loaded: 30
Files loaded: 40
Fitting new label encoder on training data
Building a new model for initial training.
🧠 Model Architecture:


Training model with 13172 samples
Epoch 1/3


InvalidArgumentError: Graph execution error:

Detected at node functional_7_1/lstm_15_1/Assert/Assert defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 499, in process_one

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/tmp/ipython-input-96-2323280157.py", line 50, in <cell line: 0>

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 113, in one_step_on_data

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 57, in train_step

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/layers/layer.py", line 908, in __call__

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/models/functional.py", line 182, in call

  File "/usr/local/lib/python3.11/dist-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/usr/local/lib/python3.11/dist-packages/keras/src/models/functional.py", line 637, in call

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/layers/layer.py", line 908, in __call__

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/layers/rnn/lstm.py", line 584, in call

  File "/usr/local/lib/python3.11/dist-packages/keras/src/layers/rnn/rnn.py", line 402, in call

  File "/usr/local/lib/python3.11/dist-packages/keras/src/layers/rnn/lstm.py", line 551, in inner_loop

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/rnn.py", line 841, in lstm

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/rnn.py", line 874, in _cudnn_lstm

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/rnn.py", line 557, in _assert_valid_mask

assertion failed: [You are passing a RNN mask that does not correspond to right-padded sequences, while using cuDNN, which is not supported. With cuDNN, RNN masks can only be used for right-padding, e.g. `[[True, True, False, False]]` would be a valid mask, but any mask that isn\'t just contiguous `True`\'s on the left and contiguous `False`\'s on the right would be invalid. You can pass `use_cudnn=False` to your RNN layer to stop using cuDNN (this may be slower).]
	 [[{{node functional_7_1/lstm_15_1/Assert/Assert}}]] [Op:__inference_multi_step_on_iterator_182392]

In [None]:
# Build the test-set model inputs and integer-encode the test labels with the
# label encoder held by cl_system (the one used for the training labels).
# NOTE(review): the training cell pads its inputs by hand to max_sequence = 60,
# whereas this cell relies on prepare_speed_position_data — confirm both yield
# the same sequence length, otherwise predict() may reject the test arrays.
X_speed_test, X_pos_test = prepare_speed_position_data(test_features)
y_test = cl_system.label_encoder.transform(test_labels)


In [None]:
# Print the input shape(s) the current model expects.
print(cl_system.model.input_shape)


In [None]:
from sklearn.metrics import f1_score, classification_report

# Predict class probabilities for the test set and keep the most likely class.
y_pred_probs = cl_system.model.predict([X_speed_test, X_pos_test])
y_pred = np.argmax(y_pred_probs, axis=1)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
print("F1 Score (macro):", f1_macro)
print("F1 Score (weighted):", f1_weighted)

# Pass the encoder's full label set explicitly. Without `labels`,
# classification_report infers the classes from y_test / y_pred and raises when
# their count differs from len(target_names), e.g. when a class is missing from
# the test split and from the predictions.
class_names = cl_system.label_encoder.classes_
print("Classification Report:\n", classification_report(
    y_test, y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))
