# Setting variables



In [None]:
import pandas as pd
import warnings
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from geopy.distance import geodesic
import os
from lightgbm import LGBMRegressor
from collections import deque



# Directory holding the AIS CSV files. Set the AIS_DATA_DIR environment variable
# to run the notebook on another machine; the default is the original local path.
DATA_DIR = os.environ.get(
    'AIS_DATA_DIR',
    '/Users/danialbashir/Desktop/Projects/TDT4173-Main-Project-1/Data',
)

# The training file is pipe-delimited; the test file uses pandas' default separator.
ais_train = pd.read_csv(os.path.join(DATA_DIR, 'ais_train.csv'), delimiter='|')
ais_test = pd.read_csv(os.path.join(DATA_DIR, 'ais_test.csv'))

In [None]:
# =========================
# Parameters
# =========================
# Global settings read by the cells below. Later cells append to
# list_of_features in place, so re-run this cell before re-running them.

number_of_days = 5  # 0 if validating. NOTE(review): not referenced by the cells visible here - confirm usage
remove_fraction = 0  # NOTE(review): not referenced by the cells visible here - confirm usage
max_depth = 23  # max_depth passed to every RandomForestRegressor in the training cell
n_estimators = 60  # n_estimators passed to every RandomForestRegressor in the training cell
mode = 'predict'  # or 'validate'
remove_anomalies_speed = True  # Set to True to remove anomalies, False to keep all records
MAX_REALISTIC_SPEED_KNOTS = 30  # Speed threshold for anomalies
trim_train_data = True  # Enables the "Trim train set" step
remove_cog_sog_anomalies=True  # Presumably toggles the out-of-range COG/SOG clean-up - verify where it is consumed

# Lagged coordinate / time-difference columns are generated for lags 1..N+1;
# the feature-engineering cells add lags 1..N to list_of_features.
number_of_lagged_coordinates = 1
# Base feature names. Later cells append the lagged time-difference,
# lagged-coordinate and moving-average feature names to this same list.
list_of_features = ['lat_diff', 'lon_diff', 'lag1_lat_diff', 'lag1_lon_diff']
# Columns to smooth with a per-vessel rolling mean; 'cog' is decomposed into
# 'xcog' / 'ycog' by the feature-engineering cell. Empty disables smoothing.
moving_avg_features = []

# New parameter for rolling window size
# (used for the rolling means and as the length of the prediction-time deques)
rolling_window_size = 5  # You can adjust this as needed


# Split data

In [None]:
import pandas as pd

# Parse timestamps up front so later cells can sort and difference on 'time'.
ais_test['time'] = pd.to_datetime(ais_test['time'])
ais_train['time'] = pd.to_datetime(ais_train['time'])

print('Splitting data...')
if mode == 'predict':
    # Train on the full training file and predict every row of the test file.
    X_val = ais_test.copy()
    X_train = ais_train.copy()
    # TODO (original author): remove all vessels that are not in the
    # neighbourhood of the test set.
else:
    # No other split is defined in this cell. Fail here with a clear message
    # instead of a NameError on X_val / X_train in the next cell.
    raise ValueError(
        f"Unsupported mode {mode!r}: this cell only defines the split for mode='predict'."
    )

print('Data split done')


Splitting data...
Data split done


# Feature engineering

In [15]:
# Start from fresh copies of the split so this cell can be re-run on its own
ais_test = X_val.copy()
ais_train = X_train.copy()

# Silence pandas/sklearn FutureWarnings for the rest of the session
warnings.simplefilter(action='ignore', category=FutureWarning)

# Parse timestamps, then order every vessel's track chronologically
ais_train['time'] = pd.to_datetime(ais_train['time'])
ais_train = ais_train.sort_values(['vesselId', 'time'])

# Coerce position / course / speed to numbers; unparseable entries become NaN
numeric_columns = ['latitude_vessel', 'longitude_vessel', 'cog', 'sog']
for numeric_column in numeric_columns:
    ais_train[numeric_column] = pd.to_numeric(ais_train[numeric_column], errors='coerce')

print(f"Initial training records: {len(ais_train)}")

# =========================
# Anomaly Detection and Interpolation for Latitude/Longitude
# =========================
print(f"Interpolating records with speed_knots > {MAX_REALISTIC_SPEED_KNOTS}...")

# Previous position of the same vessel (NaN on each vessel's first record)
previous_position = ais_train.groupby('vesselId')[['latitude_vessel', 'longitude_vessel']].shift(1)
ais_train['prev_latitude'] = previous_position['latitude_vessel']
ais_train['prev_longitude'] = previous_position['longitude_vessel']

# Function to compute distance between two points
def compute_distance(row):
    """Return the geodesic distance in metres between a row's previous and current position.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide 'prev_latitude', 'prev_longitude', 'latitude_vessel'
        and 'longitude_vessel'.

    Returns
    -------
    float
        Distance in metres, or NaN when any of the four coordinates is missing
        (a vessel's first record, or a coordinate that failed numeric coercion).
    """
    coordinate_keys = ('prev_latitude', 'prev_longitude', 'latitude_vessel', 'longitude_vessel')
    # Check the current position as well as the previous one: geodesic()
    # cannot be evaluated on a missing coordinate.
    if any(pd.isnull(row[key]) for key in coordinate_keys):
        return np.nan

    point1 = (row['prev_latitude'], row['prev_longitude'])
    point2 = (row['latitude_vessel'], row['longitude_vessel'])
    return geodesic(point1, point2).meters

# Calculate distance in meters between consecutive points
# (row-wise geodesic computation; this is the slow step of the cell)
ais_train['distance_meters'] = ais_train.apply(compute_distance, axis=1)

# Calculate time difference in seconds within each 'vesselId'
ais_train['time_diff_seconds'] = ais_train.groupby('vesselId')['time'].diff().dt.total_seconds()
# Avoid division by zero. The result is assigned back to the column: calling
# replace(..., inplace=True) on the selected column is a chained in-place
# update, which does not modify the frame under pandas Copy-on-Write.
ais_train['time_diff_seconds'] = ais_train['time_diff_seconds'].replace(0, np.nan)

# Calculate speed in meters per second and convert to knots
ais_train['speed_m_s'] = ais_train['distance_meters'] / ais_train['time_diff_seconds']
ais_train['speed_knots'] = ais_train['speed_m_s'] * 1.94384

# Flag speed anomalies (NaN speeds compare False, so they are never flagged)
ais_train['speed_anomaly'] = ais_train['speed_knots'] > MAX_REALISTIC_SPEED_KNOTS

# Set anomalies to NaN in 'latitude_vessel' and 'longitude_vessel' so they are
# re-estimated by the interpolation below. Skipped when the
# remove_anomalies_speed parameter is False.
if remove_anomalies_speed:
    ais_train.loc[ais_train['speed_anomaly'], ['latitude_vessel', 'longitude_vessel']] = np.nan

# =========================
# Anomaly Detection and Interpolation for COG/SOG
# =========================
print("Interpolating out-of-range COG and SOG values...")

# Flag COG values outside [0, 360) and SOG values above 30
ais_train['cog_anomaly'] = (ais_train['cog'] >= 360) | (ais_train['cog'] < 0)
ais_train['sog_anomaly'] = ais_train['sog'] > 30

# Set out-of-range COG and SOG values to NaN for interpolation. Skipped when
# the remove_cog_sog_anomalies parameter is False.
if remove_cog_sog_anomalies:
    ais_train.loc[ais_train['cog_anomaly'], 'cog'] = np.nan
    ais_train.loc[ais_train['sog_anomaly'], 'sog'] = np.nan

# =========================
# Interpolation for All Features
# =========================
# Set 'time' as the index: interpolate(method='time') weights by the index
ais_train.set_index('time', inplace=True)

# Interpolate within each 'vesselId' group and fill missing values forward/backward
ais_train[['latitude_vessel', 'longitude_vessel', 'cog', 'sog']] = ais_train.groupby('vesselId')[
    ['latitude_vessel', 'longitude_vessel', 'cog', 'sog']
].transform(lambda group: group.interpolate(method='time').ffill().bfill())

# Reset the index after interpolation
ais_train.reset_index(inplace=True)

# Drop the anomaly flag columns as they're no longer needed
ais_train.drop(columns=['speed_anomaly', 'cog_anomaly', 'sog_anomaly'], inplace=True)

# Drop rows that are still NaN (a vessel whose column is entirely missing
# cannot be filled by interpolation or forward/backward fill)
ais_train.dropna(subset=['latitude_vessel', 'longitude_vessel', 'cog', 'sog'], inplace=True)

print(f"Anomalies interpolated. Remaining training records: {len(ais_train)}")


Initial training records: 1522065
Interpolating records with speed_knots > 30...
Interpolating out-of-range COG and SOG values...
Anomalies interpolated. Remaining training records: 1522065


In [None]:
import seaborn as sns
# Ensure the DataFrames are sorted by 'vesselId' and 'time'
ais_train = ais_train.sort_values(['vesselId', 'time'])
ais_test = ais_test.sort_values(['vesselId', 'time'])

# Time difference between consecutive rows within each 'vesselId'
ais_train['time_diff_seconds'] = ais_train.groupby('vesselId')['time'].diff().dt.total_seconds()

# Lagged time-difference columns for lags 1..N+1. Lag N+1 is not a model
# feature itself; it is the denominator of lat_diff / lon_diff.
for i in range(1, number_of_lagged_coordinates + 2):
    if i == 1:
        # For lag 1, use the current 'time_diff_seconds' without shifting
        ais_train[f'prev{i}_time_diff_seconds'] = ais_train['time_diff_seconds']
    else:
        # For lags > 1, shift 'time_diff_seconds' by (i - 1) within the vessel
        ais_train[f'prev{i}_time_diff_seconds'] = ais_train.groupby('vesselId')['time_diff_seconds'].shift(i - 1)
    # Test rows are filled in row by row during sequential prediction
    ais_test[f'prev{i}_time_diff_seconds'] = np.nan

# Register lags 1..N as model features. The membership check keeps the list
# free of duplicates when this cell is re-run without resetting the parameters.
for i in range(1, number_of_lagged_coordinates + 1):
    time_diff_feature = f'prev{i}_time_diff_seconds'
    if time_diff_feature not in list_of_features:
        list_of_features.append(time_diff_feature)

# Drop the intermediate 'time_diff_seconds' column
ais_train.drop(columns=['time_diff_seconds'], inplace=True)


# Generate lagged coordinates for lags 1..N+1 (N = number_of_lagged_coordinates).
# Lag N+1 is only needed to compute lat_diff / lon_diff.
for i in range(1, number_of_lagged_coordinates + 2):
    ais_train[f'prev{i}_longitude'] = ais_train.groupby('vesselId')['longitude_vessel'].shift(i)
    ais_train[f'prev{i}_latitude'] = ais_train.groupby('vesselId')['latitude_vessel'].shift(i)
    # Test rows are filled in row by row during sequential prediction
    ais_test[f'prev{i}_longitude'] = np.nan
    ais_test[f'prev{i}_latitude'] = np.nan

# Register lags 1..N as model features. The membership check keeps the list
# free of duplicates when this cell is re-run without resetting the parameters.
for i in range(1, number_of_lagged_coordinates + 1):
    for coordinate_feature in (f'prev{i}_longitude', f'prev{i}_latitude'):
        if coordinate_feature not in list_of_features:
            list_of_features.append(coordinate_feature)
    

        
# Per-vessel rolling means for every entry of moving_avg_features.
# Iterate over a copy because the 'cog' branch edits the list itself.
if moving_avg_features:
    for feature in moving_avg_features.copy():
        if feature == 'cog':
            # Decompose the course angle into cosine / sine components before
            # averaging (vectorized over the whole column).
            cog_radians = np.radians(ais_train['cog'])
            ais_train['xcog'] = np.cos(cog_radians)
            ais_train['ycog'] = np.sin(cog_radians)
            ais_train['xcog_moving_avg'] = ais_train.groupby('vesselId')['xcog'].transform(lambda x: x.rolling(rolling_window_size, min_periods=1).mean())
            ais_train['ycog_moving_avg'] = ais_train.groupby('vesselId')['ycog'].transform(lambda x: x.rolling(rolling_window_size, min_periods=1).mean())
            ais_test['xcog_moving_avg'] = np.nan
            ais_test['ycog_moving_avg'] = np.nan
            # From here on the two components replace 'cog' in the list
            moving_avg_features.remove('cog')
            for component in ('xcog', 'ycog'):
                if component not in moving_avg_features:
                    moving_avg_features.append(component)
            new_feature_columns = ['xcog_moving_avg', 'ycog_moving_avg']
        else:
            ais_train[f'{feature}_moving_avg'] = ais_train.groupby('vesselId')[feature].transform(lambda x: x.rolling(rolling_window_size, min_periods=1).mean())
            ais_test[f'{feature}_moving_avg'] = np.nan
            new_feature_columns = [f'{feature}_moving_avg']

        # Register the new columns once; a re-run of this cell must not
        # duplicate feature names (and therefore model input columns).
        for new_feature_column in new_feature_columns:
            if new_feature_column not in list_of_features:
                list_of_features.append(new_feature_column)

# Drop rows with NaN values due to shift (each vessel's first two records)
ais_train.dropna(subset=['prev1_longitude', 'prev1_latitude', 'prev2_longitude', 'prev2_latitude'], inplace=True)

# Materialise every requested feature. Iteration follows list order, so
# 'lat_diff' / 'lon_diff' must be listed before their 'lagN_*' variants.
# Test-set values that depend on earlier positions are left NaN here and
# filled in during sequential prediction.
for feature in list_of_features.copy():
    if feature == 'weekday':
        ais_train['weekday'] = ais_train['time'].dt.weekday.astype(int)
        ais_test['weekday'] = ais_test['time'].dt.weekday.astype(int)
    elif feature == 'month':
        ais_train['month'] = ais_train['time'].dt.month
        ais_test['month'] = ais_test['time'].dt.month
    elif feature == 'hour':
        ais_train['hour'] = ais_train['time'].dt.hour
        ais_test['hour'] = ais_test['time'].dt.hour
    elif feature == 'total_seconds':
        # Seconds since the earliest timestamp in either set, so train and
        # test share the same reference point
        reference_time = min(ais_train['time'].min(), ais_test['time'].min())
        ais_train['total_seconds'] = (ais_train['time'] - reference_time).dt.total_seconds()
        ais_test['total_seconds'] = (ais_test['time'] - reference_time).dt.total_seconds()
    elif feature == 'day_of_year':
        ais_train['day_of_year'] = ais_train['time'].dt.dayofyear
        ais_test['day_of_year'] = ais_test['time'].dt.dayofyear
    elif feature == 'lon_diff':
        # Previous step's longitude change, rescaled from the previous time
        # gap to the current one. A zero previous gap yields +/-inf, which
        # dropna() would keep; convert it to NaN so the row is dropped later.
        ais_train['lon_diff'] = (
            (ais_train['prev1_longitude'] - ais_train['prev2_longitude'])
            / ais_train['prev2_time_diff_seconds'] * ais_train['prev1_time_diff_seconds']
        ).replace([np.inf, -np.inf], np.nan)
        ais_test['lon_diff'] = np.nan
    elif feature == 'lat_diff':
        # Same extrapolation for latitude, with the same inf -> NaN guard
        ais_train['lat_diff'] = (
            ((ais_train['prev1_latitude'] - ais_train['prev2_latitude'])
             / ais_train['prev2_time_diff_seconds']) * ais_train['prev1_time_diff_seconds']
        ).replace([np.inf, -np.inf], np.nan)
        ais_test['lat_diff'] = np.nan
    elif feature == 'distance':
        # Geodesic length in metres of the previous step (prev2 -> prev1)
        ais_train['distance'] = ais_train.apply(
        lambda x: geodesic(
            (x['prev2_latitude'], x['prev2_longitude']),
            (x['prev1_latitude'], x['prev1_longitude'])
            ).meters,
            axis=1)
        ais_test['distance'] = np.nan

    elif feature.startswith('lag') and ('lat_diff' in feature or 'lon_diff' in feature):
        # 'lag<k>_lat_diff' / 'lag<k>_lon_diff': the diff feature shifted k rows within the vessel
        lag_num = int(feature.split('_')[0][3:])
        diff_type = 'lat_diff' if 'lat_diff' in feature else 'lon_diff'
        ais_train[feature] = ais_train.groupby('vesselId')[diff_type].shift(lag_num)
        ais_test[feature] = ais_test.groupby('vesselId')[diff_type].shift(lag_num)
    
# Haversine distance between each record and the vessel's previous position,
# using an Earth radius of 6371000
half_delta_lat = np.radians((ais_train['latitude_vessel'] - ais_train['prev1_latitude']) / 2)
half_delta_lon = np.radians((ais_train['longitude_vessel'] - ais_train['prev1_longitude']) / 2)
haversine_term = (
    np.sin(half_delta_lat)**2 +
    np.cos(np.radians(ais_train['prev1_latitude'])) * np.cos(np.radians(ais_train['latitude_vessel'])) *
    np.sin(half_delta_lon)**2
)
ais_train['haversine_distance'] = 6371000 * 2 * np.arcsin(np.sqrt(haversine_term))

# Bin edges for the clustering: <=3.75e6, 3.75e6-1.1e7, >1.1e7. Found through EDA.
bins = [-np.inf, 0.375e7, 1.1e7, np.inf]
labels = ['cluster_distance_0', 'cluster_distance_1', 'cluster_distance_2']

# Largest single-step haversine distance observed for each vessel
max_haversine = ais_train.groupby('vesselId')['haversine_distance'].max()

# Bin that maximum into cluster numbers 0 / 1 / 2
cluster_labels = pd.cut(max_haversine, bins=bins, labels=[0, 1, 2])

# vesselId -> cluster number; also used when predicting
cluster_mapping = cluster_labels.to_dict()

# Attach the cluster to both sets (vessels missing from the mapping get NaN)
for frame in (ais_train, ais_test):
    frame['cluster_distance'] = frame['vesselId'].map(cluster_mapping)

# Number of training rows per cluster
print(ais_train['cluster_distance'].value_counts())
# Number of distinct vessels per cluster
print(ais_train.groupby('cluster_distance')['vesselId'].nunique())

# Keep only training rows where every model feature is available
ais_train = ais_train.dropna(subset=list_of_features)

# Restore the per-vessel chronological order in both sets
ais_train = ais_train.sort_values(by=['vesselId', 'time'])
ais_test = ais_test.sort_values(by=['vesselId', 'time'])




cluster_distance
1    666176
0    660716
2    193798
Name: count, dtype: int64
cluster_distance
0    159
1    378
2    150
Name: vesselId, dtype: int64


# Trim train set

In [17]:

if trim_train_data:
    # Row count per vessel; the median is printed for information only
    rows_per_vessel = ais_train.groupby('vesselId').size()
    median_rows_per_vessel = rows_per_vessel.median()
    print(median_rows_per_vessel)

    # Vessels with fewer than 100 training rows
    vessels_with_few_rows = set(rows_per_vessel.loc[rows_per_vessel < 100].index)

    # Vessels that appear in the training set but never in the test set
    vessels_not_in_test = set(ais_train['vesselId']).difference(set(ais_test['vesselId']))

    # Only vessels meeting both conditions are removed from the training set
    vessels_to_remove = vessels_with_few_rows.intersection(vessels_not_in_test)
    keep_rows = ~ais_train['vesselId'].isin(vessels_to_remove)
    ais_train = ais_train[keep_rows]

    # Sanity check: count of negative time differences
    print(len(ais_train[ais_train['prev1_time_diff_seconds'] < 0]))

if mode == 'validate':
    # Test vessels without any training rows cannot be predicted; drop them
    has_training_rows = ais_test['vesselId'].isin(ais_train['vesselId'])
    ais_test = ais_test[has_training_rows]
    


# One {vessel_id: deque} table per smoothed column
moving_avg_dicts = {f"{feature}_moving_avg": {} for feature in moving_avg_features}

# Seed each vessel's window with its last rolling_window_size training values
for vessel_id, group in ais_train.groupby('vesselId'):
    chronological = group.sort_values(by='time')
    for feature in moving_avg_features:
        avg_column = f"{feature}_moving_avg"
        recent_values = chronological[avg_column].tail(rolling_window_size)
        moving_avg_dicts[avg_column][vessel_id] = deque(recent_values, maxlen=rolling_window_size)
        
        
def update_moving_averages(vessel_id, feature, new_value):
    """Push new_value into the vessel's window for feature and return the window mean.

    The window is the deque stored at
    moving_avg_dicts[f"{feature}_moving_avg"][vessel_id]; appending to a full
    deque discards its oldest entry.
    """
    window = moving_avg_dicts[f"{feature}_moving_avg"][vessel_id]
    window.append(new_value)
    return np.mean(window)
    

1707.0
0


# Define training features

# Train model

In [18]:
print('Training models...')

# Fitted models keyed by cluster number
models_long = {}
models_lat = {}

# Clusters whose rows are pooled to train each cluster's models:
# cluster 0 uses only its own rows, cluster 1 adds cluster 0, cluster 2 uses all three.
training_pool_by_cluster = {0: [0], 1: [0, 1], 2: [0, 1, 2]}

# Hyper-parameters shared by every forest
forest_params = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)

for cluster in [0, 1, 2]:
    print(f'Processing cluster {cluster}...')
    pooled_rows = ais_train[ais_train['cluster_distance'].isin(training_pool_by_cluster[cluster])]
    X_cluster = pooled_rows[list_of_features]
    y_long_cluster = pooled_rows['longitude_vessel']
    y_lat_cluster = pooled_rows['latitude_vessel']

    if X_cluster.empty:
        print(f"No data for cluster {cluster}. Skipping model training for this cluster.")
        continue

    print(f'Training longitude model for cluster {cluster}...')
    models_long[cluster] = RandomForestRegressor(**forest_params).fit(X_cluster, y_long_cluster)

    print(f'Training latitude model for cluster {cluster}...')
    models_lat[cluster] = RandomForestRegressor(**forest_params).fit(X_cluster, y_lat_cluster)

    # Report feature importances: longitude model first, then latitude
    for target_name, fitted_model in (('longitude', models_long[cluster]), ('latitude', models_lat[cluster])):
        print(f'Feature importance for {target_name} model (Cluster {cluster}):')
        for feature, importance in zip(list_of_features, fitted_model.feature_importances_):
            print(f'{feature}: {importance:.4f}')

print('Model training complete.')


Training models...
Processing cluster 0...
Training longitude model for cluster 0...
Training latitude model for cluster 0...
Feature importance for longitude model (Cluster 0):
lat_diff: 0.0000
lon_diff: 0.0002
lag1_lat_diff: 0.0000
lag1_lon_diff: 0.0000
prev1_time_diff_seconds: 0.0000
prev1_longitude: 0.9965
prev1_latitude: 0.0032
Feature importance for latitude model (Cluster 0):
lat_diff: 0.0006
lon_diff: 0.0003
lag1_lat_diff: 0.0000
lag1_lon_diff: 0.0000
prev1_time_diff_seconds: 0.0003
prev1_longitude: 0.0002
prev1_latitude: 0.9986
Processing cluster 1...
Training longitude model for cluster 1...
Training latitude model for cluster 1...
Feature importance for longitude model (Cluster 1):
lat_diff: 0.0002
lon_diff: 0.0020
lag1_lat_diff: 0.0001
lag1_lon_diff: 0.0000
prev1_time_diff_seconds: 0.0018
prev1_longitude: 0.9957
prev1_latitude: 0.0002
Feature importance for latitude model (Cluster 1):
lat_diff: 0.0013
lon_diff: 0.0004
lag1_lat_diff: 0.0001
lag1_lon_diff: 0.0001
prev1_time_d

#  Predict



In [20]:
# Sequential (autoregressive) prediction: every test row of a vessel is
# predicted from the previous known or predicted position of that vessel.
# Rows that cannot be predicted keep NaN in the predicted_* columns.

# Initialize the prediction set from ais_test
prediction_data = ais_test.copy()
prediction_data['predicted_longitude'] = np.nan
prediction_data['predicted_latitude'] = np.nan

# Set up groups for efficient lookup by vessel
train_set_sorted = ais_train.sort_values(by=['vesselId', 'time'])
ais_train_groups = train_set_sorted.groupby('vesselId')
# Vessels that still have at least one training row after cleaning / trimming
vessels_with_history = set(ais_train_groups.groups)

# Initialize counters for progress tracking
total_rows = len(prediction_data)
progress_interval = max(total_rows // 100, 1)  # Calculate for 1% intervals, minimum 1
processed_rows = 0  # Counts rows that were actually predicted

# Set model to use a single thread for faster individual row processing
for model in models_long.values():
    model.set_params(n_jobs=1)
for model in models_lat.values():
    model.set_params(n_jobs=1)

# Loop through each vessel to perform sequential predictions
for vessel_id, group in prediction_data.groupby('vesselId'):
    # Cluster and models are fixed per vessel, so look them up once
    cluster_num = cluster_mapping.get(vessel_id)
    model_long_cluster = models_long.get(cluster_num)
    model_lat_cluster = models_lat.get(cluster_num)

    # Without training history or a trained model there is nothing to start
    # from. Skip the vessel (predictions stay NaN) rather than aborting the
    # whole run with a KeyError / AttributeError.
    if vessel_id not in vessels_with_history or model_long_cluster is None or model_lat_cluster is None:
        print(f"No training history or model for vessel {vessel_id}. Skipping prediction.")
        continue

    # Get past data for this vessel
    past_data = ais_train_groups.get_group(vessel_id)
    # Start from the vessel's last training record
    last_known_data = past_data.iloc[-1].copy()

    last_known_data['predicted_longitude'] = last_known_data['longitude_vessel']
    last_known_data['predicted_latitude'] = last_known_data['latitude_vessel']

    # Iterate over each row for this vessel in prediction_data
    for idx, (index, row) in enumerate(group.iterrows()):
        dict_avgs = {}

        for feature in moving_avg_features:
            if feature == 'latitude_vessel':
                dict_avgs[feature] = update_moving_averages(
                    vessel_id,
                    feature,
                    last_known_data['predicted_latitude']
                )
            elif feature == 'longitude_vessel':
                dict_avgs[feature] = update_moving_averages(
                    vessel_id,
                    feature,
                    last_known_data['predicted_longitude']
                )
            else:
                dict_avgs[feature] = update_moving_averages(
                    vessel_id,
                    feature,
                    last_known_data[f'{feature}_moving_avg']
                )

        # Assign moving averages to the current row
        for feature in moving_avg_features:
            prediction_data.at[index, f'{feature}_moving_avg'] = dict_avgs[feature]

        # Update lagged time difference dynamically
        time_diff = (row['time'] - last_known_data['time']).total_seconds()

        prediction_data.at[index, 'prev1_time_diff_seconds'] = time_diff
        prediction_data.at[index, 'prev2_time_diff_seconds'] = last_known_data['prev1_time_diff_seconds']

        # Dynamically update lagged coordinates
        prediction_data.at[index, 'prev1_longitude'] = last_known_data['predicted_longitude']
        prediction_data.at[index, 'prev1_latitude'] = last_known_data['predicted_latitude']
        prediction_data.at[index, 'prev2_latitude'] = last_known_data['prev1_latitude']
        prediction_data.at[index, 'prev2_longitude'] = last_known_data['prev1_longitude']

        # Last step's displacement rescaled to the current time gap
        # (both lat_diff and lon_diff are written under this one check)
        if 'lat_diff' in list_of_features:
            prediction_data.at[index, 'lat_diff'] = (((last_known_data['predicted_latitude'] - last_known_data['prev1_latitude']) / last_known_data['prev1_time_diff_seconds']) * time_diff)
            prediction_data.at[index, 'lon_diff'] = (((last_known_data['predicted_longitude'] - last_known_data['prev1_longitude']) / last_known_data['prev1_time_diff_seconds']) * time_diff)

        if 'distance' in list_of_features:
            prediction_data.at[index, 'distance'] = geodesic(
                (last_known_data['prev1_latitude'], last_known_data['prev1_longitude']),
                (last_known_data['predicted_latitude'], last_known_data['predicted_longitude'])
            ).meters

        if 'xcogxtime' in list_of_features:
            prediction_data.at[index, 'xcogxtime'] = prediction_data.at[index, 'xcog_moving_avg'] * time_diff

        if 'ycogxtime' in list_of_features:
            prediction_data.at[index, 'ycogxtime'] = prediction_data.at[index, 'ycog_moving_avg'] * time_diff

        if 'sogxtime' in list_of_features:
            prediction_data.at[index, 'sogxtime'] = prediction_data.at[index, 'sog_moving_avg'] * time_diff

        # Update multiple lag features
        for feature in list_of_features:
            if feature.startswith('lag') and ('lat_diff' in feature or 'lon_diff' in feature):
                lag_num = int(feature.split('_')[0][3:])
                diff_type = 'lat_diff' if 'lat_diff' in feature else 'lon_diff'
                if lag_num == 1:
                    prediction_data.at[index, feature] = last_known_data[diff_type]
                else:
                    prev_lag_feature = f'lag{lag_num - 1}_{diff_type}'
                    prediction_data.at[index, feature] = last_known_data.get(prev_lag_feature, np.nan)

        if number_of_lagged_coordinates > 1:
            # Update higher-order lag features based on latest predictions
            for i in range(2, number_of_lagged_coordinates + 1):
                # Shift previous predictions back for each lag
                prediction_data.at[index, f'prev{i}_longitude'] = last_known_data.get(f'prev{i-1}_longitude', np.nan)
                prediction_data.at[index, f'prev{i}_latitude'] = last_known_data.get(f'prev{i-1}_latitude', np.nan)
                prediction_data.at[index, f'prev{i}_time_diff_seconds'] = last_known_data.get(f'prev{i-1}_time_diff_seconds', np.nan)

        features = prediction_data.loc[index, list_of_features].to_frame().T

        # Skip rows whose features are missing or infinite (for example after
        # a zero time gap): the model cannot predict on non-finite input.
        # The row keeps NaN predictions and last_known_data is not advanced.
        if features.isnull().any().any() or np.isinf(features.to_numpy(dtype=float)).any():
            print(f"Invalid features at index {index}. Skipping prediction.")
            continue

        # Make predictions for the current row
        pred_long = model_long_cluster.predict(features)[0]
        pred_lat = model_lat_cluster.predict(features)[0]

        # Store predictions
        prediction_data.at[index, 'predicted_longitude'] = pred_long
        prediction_data.at[index, 'predicted_latitude'] = pred_lat

        # Update last known data with current prediction
        last_known_data = prediction_data.loc[index].copy()
        last_known_data['time'] = row['time']

        # Handling lag_lat_diff and lag_lon_diff. last_known_data was copied
        # above, so this only changes the values stored in prediction_data,
        # not the state carried into the next row.
        for feature in list_of_features:
            if feature.startswith('lag') and 'lat_diff' in feature:
                prediction_data.at[index, feature] = last_known_data['lat_diff']
            if feature.startswith('lag') and 'lon_diff' in feature:
                prediction_data.at[index, feature] = last_known_data['lon_diff']

        # Increment the counter for processed rows and print progress
        processed_rows += 1
        if processed_rows % progress_interval == 0:
            percent_complete = int((processed_rows / total_rows) * 100)
            print(f"Processing: {percent_complete}% complete.")

print("Processing: 100% complete.")



Processing: 0% complete.
Processing: 1% complete.
Processing: 2% complete.
Processing: 3% complete.
Processing: 4% complete.
Processing: 5% complete.
Processing: 6% complete.
Processing: 7% complete.
Processing: 8% complete.
Processing: 9% complete.
Processing: 10% complete.
Processing: 11% complete.
Processing: 12% complete.
Processing: 13% complete.
Processing: 14% complete.
Processing: 15% complete.
Processing: 16% complete.
Processing: 17% complete.
Processing: 18% complete.
Processing: 19% complete.
Processing: 20% complete.
Processing: 21% complete.
Processing: 22% complete.
Processing: 23% complete.
Processing: 24% complete.
Processing: 25% complete.
Processing: 26% complete.
Processing: 27% complete.
Processing: 28% complete.
Processing: 29% complete.
Processing: 30% complete.
Processing: 31% complete.
Processing: 32% complete.
Processing: 33% complete.
Processing: 34% complete.
Processing: 35% complete.
Processing: 36% complete.
Processing: 37% complete.
Processing: 38% comple

In [None]:

# Outputs: written only in predict mode
if mode == 'predict':
    # Expose the predictions under the column names used in the submission file
    for coordinate in ('longitude', 'latitude'):
        prediction_data[f'{coordinate}_predicted'] = prediction_data[f'predicted_{coordinate}']

    # Full frame (all engineered features plus predictions)
    detailed_predictions = prediction_data
    detailed_predictions.to_csv('Detailed_Predictions.csv', index=False)
    print("Detailed predictions saved to Detailed_Predictions.csv.")

    # Submission file: ID plus the two predicted coordinates
    submission_columns = ['ID', 'longitude_predicted', 'latitude_predicted']
    submission_predictions = prediction_data[submission_columns]
    submission_predictions.to_csv('Submission.csv', index=False)
    print("Submission predictions saved to Submission.csv.")
    


Detailed predictions saved to Detailed_Predictions.csv.
Submission predictions saved to Submission.csv.
