# **WalkSense: Accelerating Human Mobility Prediction**

## **Loading Data**

Mounting Google Drive at /content/drive

In [None]:
from google.colab import drive
# Mount Google Drive so the files under MyDrive can be read by path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Importing pandas
import pandas as pd

# Reading the ".plt" files into pandas DataFrames.
# The first six lines of every file are skipped and the remaining rows are
# parsed as header-less CSV using the column names below.
plt_columns = ['Latitude', 'Longitude', 'Zero', 'Altitude','Days_Elapsed','Date','Time']
plt_paths = [
    '/content/drive/MyDrive/USTPE Project/Data/010/Trajectory/20070828171302.plt',
    '/content/drive/MyDrive/USTPE Project/Data/010/Trajectory/20070804033032.plt',
    '/content/drive/MyDrive/USTPE Project/Data/020/Trajectory/20110825143825.plt',
    '/content/drive/MyDrive/USTPE Project/Data/020/Trajectory/20110828043331.plt',
]
df11, df12, df21, df22 = [
    pd.read_csv(plt_path, skiprows=6, header=None, names=plt_columns)
    for plt_path in plt_paths
]

In [None]:
# Preview the first five rows of the first trajectory.
df11.head(n=5)

Unnamed: 0,Latitude,Longitude,Zero,Altitude,Days_Elapsed,Date,Time
0,39.900917,116.420018,0,500,39322.717384,2007-08-28,17:13:02
1,39.900923,116.420042,0,500,39322.717396,2007-08-28,17:13:03
2,39.90095,116.420138,0,500,39322.717419,2007-08-28,17:13:05
3,39.90096,116.420252,0,500,39322.717442,2007-08-28,17:13:07
4,39.900962,116.420263,0,500,39322.717454,2007-08-28,17:13:08


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the first trajectory.
df11.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Latitude,Longitude,Zero,Altitude,Days_Elapsed
count,2903.0,2903.0,2903.0,2903.0,2903.0
mean,39.472944,116.78761,0.0,84.364451,39322.743018
std,0.266158,0.361608,0.0,127.43826,0.014008
min,39.118588,116.268102,0.0,0.0,39322.717384
25%,39.195226,116.424026,0.0,11.0,39322.730897
50%,39.455523,116.800987,0.0,28.0,39322.74338
75%,39.705938,117.18075,0.0,116.0,39322.756019
max,39.901542,117.24275,0.0,500.0,39322.765081


In [None]:
import folium
from folium.plugins import HeatMap, FeatureGroupSubGroup

# Create a base map centred on the mean position of the first trajectory
m = folium.Map(location=[df11['Latitude'].mean(), df11['Longitude'].mean()], zoom_start=10)

# One heatmap layer per trajectory. Each HeatMap is given its own layer name so
# that LayerControl lists it and can switch it on and off; previously the
# heatmaps were attached directly to the map while the named sub-groups shown
# in the control stayed empty, so the checkboxes had no effect.
trajectory_layers = {
    'Map 1': df11,
    'Map 2': df12,
    'Map 3': df21,
    'Map 4': df22,
}
for layer_name, trajectory_df in trajectory_layers.items():
    # HeatMap expects a list of [latitude, longitude] pairs
    coordinates = trajectory_df[['Latitude', 'Longitude']].values.tolist()
    HeatMap(coordinates, name=layer_name).add_to(m)

# Add LayerControl (after all layers) to control the visibility of each heatmap
folium.LayerControl(collapsed=False).add_to(m)

# Display the map
m

In [None]:
import os
import pandas as pd
import shutil
import math

# Project root on the mounted drive; both data folders live directly under it
project_root = '/content/drive/MyDrive/USTPE Project'

# Input: raw trajectory folders are read from here
directory = project_root + '/Data'

# Output: refined (downsampled + derived features) files are written here
output_directory = project_root + '/Refined Data'

def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """
    Calculate the great-circle distance between two points on a sphere
    using the Haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere. Defaults to 6371, the radius of the Earth in
        kilometers.

    Returns
    -------
    float
        Distance between the two points, in the unit of ``radius_km``
        (kilometers by default).
    """
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = math.sin(dlat/2) ** 2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon/2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius_km * c

def calculate_speed(df):
    """
    Add per-record 'Distance' and 'Speed' columns to a trajectory DataFrame.

    For each row i (except the last) the distance is the haversine distance in
    kilometers from point i to point i+1, and the speed is that distance
    divided by the number of seconds between the two timestamps (km/s).
    The last row has no successor and gets 0 for both columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Latitude' and 'Longitude' (decimal degrees) and
        'Timestamp' (datetime) columns. Rows are paired by position, in their
        existing order. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'Distance' and 'Speed' added. An empty
        frame gets two empty columns. When two consecutive rows share the same
        timestamp the speed is undefined and is stored as NaN.
    """
    # Plain lists give positional access, so the pairing does not depend on
    # the frame having a 0..n-1 index.
    latitudes = df['Latitude'].tolist()
    longitudes = df['Longitude'].tolist()
    timestamps = df['Timestamp'].tolist()

    speeds = []
    distances = []

    for i in range(len(df) - 1):
        distance = haversine(latitudes[i], longitudes[i], latitudes[i + 1], longitudes[i + 1])
        time_diff = (timestamps[i + 1] - timestamps[i]).total_seconds()

        # Avoid ZeroDivisionError on duplicate timestamps
        speed = distance / time_diff if time_diff != 0 else float('nan')

        speeds.append(speed)
        distances.append(distance)

    if len(df) > 0:
        speeds.append(0)  # Assuming speed at the last point is 0
        distances.append(0)  # Assuming distance at the last point is 0

    df['Distance'] = pd.Series(distances, index=df.index, dtype='float64')
    df['Speed'] = pd.Series(speeds, index=df.index, dtype='float64')

    return df

def calculate_acceleration(df):
    """
    Add a per-record 'Acceleration' column to a trajectory DataFrame.

    For each row i (except the last) the acceleration is the change in 'Speed'
    from row i to row i+1 divided by the number of seconds between the two
    timestamps. The last row has no successor and gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Speed' and 'Timestamp' (datetime) columns. Rows are
        paired by position, in their existing order. The frame is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'Acceleration' added. An empty frame gets
        an empty column. When two consecutive rows share the same timestamp
        the acceleration is undefined and is stored as NaN.
    """
    # Plain lists give positional access, so the pairing does not depend on
    # the frame having a 0..n-1 index.
    speeds = df['Speed'].tolist()
    timestamps = df['Timestamp'].tolist()

    accelerations = []

    for i in range(len(df) - 1):
        time_diff = (timestamps[i + 1] - timestamps[i]).total_seconds()

        # Avoid ZeroDivisionError on duplicate timestamps
        if time_diff != 0:
            acceleration = (speeds[i + 1] - speeds[i]) / time_diff
        else:
            acceleration = float('nan')
        accelerations.append(acceleration)

    if len(df) > 0:
        accelerations.append(0)  # Assuming acceleration at the last point is 0

    df['Acceleration'] = pd.Series(accelerations, index=df.index, dtype='float64')

    return df

# Refine every top-level folder that contains a labels.txt: each .plt file in
# its sub-folders is downsampled to one record per minute, enriched with
# distance / speed / acceleration, and written (under its original file name,
# as a CSV with a header row) to output_directory/<folder>. The folder's
# labels.txt is copied alongside.
# Listings are sorted because os.listdir returns entries in arbitrary order.
for folder in sorted(os.listdir(directory)):
    folder_path = os.path.join(directory, folder)
    labels_file = os.path.join(folder_path, 'labels.txt')

    # Only directories that ship a labels.txt are processed
    if not os.path.isdir(folder_path) or not os.path.exists(labels_file):
        continue

    output_folder_path = os.path.join(output_directory, folder)
    output_labels_path = os.path.join(output_folder_path, 'labels.txt')
    labels_copied = False  # copy labels.txt once per folder, not once per file

    # Iterate over all trajectory sub-folders in the folder
    for trajectory_folder in sorted(os.listdir(folder_path)):
        trajectory_folder_path = os.path.join(folder_path, trajectory_folder)
        if not os.path.isdir(trajectory_folder_path):
            continue

        # Iterate over all .plt files inside the trajectory folder
        for file in sorted(os.listdir(trajectory_folder_path)):
            if not file.endswith('.plt'):
                continue
            file_path = os.path.join(trajectory_folder_path, file)

            # Load the data into a DataFrame (the first six lines are skipped)
            df = pd.read_csv(file_path, skiprows=6, header=None, names=['Latitude', 'Longitude', 'Zero', 'Altitude', 'Days_Elapsed', 'Date', 'Time'])

            # Remove the 'Zero' and 'Days_Elapsed' columns
            df = df.drop(['Zero', 'Days_Elapsed'], axis=1)

            # Build a datetime column from the separate date and time strings
            df['Timestamp'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])

            # Group the data by minute intervals and select the first non-null row of each interval;
            # minutes without any record are dropped
            reduced_df = df.groupby(pd.Grouper(key='Timestamp', freq='1min')).first().dropna().reset_index(drop=False)

            # Nothing to derive or save for a file without usable records
            if reduced_df.empty:
                continue

            # Calculate speed and acceleration per record
            reduced_df_with_speed = calculate_speed(reduced_df)
            reduced_df_with_speed_and_acceleration = calculate_acceleration(reduced_df_with_speed)

            # Save the reduced DataFrame with speed and acceleration to a new file
            os.makedirs(output_folder_path, exist_ok=True)
            output_file_path = os.path.join(output_folder_path, file)
            reduced_df_with_speed_and_acceleration.to_csv(output_file_path, index=False)

            # Copy the labels.txt file to the new folder (first written file only)
            if not labels_copied:
                shutil.copy(labels_file, output_labels_path)
                labels_copied = True

In [None]:
# Experiment

import pandas as pd
import os
import numpy as np

encoded_data = []
labels_df = pd.read_csv('/content/drive/MyDrive/USTPE Project/Refined Data/179/labels.txt', delimiter='\t')

# The refined file is a regular CSV with a single header row (it is written
# with DataFrame.to_csv by the refining cell), so only that one row is replaced
# by `names`. Skipping six rows, as for the raw .plt files, would silently drop
# the first five records.
df = pd.read_csv('/content/drive/MyDrive/USTPE Project/Refined Data/179/20080820120904.plt', header=0, names=['Timestamp', 'Latitude', 'Longitude', 'Altitude', 'Date', 'Time', 'Distance', 'Speed', 'Acceleration'])

# Encode the labels based on time ranges
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Encoded_Mode'] = ''  # Default mode is blank

# Every record whose timestamp falls inside a labelled interval (inclusive on
# both ends) is tagged 'walk' or 'transport'; later intervals overwrite earlier ones.
for _, label_row in labels_df.iterrows():
    start_time = pd.to_datetime(label_row['Start Time'])
    end_time = pd.to_datetime(label_row['End Time'])
    mask = (start_time <= df['Timestamp']) & (df['Timestamp'] <= end_time)
    df.loc[mask, 'Encoded_Mode'] = 'walk' if label_row['Transportation Mode'] == 'walk' else 'transport'

# Remove rows with blank Encoded_Mode (records outside every labelled interval)
df = df[df['Encoded_Mode'] != '']

encoded_data.append(df)

# Concatenate all the encoded dataframes
combined_encoded_data = pd.concat(encoded_data)

print(combined_encoded_data.to_string())

             Timestamp   Latitude   Longitude  Altitude        Date      Time  Distance     Speed  Acceleration Encoded_Mode
0  2008-08-20 12:14:00  39.974810  116.333226     179.0  2008-08-20  12:14:01  0.074115  0.001235 -8.691202e-06         walk
1  2008-08-20 12:15:00  39.975195  116.333936     165.0  2008-08-20  12:15:01  0.042826  0.000714  9.922603e-06         walk
2  2008-08-20 12:16:00  39.975047  116.334400     175.0  2008-08-20  12:16:01  0.078548  0.001309 -6.134732e-06         walk
3  2008-08-20 12:17:00  39.975387  116.333592     162.0  2008-08-20  12:17:01  0.056463  0.000941  6.623209e-07         walk
4  2008-08-20 12:18:00  39.975453  116.332935     149.0  2008-08-20  12:18:01  0.058847  0.000981 -1.081932e-06         walk
5  2008-08-20 12:19:00  39.975392  116.332249     137.0  2008-08-20  12:19:01  0.054952  0.000916 -8.151334e-06         walk
6  2008-08-20 12:20:00  39.975418  116.331605     151.0  2008-08-20  12:20:01  0.076821  0.000427  8.564261e-06         walk


In [None]:
import os
import pandas as pd

base_folder = '/content/drive/MyDrive/USTPE Project/Refined Data'
encoded_data = []

# Listings are sorted because os.listdir returns entries in arbitrary order,
# and the row order of the combined frame determines the cross-validation folds.
for folder_name in sorted(os.listdir(base_folder)):
    folder_path = os.path.join(base_folder, folder_name)
    labels_file = os.path.join(folder_path, 'labels.txt')

    if not os.path.isfile(labels_file):
        continue

    labels_df = pd.read_csv(labels_file, delimiter='\t')

    # Parse the labelled intervals once per folder instead of once per data file
    label_ranges = [
        (pd.to_datetime(start), pd.to_datetime(end), 'walk' if mode == 'walk' else 'transport')
        for start, end, mode in zip(labels_df['Start Time'], labels_df['End Time'], labels_df['Transportation Mode'])
    ]

    # Read the corresponding data files
    data_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.plt'))

    for data_file in data_files:
        # Refined files are regular CSVs with a single header row (written with
        # DataFrame.to_csv by the refining cell), so only that row is replaced
        # by `names`. Skipping six rows, as for the raw .plt files, would
        # silently drop the first five records of every file.
        df = pd.read_csv(os.path.join(folder_path, data_file), header=0, names=['Timestamp', 'Latitude', 'Longitude', 'Altitude', 'Date', 'Time', 'Distance', 'Speed', 'Acceleration'])

        # Encode the labels based on time ranges
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df['Encoded_Mode'] = ''  # Default mode is blank

        # Intervals are inclusive on both ends; later intervals overwrite earlier ones
        for start_time, end_time, encoded_mode in label_ranges:
            mask = (start_time <= df['Timestamp']) & (df['Timestamp'] <= end_time)
            df.loc[mask, 'Encoded_Mode'] = encoded_mode

        # Remove rows with blank Encoded_Mode (records outside every labelled interval)
        df = df[df['Encoded_Mode'] != '']

        encoded_data.append(df)

# Concatenate all the encoded dataframes
combined_encoded_data = pd.concat(encoded_data)

In [None]:
import os
import pandas as pd

base_folder = '/content/drive/MyDrive/USTPE Project/Test Data'
encoded_data = []

# NOTE(review): this cell rebinds `combined_encoded_data`, the same name the
# Refined Data cell uses, so every model cell sees whichever of the two cells
# ran last. Confirm the intended execution order before comparing results.

# Listings are sorted because os.listdir returns entries in arbitrary order,
# and the row order of the combined frame determines the cross-validation folds.
for folder_name in sorted(os.listdir(base_folder)):
    folder_path = os.path.join(base_folder, folder_name)
    labels_file = os.path.join(folder_path, 'labels.txt')

    if not os.path.isfile(labels_file):
        continue

    labels_df = pd.read_csv(labels_file, delimiter='\t')

    # Parse the labelled intervals once per folder instead of once per data file
    label_ranges = [
        (pd.to_datetime(start), pd.to_datetime(end), 'walk' if mode == 'walk' else 'transport')
        for start, end, mode in zip(labels_df['Start Time'], labels_df['End Time'], labels_df['Transportation Mode'])
    ]

    # Read the corresponding data files
    data_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.plt'))

    for data_file in data_files:
        # Only the single CSV header row is replaced by `names`; skipping six
        # rows would silently drop the first five records of every file.
        # Assumes Test Data files share the one-header-row layout of the
        # Refined Data files -- TODO confirm.
        df = pd.read_csv(os.path.join(folder_path, data_file), header=0, names=['Timestamp', 'Latitude', 'Longitude', 'Altitude', 'Date', 'Time', 'Distance', 'Speed', 'Acceleration'])

        # Encode the labels based on time ranges
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df['Encoded_Mode'] = ''  # Default mode is blank

        # Intervals are inclusive on both ends; later intervals overwrite earlier ones
        for start_time, end_time, encoded_mode in label_ranges:
            mask = (start_time <= df['Timestamp']) & (df['Timestamp'] <= end_time)
            df.loc[mask, 'Encoded_Mode'] = encoded_mode

        # Remove rows with blank Encoded_Mode (records outside every labelled interval)
        df = df[df['Encoded_Mode'] != '']

        encoded_data.append(df)

# Concatenate all the encoded dataframes
combined_encoded_data = pd.concat(encoded_data)

In [None]:
# Logistic Regression

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Features (X) are the three derived motion columns; the target (y) is the
# 'walk' / 'transport' label.
feature_columns = ['Distance', 'Speed', 'Acceleration']
X = combined_encoded_data[feature_columns]
y = combined_encoded_data['Encoded_Mode']

# Five train/test index pairs from TimeSeriesSplit; a fresh model is fitted
# and scored on each pair.
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # Fit logistic regression on this fold's training rows
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Score the fold's held-out rows
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7744286561767282
Classification Report:
               precision    recall  f1-score   support

   transport       0.98      0.73      0.84     28318
        walk       0.47      0.95      0.63      7081

    accuracy                           0.77     35399
   macro avg       0.73      0.84      0.73     35399
weighted avg       0.88      0.77      0.80     35399

Accuracy: 0.7544845899601683
Classification Report:
               precision    recall  f1-score   support

   transport       0.78      0.88      0.83     24016
        walk       0.66      0.48      0.56     11383

    accuracy                           0.75     35399
   macro avg       0.72      0.68      0.69     35399
weighted avg       0.74      0.75      0.74     35399

Accuracy: 0.7267436933246701
Classification Report:
               precision    recall  f1-score   support

   transport       0.74      0.83      0.78     21025
        walk       0.70      0.58      0.63     14374

    accuracy           

In [None]:
# KNN

from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Features (X) are the three derived motion columns; the target (y) is the
# 'walk' / 'transport' label.
X = combined_encoded_data.loc[:, ['Distance', 'Speed', 'Acceleration']]
y = combined_encoded_data.loc[:, 'Encoded_Mode']

# Evaluate a 5-nearest-neighbours classifier on each TimeSeriesSplit fold
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    # Slice this fold's rows by position
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # A new classifier per fold, so nothing carries over between folds
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Report overall accuracy and per-class precision / recall / F1
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7764343625526144
Classification Report:
               precision    recall  f1-score   support

   transport       0.96      0.75      0.84     28318
        walk       0.47      0.89      0.61      7081

    accuracy                           0.78     35399
   macro avg       0.72      0.82      0.73     35399
weighted avg       0.87      0.78      0.80     35399

Accuracy: 0.8218311251730275
Classification Report:
               precision    recall  f1-score   support

   transport       0.90      0.83      0.86     24016
        walk       0.69      0.80      0.74     11383

    accuracy                           0.82     35399
   macro avg       0.80      0.82      0.80     35399
weighted avg       0.83      0.82      0.82     35399

Accuracy: 0.7897115737732704
Classification Report:
               precision    recall  f1-score   support

   transport       0.83      0.82      0.82     21025
        walk       0.74      0.75      0.74     14374

    accuracy           

In [None]:
# SVM

from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Features (X) are the three derived motion columns; the target (y) is the
# 'walk' / 'transport' label.
X = combined_encoded_data[['Distance', 'Speed', 'Acceleration']]
y = combined_encoded_data['Encoded_Mode']

# Materialise the five TimeSeriesSplit folds, then fit and score an SVC
# (default settings) on each one in turn.
tscv = TimeSeriesSplit(n_splits=5)
folds = list(tscv.split(X))
for train_index, test_index in folds:
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train on the fold's training rows and predict its held-out rows
    model = SVC()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the fold
    fold_accuracy = accuracy_score(y_test, y_pred)
    fold_report = classification_report(y_test, y_pred)
    print("Accuracy:", fold_accuracy)
    print("Classification Report:\n", fold_report)

Accuracy: 0.7342862792734258
Classification Report:
               precision    recall  f1-score   support

   transport       0.99      0.68      0.80     28318
        walk       0.43      0.97      0.59      7081

    accuracy                           0.73     35399
   macro avg       0.71      0.82      0.70     35399
weighted avg       0.88      0.73      0.76     35399

Accuracy: 0.814260289838696
Classification Report:
               precision    recall  f1-score   support

   transport       0.96      0.76      0.85     24016
        walk       0.65      0.93      0.76     11383

    accuracy                           0.81     35399
   macro avg       0.80      0.85      0.81     35399
weighted avg       0.86      0.81      0.82     35399

Accuracy: 0.7895703268453911
Classification Report:
               precision    recall  f1-score   support

   transport       0.91      0.72      0.80     21025
        walk       0.69      0.89      0.77     14374

    accuracy            

In [None]:
# bagging

from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data into features (X) and target (y)
X = combined_encoded_data[['Distance', 'Speed', 'Acceleration']]
y = combined_encoded_data['Encoded_Mode']

# Bagging draws bootstrap samples and decision trees break ties randomly, so
# both estimators are seeded; without a seed every run of this cell reports
# different scores.
RANDOM_STATE = 42

# Perform time series split
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the bagging model: 10 decision trees, each fitted on a bootstrap sample.
    # The base model is passed positionally because its keyword name differs
    # between scikit-learn versions.
    base_model = DecisionTreeClassifier(random_state=RANDOM_STATE)  # Base model to be used in bagging
    bagging_model = BaggingClassifier(base_model, n_estimators=10, random_state=RANDOM_STATE)  # Number of base models
    bagging_model.fit(X_train, y_train)

    # Predict on the testing set
    y_pred = bagging_model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)

Accuracy: 0.8117460945224442
Classification Report:
               precision    recall  f1-score   support

   transport       0.95      0.81      0.87     28318
        walk       0.52      0.81      0.63      7081

    accuracy                           0.81     35399
   macro avg       0.73      0.81      0.75     35399
weighted avg       0.86      0.81      0.83     35399

Accuracy: 0.832509392920704
Classification Report:
               precision    recall  f1-score   support

   transport       0.87      0.89      0.88     24016
        walk       0.76      0.71      0.73     11383

    accuracy                           0.83     35399
   macro avg       0.81      0.80      0.80     35399
weighted avg       0.83      0.83      0.83     35399

Accuracy: 0.7739484166219385
Classification Report:
               precision    recall  f1-score   support

   transport       0.78      0.87      0.82     21025
        walk       0.77      0.63      0.69     14374

    accuracy            

In [None]:
# XGBoost

from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Split the data into features (X) and target (y)
X = combined_encoded_data[['Distance', 'Speed', 'Acceleration']]
y = combined_encoded_data['Encoded_Mode']

# Encode the target variable as integers for XGBoost
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Perform time series split
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # Train the XGBoost model
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # Predict on the testing set
    y_pred = model.predict(X_test)

    # Decode both truth and predictions back to 'transport' / 'walk' so the
    # report names the classes like the other model cells, instead of 0 / 1.
    # (The decoded predictions used to be computed and then never used.)
    y_test_decoded = label_encoder.inverse_transform(y_test)
    y_pred_decoded = label_encoder.inverse_transform(y_pred)

    # Evaluate the model
    accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
    report = classification_report(y_test_decoded, y_pred_decoded)

    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)


Accuracy: 0.8213508856182378
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.80      0.88     28318
           1       0.53      0.92      0.67      7081

    accuracy                           0.82     35399
   macro avg       0.75      0.86      0.78     35399
weighted avg       0.89      0.82      0.84     35399

Accuracy: 0.8570016102149778
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.86      0.89     24016
           1       0.74      0.86      0.79     11383

    accuracy                           0.86     35399
   macro avg       0.83      0.86      0.84     35399
weighted avg       0.87      0.86      0.86     35399

Accuracy: 0.820898895449024
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85     21025
           1       0.77      0.80      0.78     14374

    accuracy            

In [None]:
# XGBoost on Test Data

from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# NOTE(review): this cell reads `combined_encoded_data`, which holds the Test
# Data only if the Test Data loading cell was the last loader to run. It also
# trains and scores within that data rather than scoring a model trained on
# the Refined Data -- confirm this is the intended evaluation.

# Split the data into features (X) and target (y)
X = combined_encoded_data[['Distance', 'Speed', 'Acceleration']]
y = combined_encoded_data['Encoded_Mode']

# Encode the target variable as integers for XGBoost
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Perform time series split
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # Train the XGBoost model
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # Predict on the testing set
    y_pred = model.predict(X_test)

    # Decode both truth and predictions back to 'transport' / 'walk' so the
    # report names the classes instead of 0 / 1.
    # (The decoded predictions used to be computed and then never used.)
    y_test_decoded = label_encoder.inverse_transform(y_test)
    y_pred_decoded = label_encoder.inverse_transform(y_pred)

    # Evaluate the model
    accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
    report = classification_report(y_test_decoded, y_pred_decoded)

    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)

Accuracy: 0.9849306811332128
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4908
           1       0.45      0.43      0.44        69

    accuracy                           0.98      4977
   macro avg       0.72      0.71      0.72      4977
weighted avg       0.98      0.98      0.98      4977

Accuracy: 0.9541892706449668
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      4603
           1       0.87      0.46      0.60       374

    accuracy                           0.95      4977
   macro avg       0.91      0.73      0.79      4977
weighted avg       0.95      0.95      0.95      4977

Accuracy: 0.8071127185051236
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.94      0.87      3379
           1       0.81      0.52      0.64      1598

    accuracy           

In [None]:
# XGBoost on Test Data with GridSearchCV

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Split the data into features (X) and target (y)
X = combined_encoded_data[['Distance', 'Speed', 'Acceleration']]
y = combined_encoded_data['Encoded_Mode']

# Encode the target variable as integers for XGBoost
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Perform time series split
tscv = TimeSeriesSplit(n_splits=5)

# Define the parameter grid to search (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Grid search that itself cross-validates with the same splitter; it is
# re-fitted from scratch on every outer fold's training rows below.
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=param_grid, cv=tscv)

# Iterate over the splits and perform grid search
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # Perform grid search on the training data only, so the held-out rows
    # never influence the choice of hyper-parameters
    grid_search.fit(X_train, y_train)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predict on the testing set
    y_pred = best_model.predict(X_test)

    # Decode both truth and predictions back to 'transport' / 'walk' so the
    # report names the classes instead of 0 / 1.
    # (The decoded predictions used to be computed and then never used.)
    y_test_decoded = label_encoder.inverse_transform(y_test)
    y_pred_decoded = label_encoder.inverse_transform(y_pred)

    # Evaluate the model
    accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
    report = classification_report(y_test_decoded, y_pred_decoded)

    print("Best Parameters:", grid_search.best_params_)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)

Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Accuracy: 0.9857343781394414
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4908
           1       0.49      0.55      0.52        69

    accuracy                           0.99      4977
   macro avg       0.74      0.77      0.75      4977
weighted avg       0.99      0.99      0.99      4977

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Accuracy: 0.9610206951979103
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      4603
           1       0.94      0.52      0.67       374

    accuracy                           0.96      4977
   macro avg       0.95      0.76      0.82      4977
weighted avg       0.96      0.96      0.96      4977

Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Accurac