In [1]:
import re
import pandas as pd

def parse_trip_summary(text):
    """
    Extract key fields from a single trip summary.

    Only the transport mode is extracted; the full (trimmed) summary text is
    kept alongside it. Extend this to parse other fields
    (e.g., start/end times, distance) as needed.

    Parameters
    ----------
    text : str
        Raw text of one trip summary. The mode is expected on a line of the
        form ``- Transport Mode: <value>``.

    Returns
    -------
    dict
        ``{"transport_mode": <str>, "summary": <str>}``. ``transport_mode`` is
        ``"Unknown"`` when the line is missing or its value is empty.
    """
    # Match only spaces/tabs after the colon. `\s*` would also consume the
    # newline, so an empty mode value would capture the *next* line instead.
    mode_match = re.search(r"- Transport Mode:[ \t]*(.*)", text)
    transport_mode = mode_match.group(1).strip() if mode_match else ""
    return {
        # An empty value is treated like a missing line.
        "transport_mode": transport_mode or "Unknown",
        "summary": text.strip(),
    }

In [2]:
# Load the raw text that holds every trip summary.
file_path = "./trip_summaries.txt"  # Adjust the path if needed
with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()

# Summaries are assumed to be delimited by a blank line (two consecutive
# newlines). Each chunk is trimmed, and chunks that end up empty are dropped.
trip_texts = list(filter(None, map(str.strip, content.strip().split("\n\n"))))

# Turn every raw summary into a structured record (one dict per trip).
data = list(map(parse_trip_summary, trip_texts))

# Tabulate the records and report how many trips fall under each mode.
df = pd.DataFrame(data)
print(
    "Trip summary counts by transport mode:",
    df['transport_mode'].value_counts(),
    sep="\n",
)

Trip summary counts by transport mode:
transport_mode
walk       62
bus        36
bike       24
taxi       11
car         7
Mixed       3
Unknown     1
Name: count, dtype: int64


In [2]:
import pandas as pd

# Requires `df` (built from the parsed trip summaries) with a 'transport_mode' column.
main_classes = {"walk", "bike", "bus", "car", "Mixed"}

# Keep labels that belong to the main classes; every other label becomes "others".
is_main_class = df['transport_mode'].isin(main_classes)
df['transport_mode'] = df['transport_mode'].where(is_main_class, "others")

print("Updated transport mode counts:", df['transport_mode'].value_counts(), sep="\n")

Updated transport mode counts:
transport_mode
walk      442
bike      248
bus       242
others    197
car       169
Mixed     138
Name: count, dtype: int64


In [3]:
import os

from openai import AzureOpenAI

# Initialize the Azure OpenAI client.
#
# SECURITY: the API key must never be stored in the notebook. It is read from
# the AZURE_OPENAI_API_KEY environment variable instead. A key was previously
# committed in this cell; treat it as compromised and rotate it.
azure_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
if not azure_api_key:
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set. Export it in the environment that "
        "starts the Jupyter kernel before running this cell."
    )

azure_openai = AzureOpenAI(
    # The endpoint is not a secret; it can be overridden with AZURE_OPENAI_ENDPOINT.
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT",
        "https://intelligencia-openai-lab02.openai.azure.com/",
    ),
    api_key=azure_api_key,
    api_version="2024-02-15-preview",
)

In [None]:
import pandas as pd
import random

# Target number of samples per transport-mode class after balancing.
# The balancing code in this cell samples classes larger than this down to
# `target_count` rows and generates new summaries for smaller classes until
# they reach it.
target_count = 300

# Function to call Azure OpenAI to generate a new trip summary based on a seed.
def _build_augmentation_prompt(seed_summary, transport_mode=None):
    """Build the user prompt for one augmentation request.

    The transport mode is taken from `transport_mode` when given; otherwise it
    is read from the seed's ``- Transport Mode: <value>`` line. If neither
    yields a value, ``"Unknown"`` is used so the request can still be made.
    """
    seed_text = (seed_summary or "").strip()

    if transport_mode is None:
        # Spaces/tabs only after the colon, so an empty value cannot swallow
        # the following line.
        mode_match = re.search(r"- Transport Mode:[ \t]*(.*)", seed_text)
        transport_mode = mode_match.group(1).strip() if mode_match else ""
    transport_mode = transport_mode or "Unknown"

    # The seed is shown to the model as a reference only; it is omitted when
    # no seed text is available.
    reference_section = ""
    if seed_text:
        reference_section = (
            "Use the existing trip summary below only as a reference for style and plausible values.\n"
            "The new summary must be similar but distinct: do not copy its timestamps, addresses, or metrics.\n\n"
            "Reference trip summary:\n"
            + seed_text
            + "\n\n"
        )

    template = (
        """Generate a realistic, GPS-based textual trip summary strictly adhering to the template below. 
        Only generate summaries for the transport mode {transport_mode}. Replace all placeholders with realistic values. 
        Ensure accuracy, consistency, and realism, including plausible speed, acceleration, distances, and turn metrics 
        according to the specified transport mode ({transport_mode}: choose from “walk”, “bike”, “bus”, “car”, or “train”).
        Trip Summary:
        - Start: YYYY-MM-DD HH:MM:SS at [Start Location Address]
        - End: YYYY-MM-DD HH:MM:SS at [End Location Address]
        - Duration: X days HH:MM:SS
        - Distance: X.XX km
        - Average Speed: XX.XX km/h
        - Average Bearing Change: XX.XX°
        - Max Speed: XX.XX km/h
        - Min Speed: XX.XX km/h
        - Speed Variability: XX.XX km/h
        - Average Acceleration: X.XX m/s²
        - Max Acceleration: XX.XX m/s²
        - Number of Turns: XX
        - Turn Rate: XX.XX turns/min
        - Average Turn Angle: XX.XX°
        - Turn Angle Variability: XX.XX°
        - Transport Mode: {transport_mode}

        **Example Transport Modes and plausible speed ranges:**  
        - walk: 3–6 km/h  
        - bike: 10–25 km/h  
        - bus: 15–50 km/h  
        - car: 20–80 km/h  

{reference_section}        Generate the new trip summary below:
        """
    )
    # str.format does not re-scan substituted values, so braces inside the
    # seed text are inserted verbatim.
    return template.format(
        transport_mode=transport_mode,
        reference_section=reference_section,
    )


# Function to call Azure OpenAI to generate a new trip summary based on a seed.
def generate_sample_via_chatgpt(seed_summary, transport_mode=None):
    """
    Generate a new trip summary using Azure OpenAI based on a seed summary.

    The prompt instructs the model to produce a similar, yet distinct, trip
    summary for one transport mode. Previously the prompt was sent with its
    ``{transport_mode}`` placeholders unfilled and the seed was ignored; both
    are now substituted into the prompt.

    Parameters
    ----------
    seed_summary : str
        Existing trip summary used as a reference for the model.
    transport_mode : str, optional
        Mode the generated summary must use. When omitted, it is read from
        the seed's ``- Transport Mode:`` line (falling back to ``"Unknown"``).

    Returns
    -------
    str
        The model's reply with surrounding whitespace removed.

    Notes
    -----
    Uses the module-level `azure_openai` client. Errors raised by the client
    call are not caught here.
    """
    prompt = _build_augmentation_prompt(seed_summary, transport_mode)

    # Build the message list for the completion call.
    messages = [
        {"role": "system", "content": "You are a trip summary augmentation assistant."},
        {"role": "user", "content": prompt}
    ]

    # Call the Azure OpenAI API. Adjust model name and parameters as needed.
    response = azure_openai.chat.completions.create(
        model="gpt-4o",  # Replace with your actual deployment name if needed
        messages=messages,
        temperature=0.7,  # Adjust for creativity if desired
        max_tokens=300    # Adjust based on expected summary length
    )

    new_summary = response.choices[0].message.content.strip()
    return new_summary

# Inputs expected from earlier cells:
#   df           - DataFrame with a 'summary' column (trip summary text) and a
#                  'transport_mode' column already reduced to the main classes,
#                  e.g., "walk", "bike", "bus", "car", "Mixed", and "others".
#   target_count - desired number of rows per class.

# Step 1: Undersample the majority classes down to target_count.
undersampled_list = []
for mode, group in df.groupby('transport_mode'):
    if len(group) > target_count:
        # For classes above target, sample without replacement (fixed seed).
        undersampled_group = group.sample(target_count, random_state=42)
    else:
        undersampled_group = group.copy()
    undersampled_list.append(undersampled_group)
undersampled_df = pd.concat(undersampled_list).reset_index(drop=True)

# Step 2: For classes with fewer than target_count samples, augment using ChatGPT.
# A dedicated, seeded generator picks the seeds so the choice is repeatable
# (matching random_state=42 above) and does not depend on the state of the
# global `random` module. The model's replies themselves remain non-deterministic.
seed_rng = random.Random(42)
augmented_samples = []
for mode, group in undersampled_df.groupby('transport_mode'):
    current_count = len(group)
    if current_count < target_count:
        num_to_generate = target_count - current_count
        # Use existing samples from this class as seeds.
        seed_samples = group['summary'].tolist()
        for _ in range(num_to_generate):
            seed = seed_rng.choice(seed_samples)
            new_sample = generate_sample_via_chatgpt(seed)
            augmented_samples.append({
                'transport_mode': mode,
                'summary': new_sample
            })

# Convert augmented samples into a DataFrame. Columns are named explicitly so
# the frame has the expected schema even when nothing had to be generated.
augmented_df = pd.DataFrame(augmented_samples, columns=['transport_mode', 'summary'])

# Combine the undersampled data with the augmented data.
balanced_df = pd.concat([undersampled_df, augmented_df]).reset_index(drop=True)

# Verify the new class counts.
print("Balanced dataset counts:")
print(balanced_df['transport_mode'].value_counts())

Balanced dataset counts:
transport_mode
Mixed     300
bike      300
bus       300
car       300
others    300
walk      300
Name: count, dtype: int64


In [7]:
# Export 1 of 2: CSV, written without the positional index column.
csv_file_path = "balanced_trip_summaries.csv"
balanced_df.to_csv(path_or_buf=csv_file_path, index=False)
print(f"Balanced dataset saved as CSV: {csv_file_path}")

# Export 2 of 2: JSON Lines, one JSON record per row.
jsonl_file_path = "balanced_trip_summaries.jsonl"
balanced_df.to_json(path_or_buf=jsonl_file_path, lines=True, orient="records")
print(f"Balanced dataset saved as JSONL: {jsonl_file_path}")

Balanced dataset saved as CSV: balanced_trip_summaries.csv
Balanced dataset saved as JSONL: balanced_trip_summaries.jsonl


In [None]:
Now that our dataset is ready, let’s create a new notebook to run inference with the DeepSeek model. We’ll pass in the balanced dataset (balanced_trip_summaries.jsonl) and have the model predict the transport mode of each trip summary. We run this before fine-tuning so that we can compare the model’s performance before and after fine-tuning.