In [1]:
import re
import pandas as pd

def parse_trip_summary(text):
    """
    Extract key fields from a single trip summary.

    Only the transport mode is parsed; the full (stripped) summary text is kept
    alongside it. You can extend this to parse other fields
    (e.g., start/end times, distance).

    Parameters:
        text (str): Raw text of one trip summary, expected to contain a line
            of the form "- Transport Mode: <mode>".

    Returns:
        dict: {"transport_mode": <parsed mode, or "Unknown" when the line is
            missing or its value is blank>, "summary": <text stripped of
            leading/trailing whitespace>}.
    """
    # Use [ \t]* rather than \s*: \s also matches newlines, so a blank
    # "- Transport Mode:" field would otherwise swallow the line break and
    # capture the *next* line of the summary as the mode.
    mode_match = re.search(r"- Transport Mode:[ \t]*(.*)", text)
    transport_mode = mode_match.group(1).strip() if mode_match else ""
    return {
        # A blank value is as uninformative as a missing line.
        "transport_mode": transport_mode or "Unknown",
        "summary": text.strip()
    }

In [2]:
# Load the raw text file that holds every trip summary.
file_path = "./sub_trip_summaries.txt"  # Adjust the path if needed
with open(file_path, "r", encoding="utf-8") as summary_file:
    content = summary_file.read()

# Summaries are separated by a blank line (two consecutive newlines);
# whitespace-only chunks are discarded.
trip_texts = []
for chunk in content.strip().split("\n\n"):
    chunk = chunk.strip()
    if chunk:
        trip_texts.append(chunk)

# Turn every summary into a structured record.
data = list(map(parse_trip_summary, trip_texts))

# One row per trip summary.
df = pd.DataFrame(data)
mode_counts = df['transport_mode'].value_counts()
print("Trip summary counts by transport mode:")
print(mode_counts)

Trip summary counts by transport mode:
transport_mode
walk        76
bus         50
bike        35
car         28
subway      20
taxi        11
train        6
airplane     2
Name: count, dtype: int64


In [3]:
import pandas as pd

# Explicit label merges: 'taxi' trips are counted as 'car'.
mode_mapping = {'taxi': 'car'}

# Classes that keep their own label; everything else becomes 'others'.
main_classes = {"walk", "bike", "bus", "car"}

# Apply the merges first, then blank out every label outside main_classes.
merged_modes = df['transport_mode'].replace(mode_mapping)
outside_main = ~merged_modes.isin(main_classes)
df['transport_mode'] = merged_modes.mask(outside_main, 'others')

updated_counts = df['transport_mode'].value_counts()
print("Updated transport mode counts:")
print(updated_counts)

Updated transport mode counts:
transport_mode
walk      76
bus       50
car       39
bike      35
others    28
Name: count, dtype: int64


In [4]:
from openai import AzureOpenAI
import os

# Initialize the Azure OpenAI client.
# The API key is read from the environment so that no secret is stored in the
# notebook. NOTE: a key was previously hardcoded in this cell; treat it as
# compromised and rotate it.
_azure_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
if not _azure_api_key:
    raise RuntimeError(
        "Set the AZURE_OPENAI_API_KEY environment variable before running this cell."
    )

azure_openai = AzureOpenAI(
    # The endpoint is not a secret; it can still be overridden per environment.
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT",
        "https://intelligencia-openai-lab02.openai.azure.com/",
    ),
    api_key=_azure_api_key,
    api_version="2024-02-15-preview"
)

In [5]:
import pandas as pd
import random

# Number of samples every transport_mode class should have after balancing:
# classes with more rows than this are down-sampled to this size, and classes
# with fewer rows are topped up with generated summaries.
target_count = 40

def _enforce_transport_mode(raw_summary, transport_mode):
    """
    Drop markdown code fences and any transport-mode line written by the model,
    then append a single "- Transport Mode: <transport_mode>" line.
    """
    kept_lines = []
    for line in raw_summary.strip().splitlines():
        stripped = line.strip()
        # Models sometimes wrap the answer in ``` fences; keeping them would
        # leave the appended mode line outside the summary body.
        if stripped.startswith("```"):
            continue
        # Match the mode line regardless of case or leading bullet characters
        # (e.g. "- Transport mode: bus", "Transport Mode: bus").
        if stripped.lstrip("-* ").lower().startswith("transport mode:"):
            continue
        kept_lines.append(line)
    body = "\n".join(kept_lines).strip()
    return f"{body}\n- Transport Mode: {transport_mode}"


# Function to call Azure OpenAI to generate a new trip summary based on a seed.
def generate_sample_via_chatgpt(seed_summary, transport_mode):
    """
    Generate a new trip summary using Azure OpenAI based on a seed summary.
    Explicitly ensures the generated summary matches the provided transport mode.

    Parameters:
        seed_summary (str): Existing trip summary shown to the model as an example.
        transport_mode (str): Label the generated summary must carry; it is
            written into the prompt and enforced on the returned text.

    Returns:
        str: The generated summary with exactly one trailing
            "- Transport Mode: <transport_mode>" line.

    Raises:
        ValueError: If the model returns no text (None or blank content).

    Uses the module-level `azure_openai` client.
    """
    prompt = (
        "You are a trip summary augmentation assistant.\n\n"
        "Generate a new trip summary strictly following the structure below. "
        # The requested mode is the only valid value. The previous wording also
        # said "choose from" a fixed list that did not contain every class
        # (e.g. 'others'), which contradicted this instruction.
        f"The transport mode must be exactly '{transport_mode}' and the details must be realistic "
        "and consistent with this transport mode.\n\n"
        "Template:\n"
        "Trip Summary:\n"
        "- Start: YYYY-MM-DD HH:MM:SS at [Start Location Address]\n"
        "- End: YYYY-MM-DD HH:MM:SS at [End Location Address]\n"
        "- Duration: X days HH:MM:SS\n"
        "- Distance: X.XX km\n"
        "- Average Speed: XX.XX km/h\n"
        "- Average Bearing Change: XX.XX°\n"
        "- Max Speed: XX.XX km/h\n"
        "- Min Speed: XX.XX km/h\n"
        "- Speed Variability: XX.XX km/h\n"
        "- Average Acceleration: X.XX m/s²\n"
        "- Max Acceleration: XX.XX m/s²\n"
        "- Number of Turns: XX\n"
        "- Turn Rate: XX.XX turns/min\n"
        "- Average Turn Angle: XX.XX°\n"
        "- Turn Angle Variability: XX.XX°\n"
        f"- Transport Mode: {transport_mode}\n\n"
        f"Example Trip Summary for reference:\n{seed_summary}\n\n"
        "Now generate the new trip summary:"
    )

    messages = [
        {"role": "system", "content": "You are a trip summary augmentation assistant."},
        {"role": "user", "content": prompt}
    ]

    response = azure_openai.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.7,
        max_tokens=300
    )

    # `content` can be None (for example when the response is filtered); fail
    # with a clear message instead of an AttributeError on `.strip()`, and do
    # not let an empty sample slip into the dataset.
    content = response.choices[0].message.content
    if not content or not content.strip():
        raise ValueError(
            f"Model returned an empty trip summary for transport mode '{transport_mode}'."
        )

    # Ensure explicitly correct transport mode at the end (extra validation).
    return _enforce_transport_mode(content, transport_mode)

# Assumes `df` is the DataFrame of trip summaries with at least the columns
# 'summary' (the text of the trip summary) and 'transport_mode', and that
# transport_mode has already been collapsed to the classes
# "walk", "bike", "bus", "car" and "others".

# Dedicated, seeded RNG so the choice of seed summaries is reproducible,
# consistent with the fixed random_state used for undersampling below.
# (The generated text itself still varies: the model is sampled at temperature 0.7.)
seed_rng = random.Random(42)

# Step 1: Slightly undersample the majority classes.
undersampled_list = []
for mode, group in df.groupby('transport_mode'):
    if len(group) > target_count:
        # For classes above target, sample without replacement.
        undersampled_group = group.sample(target_count, random_state=42)
    else:
        undersampled_group = group.copy()
    undersampled_list.append(undersampled_group)
undersampled_df = pd.concat(undersampled_list).reset_index(drop=True)

# Step 2: For classes with fewer than target_count samples, augment using ChatGPT.
augmented_samples = []
for mode, group in undersampled_df.groupby('transport_mode'):
    num_to_generate = target_count - len(group)
    if num_to_generate <= 0:
        continue  # class already has target_count rows
    seed_samples = group['summary'].tolist()
    for _ in range(num_to_generate):
        # Each generated summary is modelled on one randomly chosen real summary.
        seed = seed_rng.choice(seed_samples)
        new_sample = generate_sample_via_chatgpt(seed, transport_mode=mode)
        augmented_samples.append({
            'transport_mode': mode,
            'summary': new_sample
        })

# Convert augmented samples into a DataFrame.
augmented_df = pd.DataFrame(augmented_samples)

# Combine the undersampled data with the augmented data.
balanced_df = pd.concat([undersampled_df, augmented_df]).reset_index(drop=True)

# Verify the new class counts.
print("Balanced dataset counts:")
print(balanced_df['transport_mode'].value_counts())

Balanced dataset counts:
transport_mode
bike      40
bus       40
car       40
others    40
walk      40
Name: count, dtype: int64


In [6]:
# Persist the balanced dataset in two formats: CSV and JSON Lines.
csv_file_path = "balanced_trip_summaries.csv"
jsonl_file_path = "balanced_trip_summaries.jsonl"

# CSV export, without the DataFrame index column.
balanced_df.to_csv(csv_file_path, index=False)
print("Balanced dataset saved as CSV: " + csv_file_path)

# JSONL export: one JSON record per line.
balanced_df.to_json(jsonl_file_path, orient="records", lines=True)
print("Balanced dataset saved as JSONL: " + jsonl_file_path)

Balanced dataset saved as CSV: balanced_trip_summaries.csv
Balanced dataset saved as JSONL: balanced_trip_summaries.jsonl


Now that the dataset is ready, the next step is a new notebook that runs inference with the DeepSeek model. It takes the balanced dataset (balanced_trip_summaries.jsonl) as input and predicts the transport mode of each trip. We run this before fine-tuning so that the model's performance can be compared before and after fine-tuning.