In [3]:
#!/usr/bin/env python
# coding: utf-8

# # DeepSeek Inference on Transport Mode Prediction
#
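# This script runs inference using the local DeepSeek-R1-Distill-Qwen-14B model to predict transport modes from trip summaries.
# The model is loaded with 4-bit (NF4) quantization. The QLoRA fine-tuning described in your report is not applied here: no adapter is loaded, so predictions come from the base checkpoint.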

import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import seaborn as sns

# Import libraries for local model inference
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

## Load the Balanced Dataset

In [None]:
# Read the JSON Lines file: one JSON object per non-empty line.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped so
# that json.loads is never handed an empty string, and the encoding is pinned
# so the result does not depend on the platform default.
with open('balanced_trip_summaries.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

print(f"Dataset size: {len(df)} records")
print("\nTransport mode distribution:")
print(df['transport_mode'].value_counts())
print("\nFirst few records:")
print(df.head())

# ## Setup Local DeepSeek-R1-Distill-Qwen-14B Model

# Single source of truth for the checkpoint id: the same constant is printed and
# passed to both from_pretrained calls so the tokenizer and the model cannot drift apart.
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
print(f"Loading model {MODEL_NAME} ...")
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to
# run locally -- confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# 4-bit quantization settings.
# Loading with this config needs a recent `bitsandbytes` install; otherwise
# from_pretrained raises ImportError asking for `pip install -U bitsandbytes`.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",          # quantization type: "nf4" or "fp4"
    bnb_4bit_compute_dtype=torch.float16  # compute data type
)

# Load the model with the quantization config.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True
)

Dataset size: 200 records

Transport mode distribution:
transport_mode
bike      40
bus       40
car       40
others    40
walk      40
Name: count, dtype: int64

First few records:
  transport_mode                                            summary
0           bike  Trip Summary:\n- Start: 2011-11-08 20:48:08 at...
1           bike  Trip Summary:\n- Start: 2008-04-26 04:43:58 at...
2           bike  Trip Summary:\n- Start: 2008-07-09 12:34:11 at...
3           bike  Trip Summary:\n- Start: 2008-08-12 02:18:58 at...
4           bike  Trip Summary:\n- Start: 2008-07-06 17:20:24 at...
Loading model deepseek-ai/DeepSeek-R1-Distill-Qwen-14B ...


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

## Define Inference Function

In [None]:
def predict_transport_mode(summary, max_new_tokens=10):
    """
    Predict the transport mode from a trip summary using the local DeepSeek-R1-Distill-Qwen-14B model.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        summary (str): The trip summary text.
        max_new_tokens (int): Maximum tokens to generate.

    Returns:
        str: 'bike', 'bus', 'car', 'subway', 'train' or 'walk' when the model's
            completion matches a known alias; otherwise the cleaned, lower-cased
            completion text itself.
    """
    # NOTE(review): the label set offered below (bike, bus, car, subway, train, walk)
    # is not the same as the dataset's labels, which include 'others' -- confirm
    # how subway/train predictions are meant to be scored.
    prompt = f"""
You are a transportation mode analysis expert. Analyze the trip summary below and determine the most likely mode of transportation used.

Pay close attention to these key indicators:
- Average speed and speed variations
- Acceleration patterns
- Number of turns and turn rates
- Duration and distance
- Start and end locations

The trip summary includes various metrics that can help identify the transportation mode:
- Walking typically has slow speeds (3-6 km/h), low acceleration, and potentially high turn rates
- Biking usually shows moderate speeds (10-20 km/h), moderate acceleration, and varied turn patterns
- Bus travel shows moderate speeds (15-30 km/h), lower acceleration, and fewer turns per km
- Cars typically have higher speeds (30-80 km/h), higher acceleration capabilities, and varied turn patterns
- Subway/train travel often has high speeds, very consistent acceleration/deceleration patterns

Based on your analysis, classify the transportation mode as one of: bike, bus, car, subway, train, or walk.
Only respond with the single word for the most likely transportation mode.

Trip Summary:
{summary}
"""
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    # Greedy decoding (do_sample=False) is already deterministic; temperature is
    # only consulted when sampling, so it is not passed.
    outputs = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)

    # The returned sequence starts with the prompt tokens. Decode only the
    # continuation; otherwise the "prediction" would be the whole prompt plus
    # the answer and could never match a mode name.
    prompt_length = input_ids.shape[-1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Normalise case and drop surrounding whitespace/punctuation ("Walk." -> "walk").
    predicted_mode = generated_text.strip().lower().strip(" \t\r\n.,;:!?\"'`*")

    # Map known synonyms onto the canonical mode names.
    canonical_aliases = {
        'bike': ('bike', 'bicycle', 'cycling'),
        'bus': ('bus', 'coach'),
        'car': ('car', 'taxi', 'automobile', 'drive'),
        'subway': ('subway', 'metro', 'underground'),
        'train': ('train', 'rail'),
        'walk': ('walk', 'walking', 'on foot'),
    }
    for canonical, aliases in canonical_aliases.items():
        if predicted_mode in aliases:
            return canonical

    # Unknown answer: hand the cleaned text back so it stays visible downstream.
    return predicted_mode

# ## Test with a Single Example

# Take the summary text and the ground-truth label of the first record.
test_summary = df['summary'].iloc[0]
true_mode = df['transport_mode'].iloc[0]

print(f"\nTest summary:\n{test_summary}\n")
print(f"True transport mode: {true_mode}")

# Uncomment the next lines when you're ready to run inference
# predicted_mode = predict_transport_mode(test_summary)
# print(f"Predicted transport mode: {predicted_mode}")

# ## Run Inference on the Full Dataset
#
# Note: This will run inference for each example.

def run_inference(df, sample_size=None, random_state=42):
    """
    Run inference on the dataset and return results.

    Args:
        df (DataFrame): The dataset; must have 'transport_mode' and 'summary' columns.
        sample_size (int): Optional total number of examples to run. When given and
            smaller than the dataset, an equal share (sample_size // number of modes,
            at least 1) is drawn from every transport mode; a mode with fewer rows
            than its share contributes all of its rows.
        random_state (int | None): Seed for the stratified draw so repeated runs
            evaluate the same examples. Pass None for a different sample each call.

    Returns:
        DataFrame: Copy of the (sampled) dataframe with a 'predicted_mode' column added.
        The input dataframe is not modified.
    """
    if sample_size and sample_size < len(df):
        n_classes = df['transport_mode'].nunique()
        # Guard against a per-class share of zero when sample_size < number of modes.
        per_class = max(1, sample_size // max(1, n_classes))
        # Sample each group explicitly instead of groupby(...).apply(...): the grouping
        # column is guaranteed to stay in the result regardless of pandas version.
        df_sample = pd.concat(
            [
                group.sample(n=min(len(group), per_class), random_state=random_state)
                for _, group in df.groupby('transport_mode')
            ],
            ignore_index=True,
        )
        print(f"Using a stratified sample of {len(df_sample)} examples")
    else:
        df_sample = df
        print(f"Using all {len(df_sample)} examples")

    results_df = df_sample.copy()
    # Collect predictions in row order and assign them in one go; this does not
    # depend on the index being unique, unlike label-based per-row writes.
    results_df['predicted_mode'] = [
        predict_transport_mode(summary)
        for summary in tqdm(
            results_df['summary'],
            total=len(results_df),
            desc="Running inference with local DeepSeek model",
        )
    ]

    return results_df

# Total number of examples to run for a quick test. run_inference splits this
# budget evenly across the transport modes present in the data
# (sample_size // number of modes), so 80 means 16 examples per mode for a
# dataset with 5 modes.
SAMPLE_SIZE = 80

# Run inference
# When ready to run, uncomment the line below:
# results_df = run_inference(df, sample_size=SAMPLE_SIZE)

# ## Evaluate Results

def evaluate_results(results_df):
    """
    Evaluate the results of the inference.

    Prints overall accuracy and a classification report, then shows a confusion
    matrix heatmap and a per-class accuracy bar chart.

    Args:
        results_df (DataFrame): DataFrame with true labels in 'transport_mode'
            and predictions in 'predicted_mode'.

    Returns:
        dict: 'accuracy' (float), 'per_class_accuracy' (dict mapping each true
        class to the fraction of its rows predicted correctly) and
        'classification_report' (the report as a nested dict).
    """
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

    y_true = results_df['transport_mode']
    y_pred = results_df['predicted_mode']

    accuracy = float(accuracy_score(y_true, y_pred))
    print(f"Accuracy: {accuracy:.4f}")

    # zero_division=0 keeps the scores for never-predicted classes at 0 without
    # emitting an undefined-metric warning for each of them.
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))
    report_dict = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    # Classes that actually occur as ground truth (used for per-class accuracy).
    classes = sorted(results_df['transport_mode'].unique())
    # The model may answer with labels that never occur as ground truth. Build the
    # matrix over the union of labels and use that same list for the tick labels,
    # so the axes always line up with the matrix rows/columns.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    plt.show()

    # Every entry of `classes` comes from y_true, so each mask selects at least one row.
    per_class_acc = {
        cls: float((y_pred[y_true == cls] == cls).mean())
        for cls in classes
    }

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=list(per_class_acc.keys()), y=list(per_class_acc.values()), ax=ax)
    ax.axhline(y=accuracy, color='r', linestyle='--', label=f'Overall Accuracy: {accuracy:.4f}')
    ax.set_ylim(0, 1.1)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Transport Mode')
    ax.set_title('Per-Class Accuracy')
    ax.legend()
    plt.show()

    return {
        'accuracy': accuracy,
        'per_class_accuracy': per_class_acc,
        'classification_report': report_dict
    }

# When you have results, uncomment to evaluate
# metrics = evaluate_results(results_df)

# ## Save Results

def save_results(results_df, metrics, model_name):
    """
    Save the results and metrics to files.

    Writes ``<model>_predictions_<timestamp>.csv`` and
    ``<model>_metrics_<timestamp>.json`` into the current working directory.

    Args:
        results_df (DataFrame): DataFrame with predictions.
        metrics (dict): Evaluation metrics.
        model_name (str): Name of the model used. Path separators in the name
            (e.g. "org/model") are replaced with "_" so it is usable as a file name.
    """
    def _to_builtin(value):
        # Fallback for json.dump: numpy scalars/arrays expose tolist(), which
        # yields plain Python numbers/lists.
        if hasattr(value, 'tolist'):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    timestamp = time.strftime("%Y%m%d-%H%M%S")

    # A hub-style id such as "deepseek-ai/DeepSeek-..." would otherwise be read as
    # a directory component and the write would fail on the missing directory.
    safe_name = str(model_name).replace('/', '_').replace('\\', '_')

    filename = f"{safe_name}_predictions_{timestamp}.csv"
    results_df.to_csv(filename, index=False)
    print(f"Saved predictions to {filename}")

    metrics_filename = f"{safe_name}_metrics_{timestamp}.json"
    with open(metrics_filename, 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=2, default=_to_builtin)
    print(f"Saved metrics to {metrics_filename}")

# When you have results, uncomment to save
# save_results(results_df, metrics, MODEL_NAME)

# ## Running the script

if __name__ == "__main__":
    # Usage banner: lists the manual steps needed to enable the commented-out
    # inference, evaluation and save calls.
    print("\n".join([
        "\n" + "=" * 50,
        "Local DeepSeek Inference Script for Transport Mode Prediction",
        "=" * 50,
        "\nTo run inference with the local DeepSeek model:",
        "1. Ensure the DeepSeek model and tokenizer are properly installed and accessible.",
        "2. Uncomment the results_df = run_inference() line.",
        "3. Uncomment the metrics = evaluate_results() line.",
        "4. Uncomment the save_results() line if you want to save the results.",
        "\nNOTE: This script performs local inference and does not incur API costs.",
    ]))