# HotpotQA Dataset Truncation

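This notebook loads the HotpotQA dev set from Google Drive, truncates it to 150 samples in two ways (the first 150 examples in file order, and a random sample of 150 drawn with a fixed seed), saves both truncated datasets back to Drive, and compares question/answer length statistics between the original and truncated sets.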

## Import Required Libraries

In [1]:
import json
import random
import pandas as pd
import os
from google.colab import drive

## Mount Google Drive

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# paths used later in this notebook live under this mount point.
print("Mounting Google Drive...")
drive.mount('/content/drive')

Mounting Google Drive...
Mounted at /content/drive


## Load the HotpotQA Dataset

In [3]:
# Location of the HotpotQA dev file on the mounted shared drive
file_path = '/content/drive/Shareddrives/517 nlp project/data/HotpotQA/dev.json'

# Parse the whole JSON file into memory
print(f"Loading dataset from: {file_path}")
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Original dataset loaded: {len(data)} samples")

# The code below indexes `data[0]` and later cells slice `data[:150]`, so the
# payload must be a non-empty list. Fail here with a clear message rather than
# with an opaque IndexError/KeyError further down.
if not isinstance(data, list) or not data:
    raise ValueError(
        f"Expected a non-empty JSON list of samples in {file_path}, "
        f"got {type(data).__name__} with {len(data)} entries"
    )

# Show the keys of the first record to verify the structure
print("\nFirst item structure:")
keys = data[0].keys()
print(f"Keys in data: {list(keys)}")

Loading dataset from: /content/drive/Shareddrives/517 nlp project/data/HotpotQA/dev.json
Original dataset loaded: 7405 samples

First item structure:
Keys in data: ['question', 'answer', 'original_question', 'original_context']


## Analyze Question Types

In [4]:
# Tally samples per question type. The whole analysis is skipped when the
# first record carries no 'type' key; records missing the key individually
# are counted under 'unknown'.
if 'type' in data[0]:
    question_types = {}
    for item in data:
        q_type = item.get('type', 'unknown')
        # dict.get supplies the zero for a type seen for the first time
        question_types[q_type] = question_types.get(q_type, 0) + 1

    print("\nQuestion types distribution:")
    for q_type, count in question_types.items():
        print(f"  {q_type}: {count}")

## Create Truncated Datasets

In [5]:
# Sampling configuration, named once instead of repeating the literals.
# NOTE: the variable names and output file names elsewhere in this notebook
# embed "150"; keep them in sync if SAMPLE_SIZE is ever changed.
SAMPLE_SIZE = 150
SEED = 42

# Sequential subset: the first SAMPLE_SIZE records in file order
# (slicing already caps at the dataset length).
truncated_first_150 = data[:SAMPLE_SIZE]

# Random subset: seed the RNG so the draw is reproducible, then sample without
# replacement. The size is capped at len(data) because random.sample raises
# ValueError when asked for more items than the population holds.
random.seed(SEED)
truncated_random_150 = random.sample(data, min(SAMPLE_SIZE, len(data)))

## Save Truncated Datasets

In [6]:
# Destination files, written next to the source dataset on the shared drive
output_dir = '/content/drive/Shareddrives/517 nlp project/data/HotpotQA'
first_output_path = os.path.join(output_dir, 'truncated_first_150.json')
random_output_path = os.path.join(output_dir, 'truncated_random_150.json')

# Write each subset as pretty-printed UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters readable instead of escaping them.
for out_path, subset in (
    (first_output_path, truncated_first_150),
    (random_output_path, truncated_random_150),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(subset, f, ensure_ascii=False, indent=2)

print("\nTruncated datasets saved to:")
print(f"- {first_output_path}")
print(f"- {random_output_path}")


Truncated datasets saved to:
- /content/drive/Shareddrives/517 nlp project/data/HotpotQA/truncated_first_150.json
- /content/drive/Shareddrives/517 nlp project/data/HotpotQA/truncated_random_150.json


## Analyze Dataset Statistics

In [7]:
def analyze_dataset(name: str, dataset: list) -> dict:
    """Print and return word-count statistics for questions and answers.

    Lengths are measured in whitespace-separated tokens (``str.split()``).

    Args:
        name: Label printed in the statistics header.
        dataset: Non-empty sequence of records. Each record must provide a
            string under ``'question'`` and a value under ``'answer'``; the
            answer is passed through ``str()`` before splitting, so
            non-string answers are accepted.

    Returns:
        A dict with the keys ``avg_question_len``, ``max_question_len``,
        ``min_question_len``, ``avg_answer_len``, ``max_answer_len`` and
        ``min_answer_len``, in that order.

    Raises:
        ValueError: If ``dataset`` is empty. Averages and min/max are
            undefined for zero records, and this replaces the bare
            ZeroDivisionError the computation would otherwise raise.
    """
    if len(dataset) == 0:
        raise ValueError(f"Cannot compute statistics for {name!r}: dataset is empty")

    question_lengths = [len(item['question'].split()) for item in dataset]
    answer_lengths = [len(str(item['answer']).split()) for item in dataset]

    stats = {
        "avg_question_len": sum(question_lengths) / len(question_lengths),
        "max_question_len": max(question_lengths),
        "min_question_len": min(question_lengths),
        "avg_answer_len": sum(answer_lengths) / len(answer_lengths),
        "max_answer_len": max(answer_lengths),
        "min_answer_len": min(answer_lengths)
    }

    print(f"\n{name} Statistics:")
    for stat, value in stats.items():
        # Every statistic, including the integer min/max, is shown with two decimals
        print(f"  {stat}: {value:.2f}")

    return stats

In [8]:
# Compute (and print) length statistics for the full dataset and for both
# 150-sample subsets, in that order.
orig_stats, first_stats, random_stats = [
    analyze_dataset(label, subset)
    for label, subset in (
        ("Original Dataset", data),
        ("First 150 Samples", truncated_first_150),
        ("Random 150 Samples", truncated_random_150),
    )
]


Original Dataset Statistics:
  avg_question_len: 967.77
  max_question_len: 2714.00
  min_question_len: 6.00
  avg_answer_len: 2.46
  max_answer_len: 29.00
  min_answer_len: 1.00

First 150 Samples Statistics:
  avg_question_len: 973.56
  max_question_len: 2286.00
  min_question_len: 130.00
  avg_answer_len: 2.41
  max_answer_len: 14.00
  min_answer_len: 1.00

Random 150 Samples Statistics:
  avg_question_len: 974.79
  max_question_len: 1600.00
  min_question_len: 355.00
  avg_answer_len: 2.56
  max_answer_len: 16.00
  min_answer_len: 1.00


## Summary

In [9]:
# Bundle the three dataset variants and their length statistics in one dict
result = dict(
    original=data,
    first_150=truncated_first_150,
    random_150=truncated_random_150,
    stats=dict(
        original=orig_stats,
        first_150=first_stats,
        random_150=random_stats,
    ),
)

print("\nProcess completed successfully!")


Process completed successfully!
