## IMDB dataset

In [11]:
import pandas as pd

from pathlib import Path
from typing import List, Tuple, Dict


def read_acl_imdb(root_dir: str) -> Dict[str, pd.DataFrame]:
    """Load the ACL IMDB review corpus into one DataFrame per split.

    Review files are expected at ``<root_dir>/<split>/<pos|neg>/*.txt`` for the
    ``train`` and ``test`` splits.

    Args:
        root_dir: Path of the dataset root directory.

    Returns:
        A dict with the keys ``"train"`` and ``"test"``. Each value is a
        DataFrame with the columns ``text`` (file contents), ``label``
        (1 = positive, 0 = negative) and ``label_name`` ("positive" /
        "negative"). Within a split the positive reviews come first, and the
        files of each folder are read in sorted path order.

    Raises:
        FileNotFoundError: If the root, or any ``<split>/<pos|neg>`` folder
            under it, does not exist.
    """
    root = Path(root_dir)
    if not root.exists():
        raise FileNotFoundError(f"Dataset root not found: {root}")

    label_names = {0: "negative", 1: "positive"}

    def read_split(split: str) -> pd.DataFrame:
        # Collect every review of one split, positives before negatives.
        texts: List[str] = []
        labels: List[int] = []
        for folder_name, label_value in (("pos", 1), ("neg", 0)):
            folder = root / split / folder_name
            if not folder.exists():
                raise FileNotFoundError(f"Missing folder: {folder}")
            review_files = sorted(folder.glob("*.txt"))
            # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
            texts.extend(
                review_file.read_text(encoding="utf-8", errors="ignore")
                for review_file in review_files
            )
            labels.extend([label_value] * len(review_files))
        frame = pd.DataFrame({"text": texts, "label": labels})
        frame["label_name"] = frame["label"].map(label_names)
        return frame

    # "train" is read before "test", so a broken train split is reported first.
    return {split: read_split(split) for split in ("train", "test")}

# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
ACL_IMDB_ROOT = "/home/snt/projects_lujun/LabAgentSkill/assets/datasets/aclImdb"

# Read both splits once and bind each one to its own name.
dataset = read_acl_imdb(ACL_IMDB_ROOT)
train_df, test_df = dataset["train"], dataset["test"]

In [21]:
# Character count of each review text, stored in a new `len` column.
# `assign` returns a new frame instead of writing into the existing one, so the
# frame held in `dataset["test"]` is left untouched, and re-running this cell
# after `test_df` has been rebound to a filtered slice does not raise
# SettingWithCopyWarning.
test_df = test_df.assign(len=test_df["text"].str.len())



In [24]:
# Keep only reviews whose `len` is strictly between 300 and 500.
review_len = test_df["len"]
test_df = test_df.loc[(review_len > 300) & (review_len < 500)]

In [25]:
# Randomly draw 150 reviews for each label (0, then 1). Every draw is seeded
# with the same fixed random_state, and the draws are stacked in label order
# under a fresh 0..N-1 index.
per_label_draws = [
    test_df.loc[test_df["label"] == label_value].sample(n=150, random_state=42)
    for label_value in (0, 1)
]
test_df_sampled = pd.concat(per_label_draws, ignore_index=True)

# Summary: size of `test_df` as it is bound when this cell runs, size of the
# sample, the per-label counts, and the column names.
print(f"原始测试集: {len(test_df)} 样本")
print(f"采样后测试集: {len(test_df_sampled)} 样本")
print(f"\n采样后label分布:\n{test_df_sampled['label_name'].value_counts()}")
print(f"\n列名: {list(test_df_sampled.columns)}")
test_df_sampled.head()

原始测试集: 1694 样本
采样后测试集: 300 样本

采样后label分布:
label_name
negative    150
positive    150
Name: count, dtype: int64

列名: ['text', 'label', 'label_name', 'len']


Unnamed: 0,text,label,label_name,len
0,Tim Curry was the reason I wanted to see this ...,0,negative,345
1,There are no spoilers for this film as nothing...,0,negative,313
2,Dennis Quaid is tryin' hard to prove us that J...,0,negative,403
3,"This is the worst italian movie ever, quite po...",0,negative,366
4,This is a low grade cold war propaganda film c...,0,negative,400


In [26]:
from datasets import Dataset
from huggingface_hub import login
import os

# Token comes from the environment; a missing variable yields an empty string.
hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN", "")

# Convert the sampled frame into a Hugging Face dataset.
hf_dataset = Dataset.from_pandas(test_df_sampled)

dataset_name = "Volavion/imdb-sampled-300"

# A blank or whitespace-only token is passed as None.
# NOTE(review): presumably the library then falls back to locally stored
# credentials; confirm against the `push_to_hub` documentation.
push_token = hf_token if hf_token.strip() else None

# Upload as a public dataset repository (private=False).
hf_dataset.push_to_hub(dataset_name, private=False, token=push_token)


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 248.07ba/s]
Processing Files (1 / 1): 100%|██████████| 82.1kB / 82.1kB, 82.1kB/s  
New Data Upload: 100%|██████████| 82.1kB / 82.1kB, 82.1kB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.62s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/Volavion/imdb-sampled-300/commit/65f11bfe828029c9eec5d711cd9c29d28acbed84', commit_message='Upload dataset', commit_description='', oid='65f11bfe828029c9eec5d711cd9c29d28acbed84', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Volavion/imdb-sampled-300', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Volavion/imdb-sampled-300'), pr_revision=None, pr_num=None)

In [20]:
from datasets import load_dataset
# Fetch the "train" split of the repository and convert it to a pandas DataFrame.
dataset_name = "Volavion/imdb-sampled-300"
loaded_dataset = load_dataset(path=dataset_name, split="train")
loaded_df = loaded_dataset.to_pandas()