# This notebook extracts and generates the labelled dataset required for the classifier


In [2]:
from dotenv import load_dotenv
import os
from huggingface_hub import login
# Load environment variables from the .env file.
load_dotenv()
# NOTE(review): this cell was meant to log in to Hugging Face using a token from
# the environment, but no `login(...)` call follows, so the `login` import above
# is unused here. If authentication turns out to be required, call `login` with
# a token read from the environment (never hard-code it) — TODO confirm.


True

In [3]:
from datasets import load_dataset
# Load the "en" split of the "whisper_v3" configuration of SoccerNet/SN-echoes.
dataset = load_dataset(
    path="SoccerNet/SN-echoes",
    name="whisper_v3",
    split="en",
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating original split: 100%|██████████| 923181/923181 [00:00<00:00, 1309003.95 examples/s]
Generating en split: 100%|██████████| 679738/679738 [00:00<00:00, 1643104.48 examples/s]


In [4]:
# Peek at the first three rows: the game identifier, then its commentary text.
for i in range(3):
    row = dataset[i]  # fetch the row once and reuse it for both fields
    print(row["game"])
    print(row["text"])  # plain commentary string for this row


england_epl/2014-2015/2015-02-21 - 18-00 Chelsea 1 - 1 Burnley/1
and everything is possible
england_epl/2014-2015/2015-02-21 - 18-00 Chelsea 1 - 1 Burnley/1
Felipe Luis for Azpilicueta and Zouma for Kejil, the rest of the team is the one that has been playing all season and almost immovable, so we certainly have a more than competitive team in Chelsea
england_epl/2014-2015/2015-02-21 - 18-00 Chelsea 1 - 1 Burnley/1
César opened towards Eden Hazard, Hazard who was looking to gain the baseline there, leans on Cuadrado, Cuadrado who leaves that ball behind for Matic, Chelsea already dominating territorially after this first minute and a half of play


In [9]:
import os
import re
import csv
from tqdm import tqdm
from datasets import load_dataset

# Path to output file
output_file = "data/football_commentary_labelled.csv"

# ✅ Ensure the output directory exists. `os.path.dirname` returns "" for a bare
# filename and `os.makedirs("")` raises FileNotFoundError, so only create the
# directory when the path actually has a directory component.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# 🧠 Heuristic labeler
# Ordered (label, compiled pattern) rules; the first pattern that matches wins,
# so "PAST" takes precedence over "IRRELEVANT".
_LABEL_RULES = (
    ("PAST", re.compile(r'\b(last season|last year|in \d{4}|back in|previous game|previous match|earlier|in the past)\b')),
    ("IRRELEVANT", re.compile(r'\b(born|grew up|idol|favorite|supports|childhood|as a kid|family|loves)\b')),
)


def label_line(text: str) -> str:
    """Assign a heuristic label to one line of commentary.

    The text is lower-cased and tested against the keyword patterns in order:
    references to earlier seasons, years or matches yield "PAST"; biographical
    or personal keywords yield "IRRELEVANT"; anything else is "CURRENT".

    Args:
        text: A single line of commentary.

    Returns:
        One of "PAST", "IRRELEVANT" or "CURRENT".
    """
    lowered = text.lower()
    for label, pattern in _LABEL_RULES:
        if pattern.search(lowered):
            return label
    return "CURRENT"

# 📦 Load dataset
print("📦 Loading SoccerNet-Echoes...")
dataset = load_dataset("SoccerNet/SN-echoes", "whisper_v3", split="en")

# 💾 Stream every row to CSV as (game, text, label); the label is computed by
# label_line from the row's text.
print(f"📝 Writing labeled data to {output_file}...")
with open(output_file, mode="w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["game", "text", "label"])
    # The generator is consumed lazily, so rows are written as tqdm advances.
    writer.writerows(
        [row["game"], row["text"], label_line(row["text"])]
        for row in tqdm(dataset, desc="Labelling lines")
    )

print(f"\n✅ Done! Saved to: {output_file}")


📦 Loading SoccerNet-Echoes...
📝 Writing labeled data to data/football_commentary_labelled.csv...


Labelling lines: 100%|██████████| 679738/679738 [01:07<00:00, 10049.03it/s]


✅ Done! Saved to: data/football_commentary_labelled.csv





In [13]:
import pandas as pd

In [15]:
# Re-read the labelled CSV from disk and print the number of rows per label.
df = pd.read_csv("data/football_commentary_labelled.csv")
label_counts = df["label"].value_counts()
print(label_counts)

label
CURRENT       676935
PAST            2003
IRRELEVANT       800
Name: count, dtype: int64


## So undersample CURRENT -> 50K and oversample PAST and IRRELEVANT -> 50K each

In [16]:
import pandas as pd
from sklearn.utils import resample
import os

# 📥 Load the dataset
input_file = "data/football_commentary_labelled.csv"
output_file = "data/football_commentary_balanced.csv"

# Every class is resampled to this many rows; the seed keeps the run reproducible.
TARGET_PER_CLASS = 50_000
RANDOM_SEED = 42


def _resample_to_target(frame, n_samples=TARGET_PER_CLASS, seed=RANDOM_SEED):
    """Return `frame` resampled to exactly `n_samples` rows.

    Samples without replacement when `frame` has at least `n_samples` rows
    (down-sampling) and with replacement otherwise (up-sampling), so the call
    does not fail merely because a class is smaller or larger than the target.

    Args:
        frame: DataFrame holding the rows of a single class.
        n_samples: Number of rows to return.
        seed: Random state passed to `resample`.

    Returns:
        A DataFrame with `n_samples` rows drawn from `frame`.

    Raises:
        ValueError: If `frame` is empty, since there is nothing to sample from.
    """
    if frame.empty:
        raise ValueError("Cannot resample an empty class; check the label column.")
    return resample(frame,
                    replace=len(frame) < n_samples,
                    n_samples=n_samples,
                    random_state=seed)


# Ensure path exists. `os.path.dirname` returns "" for a bare filename and
# `os.makedirs("")` raises FileNotFoundError, so skip creation in that case.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

print("🔄 Loading dataset...")
df = pd.read_csv(input_file)

# 🎯 Split by label
df_current = df[df["label"] == "CURRENT"]
df_past = df[df["label"] == "PAST"]
df_irrelevant = df[df["label"] == "IRRELEVANT"]

# 🔽 Downsample CURRENT and 🔼 upsample PAST / IRRELEVANT to the common target.
# NOTE(review): up-sampling with replacement duplicates rows; if this file is
# later split into train/validation sets, copies of the same line can land on
# both sides — TODO confirm how the balanced file is consumed.
df_current_down = _resample_to_target(df_current)
df_past_up = _resample_to_target(df_past)
df_irrelevant_up = _resample_to_target(df_irrelevant)

# 🧩 Combine all, then shuffle so the three classes are interleaved
balanced_df = pd.concat([df_current_down, df_past_up, df_irrelevant_up])
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Sanity check: every class must end up with exactly the target number of rows.
assert (balanced_df["label"].value_counts() == TARGET_PER_CLASS).all(), \
    "balanced dataset does not have the expected number of rows per class"

# 💾 Save to CSV
balanced_df.to_csv(output_file, index=False)
print(f"✅ Balanced dataset saved to: {output_file}")
print(balanced_df['label'].value_counts())


🔄 Loading dataset...
✅ Balanced dataset saved to: data/football_commentary_balanced.csv
label
PAST          50000
CURRENT       50000
IRRELEVANT    50000
Name: count, dtype: int64
