# Data Preprocessing for Misinformation Analysis

This notebook handles the initial data loading and preprocessing steps for our misinformation analysis task. We will:

1. Load YouTube comments from BigQuery
2. Download and process FakeNewsNet and LIAR datasets
3. Implement multilingual text processing
4. Create a balanced training dataset

## Setup and Dependencies

In [None]:

import os
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account

from googletrans import Translator
from deep_translator import MyMemoryTranslator
import langdetect
from tqdm import tqdm
import requests
import zipfile
import json
import re
import html
import asyncio
from concurrent.futures import ThreadPoolExecutor

try:
    import nest_asyncio  
    nest_asyncio.apply()
except ImportError:
    nest_asyncio = None

# Configure pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

## 1. Loading YouTube Comments from Local CSVs

We'll load the YouTube comments exported in Task 1 from the local CSV files in `../data/` (e.g., `comment1.csv` ... `comment5.csv`). These comments will be used for both training and analysis.

In [None]:

import glob

# Directory containing comment CSVs (e.g., comment1.csv ... comment5.csv)
comments_dir = '../data'
comment_files = sorted(glob.glob(os.path.join(comments_dir, 'comment*.csv')))

if not comment_files:
    raise FileNotFoundError(f"No comment CSV files found matching 'comment*.csv' in {comments_dir}")

# Diversified sampling across files for speed
TOTAL_TARGET_ROWS = 400_000  # adjust for runtime
PER_FILE_CAP = max(1, TOTAL_TARGET_ROWS // max(1, len(comment_files)))

# Mapped possible column variants to standard names used downstream
column_aliases = {
    'comment_text': ['comment_text', 'text', 'content', 'body'],
    'video_id': ['video_id', 'videoId'],
    'comment_id': ['comment_id', 'commentId', 'id'],
    'author': ['author', 'author_display_name', 'author_name', 'authorChannelId', 'author_channel_id'],
    'published_at': ['published_at', 'publishedAt', 'timestamp', 'created_at', 'createdAt'],
    'like_count': ['like_count', 'likeCount', 'likes'],
    'reply_count': ['reply_count', 'replyCount', 'replies']
}
required_cols = ['comment_text', 'video_id', 'comment_id', 'author', 'published_at', 'like_count', 'reply_count']


def normalize_comment_columns(df: pd.DataFrame) -> pd.DataFrame:
    rename_map = {}
    for std_col, candidates in column_aliases.items():
        for cand in candidates:
            if cand in df.columns:
                rename_map[cand] = std_col
                break
    df = df.rename(columns=rename_map)
    # Ensured required columns exist
    for col in required_cols:
        if col not in df.columns:
            if col in ['like_count', 'reply_count']:
                df[col] = 0
            else:
                df[col] = pd.NA
    return df[required_cols]


parts = []
for path in comment_files:
    print(f"Loading {os.path.basename(path)} ...")
    part = pd.read_csv(path, low_memory=False)
    part = normalize_comment_columns(part)
    # Per-file cap to diversify and reduce runtime
    if len(part) > PER_FILE_CAP:
        part = part.sample(n=PER_FILE_CAP, random_state=42)
    parts.append(part)

# Concatenated
df_comments = pd.concat(parts, ignore_index=True)


if len(df_comments) > TOTAL_TARGET_ROWS:
    df_comments = df_comments.sample(n=TOTAL_TARGET_ROWS, random_state=42)


df_comments['published_at'] = pd.to_datetime(df_comments['published_at'], errors='coerce')
for col in ['like_count', 'reply_count']:
    df_comments[col] = pd.to_numeric(df_comments[col], errors='coerce').fillna(0).astype(int)

print(f"Loaded {len(df_comments)} sampled comments from {len(comment_files)} files (cap ~{TOTAL_TARGET_ROWS})")

Loading comment1.csv ...
Loading comment2.csv ...
Loading comment3.csv ...
Loading comment4.csv ...
Loading comment5.csv ...
Loaded 400000 sampled comments from 5 files (cap ~400000)


## 2. Download and Process Training Datasets

We'll now download and process the FakeNewsNet and LIAR datasets for training our misinformation classifier.

In [None]:
# Create data directory if it doesn't exist
data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

# Loaded LIAR dataset from TSV files in liar_dataset/


def load_liar_tsv(filename):
    path = os.path.join(data_dir, 'liar_dataset', filename)
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Expected LIAR TSV at {path}. Place 'train.tsv', 'test.tsv', and 'valid.tsv' in {os.path.dirname(path)}."
        )

    try:
        df_h = pd.read_csv(path, sep='\t', header=0, low_memory=False)
        if {'label', 'statement'}.issubset(set(df_h.columns)):
            df_h = df_h.rename(columns={'statement': 'text'})
            return df_h[['text', 'label']].dropna(subset=['text', 'label'])
    except Exception:
        pass
   
    df = pd.read_csv(
        path,
        sep='\t',
        header=None,
        on_bad_lines='skip',
        engine='python'
    )
    if df.shape[1] < 2:
        raise ValueError(f"Unexpected LIAR TSV shape in {filename}: need at least 2 columns")
    df = pd.DataFrame({
        'text': df.iloc[:, 1].astype(str),
        'label': df.iloc[:, 0].astype(str)
    })
    return df.dropna(subset=['text', 'label'])

# Read LIAR TSV splits
df_liar_train = load_liar_tsv('train.tsv')
df_liar_test = load_liar_tsv('test.tsv')
df_liar_val = load_liar_tsv('valid.tsv')

print(
    f"LIAR TSVs loaded: {len(df_liar_train)} train, {len(df_liar_test)} test, {len(df_liar_val)} validation samples"
)


import glob


def load_fakenewsnet_csvs():
    base_dir = os.path.join(data_dir, 'FakeNewsNetData')
    if not os.path.isdir(base_dir):
        print(f"No FakeNewsNetData directory at {base_dir}")
        return pd.DataFrame(columns=['text', 'label'])
    candidates = sorted(glob.glob(os.path.join(base_dir, '*.csv')))
    frames = []
    for path in candidates:
        try:
            df = pd.read_csv(path, low_memory=False)
           
            if 'title' in df.columns:
                df['text'] = df['title'].astype(str).str.strip()
            elif 'news_url' in df.columns:
                df['text'] = df['news_url'].astype(str).str.strip()
            # Determined label by filename (fake=1, real=0) if no explicit label column exists
            label_col = None
            for c in ['label', 'verdict', 'type', 'class', 'target']:
                if c in df.columns:
                    label_col = c
                    break
            if label_col is not None:
                def map_label(v):
                    if isinstance(v, str):
                        vs = v.strip().lower()
                        if vs in ['fake', 'false', 'pants-fire', 'barely-true']:
                            return 1
                        if vs in ['real', 'true', 'mostly-true', 'half-true']:
                            return 0
                    try:
                        n = int(v)
                        if n in (0, 1):
                            return n
                    except Exception:
                        pass
                    return None
                df['label'] = df[label_col].apply(map_label)
            else:
                name = os.path.basename(path).lower()
                if 'fake' in name:
                    df['label'] = 1
                elif 'real' in name:
                    df['label'] = 0
                else:
                    df['label'] = None
            # Keep only non-empty text rows
            if 'text' not in df.columns:
                print(f"Skipping {os.path.basename(path)}: missing both 'title' and 'news_url' to build text")
                continue
            df['text'] = df['text'].astype(str).str.strip()
            df = df[df['text'] != '']
            df = df[['text', 'label']].dropna(subset=['text', 'label'])
            frames.append(df)
            print(f"Loaded FakeNewsNet CSV: {os.path.basename(path)} with {len(df)} rows")
        except Exception as e:
            print(f"Skipping {os.path.basename(path)}: {e}")
    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame(columns=['text', 'label'])

# Attempt to load FakeNewsNet CSVs
df_fnn = load_fakenewsnet_csvs()

print(f"FakeNewsNet CSV total rows: {len(df_fnn)}")
print(len(df_fnn))


LIAR TSVs loaded: 10200 train, 1262 test, 1283 validation samples
Loaded FakeNewsNet CSV: gossipcop_fake.csv with 5323 rows
Loaded FakeNewsNet CSV: gossipcop_real.csv with 16817 rows
Loaded FakeNewsNet CSV: politifact_fake.csv with 432 rows
Loaded FakeNewsNet CSV: politifact_real.csv with 624 rows
FakeNewsNet CSV total rows: 23196
23196


## 3. Multilingual Text Processing

Now we'll implement language detection and translation for non-English content. This is crucial since our YouTube comments are in multiple languages (RU/UA/EN).

In [None]:

print("Cleaning dataset...")

start_rows = len(df_comments)

# Dropped NAs and whitespace-only comments
df_comments['comment_text'] = df_comments['comment_text'].astype(str)

def clean_text(s: str) -> str:
    s = html.unescape(s)
    s = re.sub(r'<[^>]+>', ' ', s)  # removed HTML tags
    s = re.sub(r'http[s]?://\S+|www\.\S+', ' ', s)  # removed URLs
    s = re.sub(r'[@#]\w+', ' ', s)  # mentions/hashtags
    s = re.sub(r'\s+', ' ', s).strip()
    return s

df_comments['comment_text'] = df_comments['comment_text'].apply(clean_text)



# Deduplicated by IDs and text within the same video
if 'comment_id' in df_comments.columns:
    df_comments = df_comments.drop_duplicates(subset=['comment_id'])



# Reported all the stats
print("Length of the dataset (after cleaning): ", len(df_comments), f"(removed {start_rows - len(df_comments)} rows)")
print("Columns in the dataset: ", df_comments.columns)
print("Number of missing values in the dataset: ", df_comments.isnull().sum())
print("Number of duplicate rows: ", df_comments.duplicated().sum())
print(df_comments.head(3))

Cleaning dataset...
Length of the dataset (after cleaning):  399994 (removed 6 rows)
Columns in the dataset:  Index(['comment_text', 'video_id', 'comment_id', 'author', 'published_at', 'like_count', 'reply_count'], dtype='object')
Number of missing values in the dataset:  comment_text    0
video_id        0
comment_id      0
author          2
published_at    0
like_count      0
reply_count     0
dtype: int64
Number of duplicate rows:  0
                                        comment_text     video_id                  comment_id           author              published_at  like_count  reply_count
0         i meditated to this song and saw my future  zK1mLIeXwsQ  Ugw6Cuof_Y8mvPlKHRV4AaABAg     @kickapoo242 2011-03-24 22:59:00+00:00           0            0
1               thorw back fr man when i was like 10  e2oRqyn7ToQ        UghBak-N0jKrS3gCoAEC  @pabloevenor533 2016-12-29 01:16:38+00:00           0            0
2  for the life of me i cant understand why the l...  RmvCT7j9MFc  Ugx6Io

In [None]:
# Fast multilingual processing with translation enabled
# 1) Quick language detection (ASCII heuristic + optional pycld3 + fallback)
# 2) Full translation of all non-English text with caching and fallback
# 3) Persistent cache to disk for reusability; chunked processing for robustness

import time

# ---- Tunables ----
ASCII_EN_RATIO = 0.98               # pre-label EN if >= 98% ASCII
DETECT_MAX_WORKERS = min(12, max(2, (os.cpu_count() or 8) - 1))
DO_TRANSLATE = True                 # Enable translation for non-English text
TRANSLATE_MAX_UNIQUE = None         # Process as many unique strings as budgets allow (ordered by frequency)
TRANSLATE_TIME_BUDGET_S = 240       # Seconds dedicated to translation within this run (~4 min)
OVERALL_TIME_BUDGET_S = 300         # Upper bound for entire cell (~5 min)
BATCH_SIZE = 120                    # Number of strings sent per request
MAX_RETRIES = 3                     # Retry budget per batch
PAUSE_BETWEEN_BATCHES = 0.25        # Cooldown between requests (seconds)
USE_FALLBACK_TRANSLATOR = False     # Optional secondary provider (disabled by default)
cache_path = os.path.join(data_dir, 'translation_cache.jsonl')


overall_start = time.time()

# Tried with ultra-fast detector
HAVE_CLD3 = False
try:
    import pycld3
    HAVE_CLD3 = True
except Exception:
    HAVE_CLD3 = False


def ascii_ratio(s: str) -> float:
    if not s:
        return 1.0
    return sum(ch.isascii() for ch in s) / len(s)


def detect_lang_one(text: str) -> str:
   
    if ascii_ratio(text) >= ASCII_EN_RATIO:
        return 'en'
    if HAVE_CLD3:
        try:
            r = pycld3.get_language(text)
            if r and r.is_reliable and r.language:
                return r.language
        except Exception:
            pass
    # Fallbacks: langid -> langdetect
    try:
        import langid  # noqa: F401
        code, _ = langid.classify(text)
        return code
    except Exception:
        pass
    try:
        return langdetect.detect(text)
    except Exception:
        return 'unknown'

# Deduplicated for detection
texts = df_comments['comment_text'].astype(str).fillna('')
unique_texts = texts.drop_duplicates()

start_det = time.time()
# Threaded detection to avoid pickling issues in notebooks
with ThreadPoolExecutor(max_workers=DETECT_MAX_WORKERS) as pool:
    unique_langs = list(pool.map(detect_lang_one, unique_texts.tolist()))
lang_map = pd.Series(unique_langs, index=unique_texts.values)
df_comments['detected_lang'] = texts.map(lang_map).fillna('unknown')
det_time = time.time() - start_det
print(f"Language detection done in {det_time:.1f}s with {len(unique_texts)} unique strings.")

# Built/loaded translation cache (text -> translation)
translation_cache: dict[str, str] = {}
if os.path.exists(cache_path):
    try:
        with open(cache_path, 'r') as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    s = obj.get('text'); t = obj.get('translation')
                    if s is not None and t is not None:
                        translation_cache[s] = t
                except Exception:
                    pass
    except Exception:
        pass

print(f"Loaded {len(translation_cache)} cached translations.")


df_comments['translated_text'] = df_comments['comment_text']

if DO_TRANSLATE:
    # Translate non-English texts prioritising the most frequent strings first
    non_en_mask = (df_comments['detected_lang'] != 'en') & (df_comments['detected_lang'] != 'unknown')
    total_non_en_rows = int(non_en_mask.sum())
    non_en_series = df_comments.loc[non_en_mask, 'comment_text'].astype(str)
    freq_series = non_en_series.value_counts()

    candidates = [text for text in freq_series.index if text and text not in translation_cache]
    if TRANSLATE_MAX_UNIQUE is not None:
        candidates = candidates[:TRANSLATE_MAX_UNIQUE]

    if OVERALL_TIME_BUDGET_S is not None and (time.time() - overall_start) >= OVERALL_TIME_BUDGET_S:
        print("Overall time budget exhausted before translation step; skipping translation.")
        candidates = []

    print(f"Translating {len(candidates)} high-impact unique strings (not in cache)...")
    start_tr = time.time()
    translated_unique = 0
    translated_rows = 0

    translator = Translator(service_urls=['translate.googleapis.com'])
    fallback_translator = None
    if USE_FALLBACK_TRANSLATOR:
        try:
            fallback_translator = MyMemoryTranslator(source='en', target='en')
        except Exception as exc:
            fallback_translator = None
            print(f"  Fallback translator unavailable ({exc}); continuing without it.")

    def translate_batch(batch: list[str]) -> list[str]:
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                result = translator.translate(batch, dest='en')
                if not isinstance(result, list):
                    result = [result]
                return [getattr(item, 'text', str(item)) for item in result]
            except Exception:
                wait_time = PAUSE_BETWEEN_BATCHES * attempt
                time.sleep(wait_time)
        return [batch_item for batch_item in batch]

    batches = [candidates[i:i + BATCH_SIZE] for i in range(0, len(candidates), BATCH_SIZE)]
    failed_candidates = 0

    for batch_idx, batch in enumerate(batches, start=1):
        elapsed_overall = time.time() - overall_start
        elapsed_trans = time.time() - start_tr
        if OVERALL_TIME_BUDGET_S is not None and elapsed_overall >= OVERALL_TIME_BUDGET_S:
            print("  Overall time budget reached; stopping translation.")
            break
        if TRANSLATE_TIME_BUDGET_S is not None and elapsed_trans >= TRANSLATE_TIME_BUDGET_S:
            print(f"  Translation time budget ({TRANSLATE_TIME_BUDGET_S}s) reached; stopping translation.")
            break

        print(f"  Processing batch {batch_idx}/{len(batches)} ({len(batch)} strings)...")
        translations = translate_batch(batch)

        successes: list[tuple[str, str]] = []
        for original, translated in zip(batch, translations):
            original_clean = original.strip()
            translated_clean = (translated or '').strip()

            if not translated_clean or translated_clean.lower() == original_clean.lower():
                if fallback_translator is not None:
                    try:
                        fallback = (fallback_translator.translate(original) or '').strip()
                    except Exception:
                        fallback = ''
                    if fallback and fallback.lower() != original_clean.lower():
                        translated_clean = fallback
                # If still empty or identical, skip so it can be retried in a future run
            if not translated_clean or translated_clean.lower() == original_clean.lower():
                failed_candidates += 1
                continue

            translation_cache[original] = translated_clean
            translated_unique += 1
            translated_rows += int(freq_series.get(original, 1))
            successes.append((original, translated_clean))

        if successes:
            with open(cache_path, 'a') as f:
                for original, translated_clean in successes:
                    f.write(json.dumps({'text': original, 'translation': translated_clean}, ensure_ascii=False) + "\n")

        if PAUSE_BETWEEN_BATCHES:
            time.sleep(PAUSE_BETWEEN_BATCHES)

    elapsed = time.time() - start_tr if translated_unique else 0.0
    if translated_unique:
        rate = translated_unique / elapsed if elapsed > 0 else float('inf')
        est_coverage = translated_rows / max(1, total_non_en_rows)
        print(f"Translated {translated_unique} unique strings in {elapsed:.1f}s ({rate:.1f} strings/s)")
        print(f"Estimated coverage of non-English rows this run: {est_coverage:.1%}")
        if failed_candidates:
            print(f"  {failed_candidates} strings returned identical text and will be retried on the next run.")
        if translated_unique < len(candidates):
            print("  Note: not all non-English strings were translated this run. Re-run later to continue using the cache.")

    df_comments.loc[non_en_mask, 'translated_text'] = df_comments.loc[non_en_mask, 'comment_text'].map(translation_cache).fillna(df_comments.loc[non_en_mask, 'comment_text'])

# Final report
non_en_mask = (df_comments['detected_lang'] != 'en') & (df_comments['detected_lang'] != 'unknown')
translated_fraction = (df_comments.loc[non_en_mask, 'translated_text'] != df_comments.loc[non_en_mask, 'comment_text']).mean() if non_en_mask.any() else 0.0
print("\n=== Translation Summary ===")
print(f"Non-English rows: {int(non_en_mask.sum())}")
print(f"Translated fraction: {translated_fraction:.3f}")
print(f"Total cache size: {len(translation_cache)} translations")
print(f"Cell runtime: {time.time() - overall_start:.1f}s (target < 300s)")
if DO_TRANSLATE and translated_fraction < 0.999:
    print("Remaining non-English rows will translate on the next run; cached results prevent duplicates.")
print("\nLanguage distribution:")
print(df_comments['detected_lang'].value_counts().head(15))

Language detection done in 84.2s with 374363 unique strings.
Loaded 80326 cached translations.
Translating 64515 high-impact unique strings (not in cache)...
  Processing batch 1/538 (120 strings)...


  translations = translate_batch(batch)


  Processing batch 2/538 (120 strings)...
  Processing batch 3/538 (120 strings)...
  Processing batch 4/538 (120 strings)...
  Processing batch 5/538 (120 strings)...
  Processing batch 6/538 (120 strings)...
  Processing batch 7/538 (120 strings)...
  Processing batch 8/538 (120 strings)...
  Processing batch 9/538 (120 strings)...
  Processing batch 10/538 (120 strings)...
  Processing batch 11/538 (120 strings)...
  Processing batch 12/538 (120 strings)...
  Processing batch 13/538 (120 strings)...
  Processing batch 14/538 (120 strings)...
  Processing batch 15/538 (120 strings)...
  Processing batch 16/538 (120 strings)...
  Processing batch 17/538 (120 strings)...
  Processing batch 18/538 (120 strings)...
  Processing batch 19/538 (120 strings)...
  Processing batch 20/538 (120 strings)...
  Processing batch 21/538 (120 strings)...
  Processing batch 22/538 (120 strings)...
  Processing batch 23/538 (120 strings)...
  Processing batch 24/538 (120 strings)...
  Processing batch 

In [30]:
translated_fraction = (df_comments.loc[non_en_mask, 'translated_text'] != df_comments.loc[non_en_mask, 'comment_text']).mean() if non_en_mask.any() else 0.0
print("\n=== Translation Summary ===")
print(f"Non-English rows: {int(non_en_mask.sum())}")
print(f"Translated fraction: {translated_fraction:.3f}")
print(f"Total cache size: {len(translation_cache)} translations")
print(f"Cell runtime: {time.time() - overall_start:.1f}s (target < 300s)")
if DO_TRANSLATE and translated_fraction < 0.999:
    print("Remaining non-English rows will translate on the next run; cached results prevent duplicates.")
print("\nLanguage distribution:")
print(df_comments['detected_lang'].value_counts().head(15))
print(len(df_comments))



=== Translation Summary ===
Non-English rows: 85564
Translated fraction: 0.110
Total cache size: 80864 translations
Cell runtime: 222.3s (target < 300s)
Remaining non-English rows will translate on the next run; cached results prevent duplicates.

Language distribution:
detected_lang
en    314430
ja     12937
zh      8259
ar      7586
es      7367
ru      5779
pt      4609
ko      3045
fr      2751
tr      2594
la      2358
hi      1238
km      1164
mr      1153
fa       993
Name: count, dtype: int64
399994


In [31]:
non_en = (df_comments['detected_lang'] != 'en') & (df_comments['detected_lang'] != 'unknown')
coverage = (df_comments.loc[non_en, 'translated_text'] != df_comments.loc[non_en, 'comment_text']).mean()
print("Non-English rows:", int(non_en.sum()))
print("Translated fraction:", f"{coverage:.3f}")
print("\nTop detected languages:\n", df_comments['detected_lang'].value_counts().head(15))

Non-English rows: 85564
Translated fraction: 0.110

Top detected languages:
 detected_lang
en    314430
ja     12937
zh      8259
ar      7586
es      7367
ru      5779
pt      4609
ko      3045
fr      2751
tr      2594
la      2358
hi      1238
km      1164
mr      1153
fa       993
Name: count, dtype: int64


In [32]:
print(df_fnn.head())

                                                text  label
0  Did Miley Cyrus and Liam Hemsworth secretly ge...      1
1  Paris Jackson & Cara Delevingne Enjoy Night Ou...      1
2  Celebrities Join Tax March in Protest of Donal...      1
3  Cindy Crawford's daughter Kaia Gerber wears a ...      1
4      Full List of 2018 Oscar Nominations ‚Äì Variety      1


## 4. Create Balanced Training Dataset

Finally, we'll combine and balance our training data to prepare it for model training.

In [None]:
# Combined LIAR datasets
df_liar = pd.concat([df_liar_train, df_liar_test, df_liar_val], ignore_index=True)

# Created binary labels (true/false) from LIAR's multi-class labels
df_liar['binary_label'] = df_liar['label'].apply(
    lambda x: 1 if str(x).strip().lower() in ['pants-fire', 'false', 'barely-true'] else 0
)

# Prepared LIAR portion
liar_training = pd.DataFrame({
    'text': df_liar['text'],
    'label': df_liar['binary_label']
}).dropna(subset=['text', 'label'])

try:
    fnn_training = df_fnn[['text', 'label']].copy()
except NameError:
    fnn_training = pd.DataFrame(columns=['text', 'label'])

# Combined all training data
training_data = pd.concat([liar_training, fnn_training], ignore_index=True)


# Dropped duplicates and rows with empty text
training_data['text'] = training_data['text'].astype(str).str.strip()
training_data = training_data[(training_data['text'] != '') & training_data['label'].isin([0, 1])]
training_data = training_data.drop_duplicates(subset=['text', 'label'])

print("Length of Training Data:", len(training_data))

MAX_TRAIN_ROWS = 100_000  
if len(training_data) > MAX_TRAIN_ROWS:
    training_data = training_data.sample(n=MAX_TRAIN_ROWS, random_state=42)

print("Training dataset size before balancing:", len(training_data))
print("Class distribution before balancing:\n", training_data['label'].value_counts())

# Balanced the dataset
min_class_size = int(training_data['label'].value_counts().min())
balanced_data = pd.concat([
    training_data[training_data['label'] == 0].sample(min_class_size, random_state=42),
    training_data[training_data['label'] == 1].sample(min_class_size, random_state=42)
], ignore_index=True)

print("Training dataset size after balancing:", len(balanced_data))
print("Class distribution after balancing:\n", balanced_data['label'].value_counts())

# Created stratified train/val/test splits for model training
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(balanced_data, test_size=0.15, stratify=balanced_data['label'], random_state=42)
train_df, test_df = train_test_split(training_data, test_size=0.15, stratify=training_data['label'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1765, stratify=train_df['label'], random_state=42)  # ~15% of original as val


print("Class distribution after balancing:\n", balanced_data['label'].value_counts())
print("Splits sizes: ", len(train_df), len(val_df), len(test_df))

# Saved processed data
balanced_data.to_csv(os.path.join(data_dir, 'balanced_data.csv'), index=False)
training_data.to_csv(os.path.join(data_dir, 'training_data.csv'), index=False)
train_df.to_csv(os.path.join(data_dir, 'training_data_train.csv'), index=False)
val_df.to_csv(os.path.join(data_dir, 'training_data_val.csv'), index=False)
test_df.to_csv(os.path.join(data_dir, 'training_data_test.csv'), index=False)

df_comments.to_csv(os.path.join(data_dir, 'processed_comments.csv'), index=False)

print("\nData saved to:", data_dir)

Length of Training Data: 21853
Training dataset size before balancing: 21853
Class distribution before balancing:
 label
0    16530
1     5323
Name: count, dtype: int64
Training dataset size after balancing: 10646
Class distribution after balancing:
 label
0    5323
1    5323
Name: count, dtype: int64
Class distribution after balancing:
 label
0    5323
1    5323
Name: count, dtype: int64
Splits sizes:  15296 3279 3278

Data saved to: ../data


In [34]:
non_en = (df_comments['detected_lang'] != 'en') & (df_comments['detected_lang'] != 'unknown')
coverage = (df_comments.loc[non_en, 'translated_text'] != df_comments.loc[non_en, 'comment_text']).mean()
print("Non-English rows:", int(non_en.sum()))
print("Translated fraction:", f"{coverage:.3f}")
print("\nTop detected languages:\n", df_comments['detected_lang'].value_counts().head(15))

Non-English rows: 85564
Translated fraction: 0.110

Top detected languages:
 detected_lang
en    314430
ja     12937
zh      8259
ar      7586
es      7367
ru      5779
pt      4609
ko      3045
fr      2751
tr      2594
la      2358
hi      1238
km      1164
mr      1153
fa       993
Name: count, dtype: int64


In [35]:
failed = df_comments[non_en & (df_comments['translated_text'] == df_comments['comment_text'])]
print("Untranslated (non-English) rows:", len(failed))
print(failed[['detected_lang','comment_text']].head(10))

Untranslated (non-English) rows: 76116
    detected_lang                                       comment_text
11             zh             TRUMP MAGA MOVEMENT FOREVER ‚úùÔ∏èüá∫üá∏üáµüá™üçäüòéüòÅüëç
14             zh                                                 üñïüê∑
31             ko  ÎèÑÏò¨ÏÑ†ÏÉùÎãò! Ìï≠ÏÉÅ Í±¥Í∞ï ÌïòÏÖîÏÑú Íµ≠ÎØºÏùÑ Ìñ•Ìï¥ Î∞îÎ•∏ ÏÜåÎ¶¨Î•º Í≥ÑÏÜç Ïô∏Ï≥ê Ï£ºÏãúÍ∏∞ Î∞îÎûçÎãàÎã§....
39             ko  Íµ≠ÎØºÏù¥ ÏÑ†ÌÉù ÌñàÏäµÎãàÎã§ Ïù∏Ï†ïÌïòÍ≥† ÏßÄÍ∏àÎèÑ Î∞òÏÑ±ÏóÜÎäî ÌèêÍ±∞Î¶¨Ï†ïÏπò ÏÑ†Ï†ÑÏÑ†ÎèôÎßåÌïòÎäî Î∂ÅÌïúÏ†ïÏπòÎ•º ...
53             ur                                         üò≠üò≠üòåüòéüí©üí©üí©üí©üí©üí©
63             km                                                 üíúüíú
64             ne                                              üî•üî•üî•üî•üî´
87             ja                               ÊúÄÔºÜÈ´ò„Åß„Åô(*¬¥Ô∏∂`*)‚ô°Thanks!
127            uk  –û–π –±–ª—è –¥—É—Ä–∞–∫, —Ç–µ–±–µ –∂ –≤—Å–µ —Ä–∞–≤–Ω–æ –∂–æ–ø–∞....—Ä—É—Å–∫–∏—Ö ..

In [36]:
passed = df_comments[non_en & (df_comments['translated_text'] != df_comments['comment_text'])]
print("Translated successfully for non-English rows:", len(passed))
print(passed[['detected_lang','comment_text','translated_text']].head(10))

Translated successfully for non-English rows: 9448
   detected_lang                                       comment_text                                    translated_text
6             tr  Ke≈üke kiralara deƒüinselerdi adam ne g√ºzel dedi...  I wish they had mentioned the rents. The man s...
7             ja                                        „Å™„ÅúÂÄíÁΩÆÊ≥ï„Å´„Åó„Åü„Çì„Å†Ôºü              Why did you use the inversion method?
20            es                             Dios bendiga esta ni√±a                                God bless this girl
26            zh                                             zangoüéπ                                             camp üéπ
27            pt  Gosto muito dos v√≠deos de vcs. S√≥ vcs mesmo pr...  I really like your videos. Only you to make a ...
34            ja  ‰∏äÂ≥∂„Åï„Çì„ÅÆ„Åì„ÅÆÂíå„ÇÑ„Åã„Å™Á¥†Êïµ„Å™Á¨ëÈ°î„Çí„ÇÇ„ÅÜ„ÉÜ„É¨„Éì„ÅßË¶ã„Çã„Åì„Å®„ÅåÂá∫Êù•„Å™„ÅÑ„ÅÆ„ÅØÂ∞ë„ÅóÂØÇ„Åó„ÅÑ„Åß„Åô„Åå„ÄÅÊ≤¢Â±±„ÅÆ...  I'm a little sad that we won't be able t

In [37]:
non_en = df_comments['detected_lang'].ne('en') & df_comments['detected_lang'].ne('unknown')
translated_mask = df_comments.loc[non_en, 'translated_text'].ne(df_comments.loc[non_en, 'comment_text'])

translated_rows = translated_mask.sum()
total_non_en = non_en.sum()
success_rate = translated_rows / total_non_en if total_non_en else 0.0

print(f"Translated rows: {translated_rows:,} / {total_non_en:,} "
      f"({success_rate:.2%} success rate)")

Translated rows: 9,448 / 85,564 (11.04% success rate)
