In [1]:
!pip install pandas tqdm spacy langdetect nltk --quiet
!python -m spacy download en_core_web_sm --quiet

[33m  DEPRECATION: Building 'langdetect' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'langdetect'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
import pandas as pd
import re
import emoji
import unicodedata
from tqdm import tqdm
from langdetect import detect
import spacy

In [10]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

tqdm.pandas()

In [None]:
def normalize_unicode(text: str) -> str:
    """Normalize Unicode and remove invisible / separator characters."""
    text = str(text)
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("\u2028", " ").replace("\u2029", " ")
    text = text.replace("\u200d", "").replace("\u200c", "").replace("\u200b", "")
    return text


def extract_emojis(text: str) -> str:
    """Extract all emojis from text."""
    return "".join(ch for ch in str(text) if ch in emoji.EMOJI_DATA)


def remove_emojis(text: str) -> str:
    """Remove all emojis from text."""
    return emoji.replace_emoji(str(text), replace='')


def clean_text(text: str) -> str:
    """Lowercase, remove URLs, punctuation, extra spaces; keep negations."""
    text = normalize_unicode(text)
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = text.lower()
    text = re.sub(r"[^a-z\s']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def detect_lang_safe(text: str) -> str:
    """Detect language; safely return 'unk' if detection fails."""
    try:
        return detect(text)
    except Exception:
        return "unk"


def lemmatize_en(text: str) -> str:
    """Lemmatize English text, remove stopwords and short tokens."""
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_stop and len(token) > 2]
    return " ".join(lemmas)


In [12]:
INPUT_FILE = "youtube_comments.csv"
OUTPUT_FILE = "youtube_comments_english.csv"

df = pd.read_csv(INPUT_FILE, dtype=str).fillna("")
print(f"Loaded {len(df)} comments from {INPUT_FILE}")

Loaded 77912 comments from youtube_comments.csv


In [13]:
df["comment"] = df["comment"].astype(str).apply(normalize_unicode)
df["emojis"] = df["comment"].apply(extract_emojis)
df["has_emoji"] = df["emojis"].apply(lambda x: len(x) > 0)
df["comment_noemoji"] = df["comment"].apply(remove_emojis)
df["clean_comment"] = df["comment_noemoji"].apply(clean_text)

In [14]:
df["lang"] = df["clean_comment"].progress_apply(detect_lang_safe)
df_en = df[df["lang"] == "en"].copy()
print(f"Kept {len(df_en)} English comments out of {len(df)} total")

100%|██████████| 77912/77912 [02:38<00:00, 491.08it/s]


Kept 68557 English comments out of 77912 total


In [15]:
df_en["lemma_comment"] = df_en["clean_comment"].progress_apply(lemmatize_en)

100%|██████████| 68557/68557 [02:56<00:00, 388.46it/s]


In [16]:
df_en.to_csv(
    OUTPUT_FILE,
    index=False,
    encoding="utf-8",
    lineterminator="\n"
)

print(f"Saved clean English-only dataset to {OUTPUT_FILE}")
df_en.head(5)

Saved clean English-only dataset to youtube_comments_english.csv


Unnamed: 0,id,video_title,comment,url,emojis,has_emoji,comment_noemoji,clean_comment,lang,lemma_comment
0,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,When it's TACOS turn NOBODY WILL BE ON HIS SID...,https://www.youtube.com/watch?v=ttjz6pax5A8,,False,When it's TACOS turn NOBODY WILL BE ON HIS SID...,when it's tacos turn nobody will be on his sid...,en,taco turn foxfakebabble come go commit crime f...
1,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,Why is Rachel on tv with that voice. Who does ...,https://www.youtube.com/watch?v=ttjz6pax5A8,,False,Why is Rachel on tv with that voice. Who does ...,why is rachel on tv with that voice who does s...,en,rachel voice know
3,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,"Ukraine , DO NOT TRUST TRUMP He is a Putin pup...",https://www.youtube.com/watch?v=ttjz6pax5A8,,False,"Ukraine , DO NOT TRUST TRUMP He is a Putin pup...",ukraine do not trust trump he is a putin puppet,en,ukraine trust trump putin puppet
4,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,Ukraine already has Storm Shadow although has ...,https://www.youtube.com/watch?v=ttjz6pax5A8,,False,Ukraine already has Storm Shadow although has ...,ukraine already has storm shadow although has ...,en,ukraine storm shadow range tomehawk soon chang...
5,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,"Putin still playing trump, the fact that Trump...",https://www.youtube.com/watch?v=ttjz6pax5A8,,False,"Putin still playing trump, the fact that Trump...",putin still playing trump the fact that trump ...,en,putin play trump fact trump soft putin crazy


In [18]:
def fix_newlines(text: str) -> str:
    """Replace linebreaks & separators with spaces (single-line CSV fields)."""
    return str(text).replace("\r", " ").replace("\n", " ").replace("\u2028", " ").replace("\u2029", " ")

for col in ["comment", "lemma_comment"]:
    if col in df_en.columns:
        df_en[col] = df_en[col].apply(fix_newlines)

In [19]:
final_cols = ["id", "video_title", "url", "comment", "lemma_comment"]
df_final = df_en[final_cols].copy()

In [20]:
df_final.to_csv(
    OUTPUT_FILE,
    index=False,
    encoding="utf-8",
    lineterminator="\n",
    quoting=csv.QUOTE_ALL,
    escapechar="\\"
)

print(f"Saved final cleaned dataset: {OUTPUT_FILE}")
print(f"Rows: {len(df_final)} | Columns: {list(df_final.columns)}")

Saved final cleaned dataset: youtube_comments_english.csv
Rows: 68557 | Columns: ['id', 'video_title', 'url', 'comment', 'lemma_comment']


In [21]:
df_final.head(5)

Unnamed: 0,id,video_title,url,comment,lemma_comment
0,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,https://www.youtube.com/watch?v=ttjz6pax5A8,When it's TACOS turn NOBODY WILL BE ON HIS SID...,taco turn foxfakebabble come go commit crime f...
1,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,https://www.youtube.com/watch?v=ttjz6pax5A8,Why is Rachel on tv with that voice. Who does ...,rachel voice know
3,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,https://www.youtube.com/watch?v=ttjz6pax5A8,"Ukraine , DO NOT TRUST TRUMP He is a Putin pup...",ukraine trust trump putin puppet
4,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,https://www.youtube.com/watch?v=ttjz6pax5A8,Ukraine already has Storm Shadow although has ...,ukraine storm shadow range tomehawk soon chang...
5,ttjz6pax5A8,Trump reportedly rejected Zelenskyy's request ...,https://www.youtube.com/watch?v=ttjz6pax5A8,"Putin still playing trump, the fact that Trump...",putin play trump fact trump soft putin crazy
