In [14]:
# Install the notebook's third-party dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, so a re-run may pull different releases - consider pinning.
# NOTE(review): yt-dlp and nltk are not imported in the cells visible here - confirm they are still needed.
%pip install yt-dlp pandas tqdm nltk spacy emoji

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m6.4 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Fetch the small English spaCy pipeline that spacy.load("en_core_web_sm") needs later on.
# NOTE(review): `!python` is resolved by the shell and may not be the kernel's interpreter - confirm.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.8 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [15]:
import pandas as pd
import re
import spacy
from tqdm import tqdm
import emoji

In [20]:
# Raw comment dump that pd.read_csv loads at the start of the pipeline.
INPUT_FILE = "youtube_comments.csv"
# Destination CSV for the frame once the preprocessing columns have been added.
OUTPUT_FILE = "youtube_comments_preprocessed.csv"

In [21]:
# Load the raw comment export; every later cell adds columns to this frame.
df = pd.read_csv(INPUT_FILE)
# Fail fast with a readable message rather than a bare KeyError in a later cell:
# all downstream steps start from the "comment" column.
if "comment" not in df.columns:
    raise KeyError(
        f"{INPUT_FILE!r} has no 'comment' column (found: {list(df.columns)})"
    )
print(f"Loaded {len(df)} comments.")

Loaded 77912 comments.


In [22]:
# Missing comments come back from read_csv as NaN; blank them first so that
# astype(str) does not turn them into the literal text "nan", which would then
# be cleaned and lemmatized as if it were a real comment.
df["comment"] = df["comment"].fillna("").astype(str)

In [23]:
def extract_emojis(text):
    """Return every emoji found in ``text``, concatenated in order of appearance.

    Matching is done on whole emoji sequences through ``emoji.emoji_list``
    rather than on single code points. Multi-code-point emojis (flags,
    keycaps, ZWJ sequences, skin-tone variants) therefore come back intact,
    and the result agrees with what ``emoji.replace_emoji`` strips in
    ``remove_emojis``.

    Args:
        text: The comment text to scan.

    Returns:
        A string made of the matched emojis, or ``""`` when there are none.
    """
    # emoji_list yields one dict per match; the "emoji" key holds the full sequence.
    return "".join(match["emoji"] for match in emoji.emoji_list(text))

def remove_emojis(text):
    """Return ``text`` with every emoji occurrence replaced by an empty string."""
    stripped = emoji.replace_emoji(text, replace='')
    return stripped

In [24]:
print("Extracting emojis...")
# Registers the progress_apply accessor used by this and the later cells.
tqdm.pandas()
# Emoji-only view of each comment.
df["emojis"] = df["comment"].progress_apply(extract_emojis)
# Boolean flag: True when at least one emoji was extracted.
df["has_emoji"] = df["emojis"].progress_apply(lambda found: len(found) != 0)
# Comment text with the emojis stripped out; input to the cleaning step.
df["comment_noemoji"] = df["comment"].progress_apply(remove_emojis)

Extracting emojis...


100%|██████████| 77912/77912 [00:00<00:00, 155971.03it/s]
100%|██████████| 77912/77912 [00:00<00:00, 1461321.74it/s]
100%|██████████| 77912/77912 [00:10<00:00, 7140.79it/s] 


In [25]:
def clean_comment(text):
    """Normalize a comment to lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. Lowercase the text.
      2. Map typographic apostrophes to a plain ``'`` so contractions such as
         "don’t" stay in one piece ("don't") instead of being split.
      3. Remove URLs (``http://``, ``https://`` and ``www.`` forms).
      4. Replace every character that is not ``a-z``, whitespace or ``'`` with
         a space. This also drops digits and non-ASCII letters.
      5. Collapse runs of whitespace and strip both ends.

    Args:
        text: The comment text (already a ``str``).

    Returns:
        The cleaned string; ``""`` if nothing survives.
    """
    text = text.lower()
    # Curly quotes, the modifier apostrophe, backtick and acute accent are
    # common stand-ins for "'" and would otherwise be blanked in step 4.
    text = re.sub("[\u2018\u2019\u02bc`\u00b4]", "'", text)
    # Require "://" after the scheme and a word boundary plus a dot for "www."
    # so that ordinary words ("awwww", "httpd") are not mistaken for links.
    text = re.sub(r"https?://\S+|\bwww\.\S+", "", text)
    text = re.sub(r"[^a-z\s']", " ", text)          # keep only letters and apostrophes
    text = re.sub(r"\s+", " ", text).strip()        # collapse multiple spaces
    return text

In [26]:
print("Cleaning text...")
# Clean the emoji-free text; the result feeds the lemmatization step.
cleaned_series = df["comment_noemoji"].progress_apply(clean_comment)
df["clean_comment"] = cleaned_series

Cleaning text...


100%|██████████| 77912/77912 [00:00<00:00, 82493.71it/s]


In [27]:
print("Loading spaCy model (en_core_web_sm)...")


def _load_english_pipeline():
    """Load en_core_web_sm with the parser and NER components disabled."""
    return spacy.load("en_core_web_sm", disable=["parser", "ner"])


try:
    nlp = _load_english_pipeline()
except OSError:
    # Fallback path: fetch the model package, then retry the load once.
    from spacy.cli import download

    download("en_core_web_sm")
    nlp = _load_english_pipeline()

Loading spaCy model (en_core_web_sm)...


In [28]:

# Negation words that must survive filtering. spaCy's English stop-word list
# flags words such as "not" and "no", and "no" also fails the length check, so
# without this exemption the negations the cleaning step preserves are lost.
NEGATION_WORDS = frozenset({"no", "not", "n't", "never"})


def lemmatize_comment(text, keep_words=NEGATION_WORDS):
    """Lemmatize a cleaned comment, dropping stop words and very short tokens.

    A token is kept when its lowercased text or lemma is in ``keep_words``;
    otherwise it is kept only if it is not a stop word and is longer than two
    characters. Kept tokens are emitted as their lemma.

    Args:
        text: Cleaned comment text to run through the module-level ``nlp``
            pipeline.
        keep_words: Lowercase words exempt from the stop-word and length
            filters. Defaults to common negations; pass an empty collection
            to get plain stop-word filtering.

    Returns:
        The surviving lemmas joined by single spaces (``""`` if none).
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        # Check both surface form and lemma so a contraction piece such as
        # "n't" is retained whichever way the pipeline lemmatizes it.
        is_protected = token.lower_ in keep_words or token.lemma_.lower() in keep_words
        if is_protected or (not token.is_stop and len(token) > 2):
            lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [29]:
print("Lemmatizing comments (this may take a few minutes)...")
# One pipeline call per row; this is the slow step of the notebook.
lemma_series = df["clean_comment"].progress_apply(lemmatize_comment)
df["lemma_comment"] = lemma_series

Lemmatizing comments (this may take a few minutes)...


100%|██████████| 77912/77912 [03:24<00:00, 380.70it/s]


In [32]:
# Persist the frame with all added columns; the index is not written.
df.to_csv(
    OUTPUT_FILE,
    index=False,
    encoding="utf-8",
    lineterminator="\n",
)
print(f"Preprocessing complete - saved to {OUTPUT_FILE}")
# Quick look at the first three rows as a sanity check.
print(df.iloc[:3])

Preprocessing complete - saved to youtube_comments_preprocessed.csv
            id                                        video_title  \
0  ttjz6pax5A8  Trump reportedly rejected Zelenskyy's request ...   
1  ttjz6pax5A8  Trump reportedly rejected Zelenskyy's request ...   
2  ttjz6pax5A8  Trump reportedly rejected Zelenskyy's request ...   

                                             comment  \
0  When it's TACOS turn NOBODY WILL BE ON HIS SID...   
1  Why is Rachel on tv with that voice. Who does ...   
2       Трамп навсегда останется марионеткой Путина!   

                                           url emojis  has_emoji  \
0  https://www.youtube.com/watch?v=ttjz6pax5A8             False   
1  https://www.youtube.com/watch?v=ttjz6pax5A8             False   
2  https://www.youtube.com/watch?v=ttjz6pax5A8             False   

                                     comment_noemoji  \
0  When it's TACOS turn NOBODY WILL BE ON HIS SID...   
1  Why is Rachel on tv with that voice. Who d