In [None]:
# Install the third-party packages this notebook relies on, quietly (-q).
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# newer than the ones this notebook was written against — consider pinning.
%pip install -q \
    torch torchvision torchaudio \
    transformers \
    pandas numpy \
    seaborn matplotlib \
    wordcloud tqdm \
    sentencepiece \
    scikit-learn

In [None]:
import torch
from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Device index handed to the Hugging Face pipelines below:
# 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

In [None]:
# Apply the "seaborn-v0_8-whitegrid" matplotlib style sheet to the figures
# created later in the notebook.
plt.style.use("seaborn-v0_8-whitegrid")

In [None]:
# Path of the input CSV read by pd.read_csv below (relative to the working directory).
DATA_PATH = "youtube_comments_english.csv"

In [None]:
# Read the comment table from DATA_PATH, then report its size and schema
# and preview the first three rows.
df = pd.read_csv(DATA_PATH)

column_names = list(df.columns)
print(f"Loaded {len(df):,} comments")
print("Columns:", column_names)
display(df.head(n=3))

In [None]:
# Build the sentiment-analysis pipeline on the selected device.
sentiment_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_model = pipeline(
    task="sentiment-analysis",
    model=sentiment_checkpoint,
    device=device,
)

In [None]:
# Number of comments handed to the sentiment pipeline per call.
batch_size = 64
# Holds one predicted sentiment label per comment once the sentiment loop has run.
sentiments = []

In [None]:
# Classify every comment with the sentiment pipeline and store the predicted
# label in df["sentiment_roberta"].
print("\nRunning sentiment analysis...")
# NOTE(review): astype(str) turns missing values into the literal string "nan",
# which is then classified like any other comment — confirm this is intended.
texts = df["lemma_comment"].astype(str).tolist()

# Start from an empty list on every run: the accumulator used to be created
# only in an earlier cell, so re-executing this cell appended a second set of
# labels and the column assignment below failed with a length mismatch.
sentiments = []

for i in tqdm(range(0, len(texts), batch_size), desc="Sentiment Analysis", ncols=100):
    batch = texts[i:i + batch_size]
    # Truncate over-long comments. Without this, a comment that tokenizes to
    # more than the model's maximum sequence length (512 tokens for
    # RoBERTa-base) makes the forward pass fail and aborts the whole run.
    results = sentiment_model(batch, truncation=True, max_length=512)
    sentiments.extend(r["label"] for r in results)

# One label per input comment is required for the column assignment.
assert len(sentiments) == len(texts), "sentiment count does not match comment count"
df["sentiment_roberta"] = sentiments

In [None]:
# Candidate labels offered to the zero-shot stance classifier (passed as
# candidate_labels in the stance loop).
labels = ["pro-Ukraine", "pro-Russia", "neutral"]

In [None]:
# Build the zero-shot classification pipeline used for stance detection,
# on the same device as the sentiment model.
stance_checkpoint = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
stance_model = pipeline(
    task="zero-shot-classification",
    model=stance_checkpoint,
    device=device,
)

In [None]:
# Assign each comment one of the candidate stance labels with the zero-shot
# pipeline and store the result in df["stance"].
# Note: this rebinds the notebook-wide batch_size (previously 64) to 16.
batch_size = 16
stances = []

print("\nRunning stance classification...")
batch_starts = range(0, len(texts), batch_size)
for start in tqdm(batch_starts, desc="Stance Analysis", ncols=100):
    chunk = texts[start:start + batch_size]
    predictions = stance_model(chunk, candidate_labels=labels, multi_label=False, truncation=True)
    for prediction in predictions:
        # Keep the first entry of "labels" — presumably the highest-scoring
        # candidate; confirm against the pipeline's output ordering.
        stances.append(prediction["labels"][0])

df["stance"] = stances

In [None]:
# Persist the annotated table. The output filename is defined once so the
# file written and the path reported in the message cannot drift apart.
OUTPUT_PATH = "youtube_comments_with_sentiment_stance.csv"
df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8")
print(f"\nAnalysis complete! Saved to {OUTPUT_PATH}")

In [None]:
# Preview the first 10 rows: the comment text next to both predicted labels.
df[["lemma_comment", "sentiment_roberta", "stance"]].head(10)

In [None]:
# Bar chart of how many comments received each sentiment label.
sentiment_order = ["negative", "neutral", "positive"]

fig, ax = plt.subplots(figsize=(8, 4))
# seaborn 0.13 deprecated passing `palette` without `hue` (FutureWarning,
# removal announced for 0.14). Mapping hue to the x variable keeps one colour
# per bar; dodge=False keeps the bars at full width on older seaborn releases.
sns.countplot(
    data=df,
    x="sentiment_roberta",
    hue="sentiment_roberta",
    order=sentiment_order,
    hue_order=sentiment_order,
    palette="coolwarm",
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Comments")
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of how many comments received each stance label, most frequent first.
order = df["stance"].value_counts().index

fig, ax = plt.subplots(figsize=(8, 4))
# seaborn 0.13 deprecated passing `palette` without `hue` (FutureWarning,
# removal announced for 0.14). Mapping hue to the x variable keeps one colour
# per bar; dodge=False keeps the bars at full width on older seaborn releases.
sns.countplot(
    data=df,
    x="stance",
    hue="stance",
    order=order,
    hue_order=order,
    palette="Set2",
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Stance Distribution")
ax.set_xlabel("Stance Label")
ax.set_ylabel("Number of Comments")
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of the sentiment mix inside each stance group. normalize="index"
# makes every row (stance) sum to 1, so cells are within-stance proportions.
ct = pd.crosstab(
    index=df["stance"],
    columns=df["sentiment_roberta"],
    normalize="index",
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(ct, annot=True, cmap="YlGnBu", fmt=".2f", ax=ax)
ax.set_title("Sentiment Composition within Each Stance")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Stance")
fig.tight_layout()
plt.show()

In [None]:
# One word cloud per sentiment class, each drawn with its own colormap.
sentiment_colors = {
    "positive": "Greens",
    "neutral": "Greys",
    "negative": "Reds"
}

for sentiment, cmap in sentiment_colors.items():
    # Concatenate all non-missing comments carrying this sentiment label.
    is_match = df["sentiment_roberta"] == sentiment
    matching_comments = df.loc[is_match, "lemma_comment"].dropna().astype(str)
    text = " ".join(matching_comments)

    # Nothing to draw when no comment (or only whitespace) has this label.
    if not text.strip():
        continue

    wc = WordCloud(
        width=800,
        height=400,
        background_color="white",
        colormap=cmap,
        max_words=150,
        collocations=False,
    )
    wc.generate(text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(f"Most Frequent Words in {sentiment.capitalize()} Comments")
    fig.tight_layout()
    plt.show()