In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os


In [4]:
input_csv = "sentiment_analysis.csv"  # path to results CSV
# === LOAD DATA ===
# Read the results CSV into `df`; the preview below shows one row per comment,
# carrying the video metadata plus the predicted `sentiment` and `confidence`.
df = pd.read_csv(input_csv)
# Last expression of the cell: display the first five rows as a sanity check.
df.head()

Unnamed: 0,video_link,gender,news,video_id,video_name,channel_name,subscribers,views,comments,likes,comment_text,comment_author,comment_likes,comment_date,sentiment,confidence
0,https://www.youtube.com/watch?v=7LVSrTZDopM,m,n,7LVSrTZDopM,"Climate ""Science"" | Dr. Richard Lindzen | EP 320",Jordan B Peterson,8830000,1432756,8332,39968,Note the Climate Change propaganda that YouTub...,@JordanBPeterson,5648,2023-01-08T09:52:52Z,negative,0.835941
1,https://www.youtube.com/watch?v=7LVSrTZDopM,m,n,7LVSrTZDopM,"Climate ""Science"" | Dr. Richard Lindzen | EP 320",Jordan B Peterson,8830000,1432756,8332,39968,"Thank you Dr Peterson for inviting Dr Lindzen,...",@DanielleDurand-q6l,1,2025-10-17T11:56:05Z,positive,0.931553
2,https://www.youtube.com/watch?v=7LVSrTZDopM,m,n,7LVSrTZDopM,"Climate ""Science"" | Dr. Richard Lindzen | EP 320",Jordan B Peterson,8830000,1432756,8332,39968,Dr Lindzin is stepping straight into the mista...,@riaanswiegers6628,0,2025-10-05T05:32:16Z,negative,0.68161
3,https://www.youtube.com/watch?v=7LVSrTZDopM,m,n,7LVSrTZDopM,"Climate ""Science"" | Dr. Richard Lindzen | EP 320",Jordan B Peterson,8830000,1432756,8332,39968,"Thank you! I'm considered a ""science denier"" ...",@flameofthewest6196,3,2025-09-22T19:57:39Z,neutral,0.552359
4,https://www.youtube.com/watch?v=7LVSrTZDopM,m,n,7LVSrTZDopM,"Climate ""Science"" | Dr. Richard Lindzen | EP 320",Jordan B Peterson,8830000,1432756,8332,39968,Fairies will always make more money than scien...,@dickbrmly,1,2025-09-17T12:39:56Z,negative,0.484183


In [13]:
# === MAP SENTIMENT TO NUMERIC VALUES ===
sentiment_map = {
    "positive": 1,
    "neutral": 0,
    "negative": -1,
}
# Labels that are not keys of sentiment_map (including missing values) map to
# NaN and are therefore skipped by the per-video mean computed below.
df["sentiment_score"] = df["sentiment"].map(sentiment_map)

# === COMPUTE AVERAGE SENTIMENT PER VIDEO ===
# Named aggregation yields the final column names directly, so no in-place
# rename is needed afterwards.
# number_of_comments is the number of comment rows per video ("size").
# Counting non-null values of the `comments` metadata column, as before, would
# under-report any video whose metadata value is missing.
avg_sentiment = (
    df.groupby("video_id")
    .agg(
        sentiment_score=("sentiment_score", "mean"),
        gender=("gender", "first"),
        news=("news", "first"),
        channel_name=("channel_name", "first"),
        video_link=("video_link", "first"),
        number_of_comments=("sentiment", "size"),
    )
    .reset_index()
)

avg_sentiment.head()

Unnamed: 0,video_id,sentiment_score,gender,news,channel_name,video_link,number_of_comments
0,0FmHqjx6ESQ,-0.516,m,n,JRE Clips,https://www.youtube.com/watch?v=0FmHqjx6ESQ,1000
1,15-QlH_VDaM,-0.340741,f,n,Bridget Phetasy,https://www.youtube.com/watch?v=15-QlH_VDaM,270
2,1LAyILmn2Hk,-0.162088,f,n,GEO GIRL,https://www.youtube.com/watch?v=1LAyILmn2Hk,364
3,1dRgCsZ1q7g,-0.166,m,n,vlogbrothers,https://www.youtube.com/watch?v=1dRgCsZ1q7g,1000
4,2W3pxOYrxVw,-0.048295,m,n,Jordan B Peterson,https://www.youtube.com/watch?v=2W3pxOYrxVw,704


In [23]:
# avg_sentiment holds one row per video, so each tally below counts videos.
female_count = avg_sentiment['gender'].eq('f').sum()
print(f"Total rows with gender = 'f': {female_count}")

news_count = avg_sentiment['news'].eq('y').sum()
print(f"Total rows with news = 'y': {news_count}")

# Number of rows with a non-null video_id.
print(f"Total rows: {avg_sentiment['video_id'].notna().sum()}")

Total rows with gender = 'f': 24
Total rows with news = 'y': 10
Total rows: 88


In [None]:
# Rank videos from the most positive to the most negative average sentiment.
# The frame is sorted once; every slice below reuses this ordering.
avg_sentiment = avg_sentiment.sort_values("sentiment_score", ascending=False)

# Derive the quartile size from the data instead of hard-coding 22/44/66,
# which is only correct for exactly 88 videos. Any remainder rows (and rows
# with a NaN score, which sort last) end up in q4.
quartile_size = len(avg_sentiment) // 4

# Cumulative views: top 1, top 2 and top 3 quartiles.
top_positive = avg_sentiment.iloc[:quartile_size]
topq2_positive = avg_sentiment.iloc[: 2 * quartile_size]
topq3_positive = avg_sentiment.iloc[: 3 * quartile_size]

# Disjoint quartiles: q2 and q3 are the middle slices, q4 is the remainder.
q2 = avg_sentiment.iloc[quartile_size : 2 * quartile_size]
q3 = avg_sentiment.iloc[2 * quartile_size : 3 * quartile_size]
q4 = avg_sentiment.iloc[3 * quartile_size :]

# Number of videos with gender 'f' in the lowest-sentiment quartile.
female_count = (q4['gender'] == 'f').sum()

female_count

np.int64(2)

## Per-video sentiment plots

In [None]:
def individual_video_plots(df, output_dir):
    """Save one sentiment-distribution bar chart per video.

    Comments are counted per ``video_id`` and sentiment label, and each video
    is drawn as a three-bar chart (positive / neutral / negative). When the
    video has a gender value, it is shown in the top-right corner of the axes.
    Every chart is written to ``output_dir`` as a PNG named after the video id,
    with each non-alphanumeric character replaced by ``_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``video_id``, ``comments``, ``sentiment``,
        ``confidence`` and ``gender``. ``comments`` and ``confidence`` are
        only validated; they are not otherwise used here.
    output_dir : str
        Directory that receives the PNG files; created if it does not exist.

    Raises
    ------
    ValueError
        If any of the required columns is missing from ``df``.
    """
    # Basic sanity check (require gender column)
    expected_cols = {"video_id", "comments", "sentiment", "confidence", "gender"}
    if not expected_cols.issubset(df.columns):
        raise ValueError(f"CSV must contain columns: {expected_cols}")

    # === PREP OUTPUT DIR ===
    os.makedirs(output_dir, exist_ok=True)

    # One gender value per video_id. Videos without any gender value are
    # dropped here so the lookup below falls back to "" for them. Casting to
    # str before handling missing values would turn NaN into the literal
    # "nan", which would then be printed on the chart as "gender: nan".
    genders = df.groupby("video_id")["gender"].first().dropna().astype(str)

    # === GROUP AND PLOT ===
    # Count each sentiment per video. reindex guarantees that all three
    # sentiment columns exist (filled with 0) and fixes their order so the
    # bars always line up with the colour list used below.
    sentiment_order = ["positive", "neutral", "negative"]
    bar_colors = ["green", "gray", "red"]
    grouped = (
        df.groupby(["video_id", "sentiment"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=sentiment_order, fill_value=0)
    )

    # Loop through each video and plot
    chart_count = 0
    for video_id, row in grouped.iterrows():
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            ax.bar(sentiment_order, row.to_numpy(), color=bar_colors)
            ax.set_title(f"Sentiment Distribution for Video:\n{video_id}")
            ax.set_xlabel("Sentiment")
            ax.set_ylabel("Number of Comments")
            fig.tight_layout()

            # annotate chart with the video's gender (top-right corner)
            gender_label = genders.get(video_id, "")
            if gender_label:
                ax.text(
                    0.98,
                    0.98,
                    f"gender: {gender_label}",
                    transform=ax.transAxes,
                    ha="right",
                    va="top",
                    fontsize=9,
                    bbox=dict(boxstyle="round,pad=0.2", fc="white", alpha=0.7, edgecolor="none"),
                )

            # Replace problematic characters in filenames; str() keeps this
            # working when video_id is not a string (e.g. a numeric id).
            safe_name = "".join(c if c.isalnum() else "_" for c in str(video_id))
            fig.savefig(os.path.join(output_dir, f"{safe_name}.png"), dpi=150)
        finally:
            # Always release the figure, even if drawing or saving failed, so
            # figures do not pile up in memory across the loop.
            plt.close(fig)

        chart_count += 1
        print(f"📊 Chart {chart_count} saved: {safe_name}.png")

    print(f"✅ Total charts saved: {chart_count} to {output_dir}")

# Render and save one chart per video into the 'video_sentiment_charts' directory.
individual_video_plots(df, 'video_sentiment_charts')

## Summary plot: average sentiment per video

In [56]:
def summary_plots(avg_sentiment, output_dir):
    """Save a single bar chart of the average sentiment score of every video.

    One bar is drawn per row of ``avg_sentiment``, in row order, coloured by
    gender and annotated with the gender value and the number of comments.
    The chart is written to ``<output_dir>/average_sentiment_per_video.png``.

    Parameters
    ----------
    avg_sentiment : pandas.DataFrame
        Must contain the columns ``video_id``, ``sentiment_score``, ``gender``
        and ``number_of_comments``.
    output_dir : str
        Directory that receives the PNG file; created if it does not exist.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``avg_sentiment``.
    """
    os.makedirs(output_dir, exist_ok=True)

    x = avg_sentiment["video_id"].astype(str)
    heights = avg_sentiment["sentiment_score"]

    # Map gender to colors; anything that is not 'f' or 'm' gets the fallback.
    gender_color_map = {"f": "pink", "m": "lightblue"}
    fallback_color = "gray"
    colors = [
        gender_color_map.get(str(g).strip(), fallback_color)
        for g in avg_sentiment["gender"]
    ]

    # Per-bar label parts. Missing gender becomes "" here; casting the column
    # to str first would turn NaN into the literal "nan" on the chart.
    genders = ["" if pd.isna(g) else str(g) for g in avg_sentiment["gender"]]
    comments = avg_sentiment["number_of_comments"].astype(int)

    fig, ax = plt.subplots(figsize=(12, 5))
    try:
        bars = ax.bar(x, heights, color=colors)

        # annotate each bar with the corresponding gender value and number of comments
        for bar, gender, comment_count in zip(bars, genders, comments):
            height = bar.get_height()
            # Place the label just outside the bar tip: above positive bars,
            # below negative ones (offsets are in points).
            if height >= 0:
                offset, va = 4, "bottom"
            else:
                offset, va = -6, "top"

            ax.annotate(
                f"{gender}\n({comment_count})",
                (bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, offset),
                textcoords="offset points",
                ha="center",
                va=va,
                fontsize=8,
            )

        ax.set_title("Average Sentiment per Video")
        ax.set_xlabel("Video ID")
        ax.set_ylabel("Average Sentiment Score (-1 = Neg, 0 = Neutral, 1 = Pos)")
        # Rotate tick labels on this axes explicitly instead of relying on
        # pyplot's "current axes" state.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

        # Add legend; colours are taken from the same map used for the bars
        # so the legend cannot drift out of sync with them.
        from matplotlib.patches import Patch
        legend_elements = [
            Patch(facecolor=gender_color_map["f"], label="Female (f)"),
            Patch(facecolor=gender_color_map["m"], label="Male (m)"),
            Patch(facecolor=fallback_color, label="Other/Unknown"),
        ]
        ax.legend(handles=legend_elements, loc="upper right")

        fig.tight_layout()

        output_path = os.path.join(output_dir, "average_sentiment_per_video.png")
        fig.savefig(output_path, dpi=150)
    finally:
        # Release the figure even if drawing or saving raised.
        plt.close(fig)

    print(f"✅ Saved average sentiment chart to: {output_path}")

# Render and save the per-video average sentiment chart into the 'summary_plots' directory.
summary_plots(avg_sentiment, 'summary_plots')

✅ Saved average sentiment chart to: summary_plots\average_sentiment_per_video.png
