# Imports

In [None]:
import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load Dataset

In [None]:
# Location of the CSV, relative to the directory the notebook runs from.
DATASET_PATH = "../dataset/TikTokEuropeanElections_Abortion_War.csv"

# Load the whole dataset into a single DataFrame used by every later cell.
df = pd.read_csv(DATASET_PATH)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

# Sentiment Score By Country

## General Study

- **Count the amount of entries per country**

In [None]:
# Number of rows for each distinct value of 'countries', most frequent first
# (rows whose country is missing are not counted by value_counts).
country_counts = df['countries'].value_counts()
print(country_counts)

- **Bootstrapped mean for compound sentiment score by country**

In [None]:
# Parameters
n_iterations = 1000   # bootstrap resamples drawn per country
random_seed = 42      # fixed seed so the intervals are identical on every run
rng = np.random.default_rng(random_seed)
boot_data = []

# Percentile bootstrap of the mean compound score, one result row per country.
for country, group in df.groupby('countries'):
    scores = group['sentiment_score_compound'].dropna().to_numpy()
    if scores.size == 0:
        # Nothing to resample: skipping avoids an all-NaN row and NumPy warnings.
        continue

    # Each resample has the same size as the observed sample and is drawn with
    # replacement; only its mean is kept, so memory stays O(n_iterations).
    means = np.array([
        rng.choice(scores, size=scores.size, replace=True).mean()
        for _ in range(n_iterations)
    ])

    # 2.5th / 97.5th percentiles of the resampled means give the 95% interval.
    ci_lower, ci_upper = np.percentile(means, [2.5, 97.5])
    boot_data.append({
        'country': country,
        'mean': np.mean(means),
        'ci_lower': ci_lower,
        'ci_upper': ci_upper
    })

# Convert to DataFrame (explicit columns keep the sort valid even if no country
# had usable scores)
boot_df = pd.DataFrame(boot_data, columns=['country', 'mean', 'ci_lower', 'ci_upper'])
boot_df.sort_values(by='mean', ascending=False, inplace=True)

# Plot the 15 countries with the highest bootstrapped mean
plot = boot_df.head(15)
# Matplotlib expects error bars as distances below/above the bar height.
error_bars = [plot['mean'] - plot['ci_lower'], plot['ci_upper'] - plot['mean']]
plt.figure(figsize=(12, 6))
plt.bar(plot['country'], plot['mean'], yerr=error_bars,
        capsize=5, color='skyblue', edgecolor='black')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Bootstrapped Mean Sentiment Score')
plt.title('Mean Sentiment Score with 95% CI')
plt.tight_layout()
plt.grid(True)
plt.show()


- **Compound sentiment score distribution by country**

In [None]:
# One box per country showing the spread of the compound sentiment score.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='countries', y='sentiment_score_compound', data=df, ax=ax)
# Tilt the country labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title('Distribution of sentiment_score_compound by country')
plt.show()

- **Amount of Pos and Neg entries given the Binary Sentiment Score classification**

In [None]:
# Bar per (country, binary sentiment label) with the number of matching rows.
fig, ax = plt.subplots(figsize=(14, 8))
sns.countplot(x='countries', hue='sentiment_score_binary', data=df, ax=ax)
# Tilt the country labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title('Distribution of Binary Sentiment Score per Country')
plt.show()

- **Ratio between Pos and Neg entries per country**

In [None]:
# Rows = countries, columns = binary sentiment labels, cells = number of entries
# (country/label combinations that never occur are filled with 0).
counts = (
    df.groupby(['countries', 'sentiment_score_binary'])
    .size()
    .unstack(fill_value=0)
)

# Fall back to 0 positives / 1 negative only when the label column is absent
# from the table altogether.
positives = counts.get(1, 0)
negatives = counts.get(-1, 1)
counts['pos_neg_ratio'] = positives / negatives

print(counts[['pos_neg_ratio']])


- **Histogram of the sentiment score compound to see the entries intensity**

In [None]:
# One histogram panel per country, four panels per row, with shared axes so
# the panels are directly comparable.
grid = sns.displot(
    data=df,
    x="sentiment_score_compound",
    col="countries",
    col_wrap=4,
    bins=20,
    kde=False,
    facet_kws=dict(sharex=True, sharey=True),
)
# y > 1 places the title above the top row of panels.
grid.fig.suptitle('Histogram of Sentiment Compound Score per Country', y=1.02)
plt.show()

- **Conclusions of this part**

- **NL**:

    - Intensity of sentiment score: Bimodal distribution with an enormous peak at 1. Polarization towards positive content.
    - Positive/negative ratio: Extremely high (2.2), which is consistent with most entries being at 1.
    - Boxplot: Very high median (~0.85) with a wide range.
    - Polarization: High

- **FR**:

    - Intensity of sentiment score: Split entries; while the majority of values are positive, the emotions aren't extreme.
    - Positive/negative ratio: High value (1.7), but it reflects a greater variety of emotions than in NL.
    - Boxplot: High median (~0.5), since there is more positive content but it is not as extreme as in NL.
    - Polarization: Average

- **ES**:

    - Intensity of sentiment score: Very polarized results; while positive content is nearly double the negative, most of the entries are at the extremes.
    - Positive/negative ratio: High value (1.7), but it reflects a greater variety of extreme emotions than in NL.
    - Boxplot: High median (~0.5), since there is more extreme content, both negative and positive.
    - Polarization: High

- **PL**:

    - Intensity of sentiment score: Bimodal, but with more evenly split results.
    - Positive/negative ratio: Very low value (1.11), because most of the neutral content is slightly negative.
    - Boxplot: High median (~0.4) with a more symmetric range, reflecting the larger amount of neutral content.
    - Polarization: Low

- **DE**:

    - Intensity of sentiment score: High amount of negative content of all intensities, with positive content concentrated at the positive extreme.
    - Positive/negative ratio: Low value (1.14), since the positive content is almost exclusively extreme while there is an enormous amount of negative content of all intensities.
    - Boxplot: Comparatively low median (~0.2), reflecting the large amount of negative content and very dispersed entries.
    - Polarization: Average

## Study by topic

- **Histogram of the sentiment score compound per topic**

In [None]:
# Bar colour per topic; topics missing from this mapping fall back to
# seaborn's default colour (dict.get returns None).
topic_colors = {
    'abortion': 'purple',
    'war': 'green'
}

# Get countries and topics. Missing values are dropped: `== NaN` never matches,
# so a NaN entry would only produce an empty row/column of panels.
countries = df['countries'].dropna().unique()
topics = df['topic'].dropna().unique()

# Create subplots: one row per country, one column per topic
n_rows = len(countries)
n_cols = len(topics)

# The style must be set before the figure is created; applied afterwards it
# only affects later figures, so the first run would look different from
# re-runs of this cell.
sns.set(style="whitegrid")

# squeeze=False keeps `axes` two-dimensional even with a single country or a
# single topic, so axes[i, j] is always valid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 20),
                         sharex=True, sharey=True, squeeze=False)

# Iterate through countries and topics
for i, country in enumerate(countries):
    for j, topic in enumerate(topics):

        subset = df[(df['countries'] == country) & (df['topic'] == topic)]
        ax = axes[i, j]
        color = topic_colors.get(topic)
        sns.histplot(data=subset, x='sentiment_score_compound', bins=20, kde=False, ax=ax, color=color)

        ax.set_title(f"{country} - {topic}")
        ax.set_xlabel('Sentiment Score')
        ax.set_ylabel('Count')

plt.show()

- **% of extreme sentiment entries (compound score ≥ 0.75 or ≤ -0.75) per country and topic**

In [None]:
group_keys = ['countries', 'topic']
score = df['sentiment_score_compound']

# Rows per (country, topic) pair: the denominator of both percentages.
total_counts = df.groupby(group_keys).size().reset_index(name='total_count')

# Attach the number of strongly positive (>= 0.75) and strongly negative
# (<= -0.75) rows to every pair, positives first.
merged = total_counts
for label, is_extreme in (('pos', score >= 0.75), ('neg', score <= -0.75)):
    count_col = f'{label}_count'
    extreme_counts = (
        df[is_extreme].groupby(group_keys).size().reset_index(name=count_col)
    )
    merged = merged.merge(extreme_counts, on=group_keys, how='left')
    # Pairs without any extreme row are missing on the right side of the left
    # merge and come back as NaN; they really mean "zero rows".
    merged[count_col] = merged[count_col].fillna(0)

# Turn the counts into percentages of each pair's total.
for label in ('pos', 'neg'):
    merged[f'{label}_percent'] = 100 * merged[f'{label}_count'] / merged['total_count']

# Highest share of strongly positive entries first.
merged = merged.sort_values(by='pos_percent', ascending=False)

print(merged[['countries', 'topic', 'pos_percent', 'neg_percent']])


- **Conclusions of this part**

Most of the extremely negative content (-1) is about War. In all countries, the War topic is the one that polarizes the tone the most, while in most countries the Abortion topic shows mixed sentiments with a spike at a score of 1.

While the % of strongly positive entries (from 0.75 to 1) remains similar between War and Abortion-related content, Abortion-related content tends to have a lower share of extreme negative content than War (except in Spain and Germany, where it remains very similar; this could be due to stronger anti-abortion movements or to cultural differences in how people express themselves).

War evokes nearly the same amount of intense negative emotion across countries, making it the most polarizing topic.

- **WAR**
    - Strongly negative emotions
    - High polarization
- **ABORTION**
    - Mixed-Positive emotions
    - Moderate polarization

# Political-Wing Analysis (Presence and topic relation)

- **% of right and left parties mentioned by country**

In [None]:
# Average share of left- and right-wing mentions per country, with the country
# kept as an ordinary column rather than as the index.
ideology_columns = ['% left', '% right']
mean_ideology = df.groupby('countries', as_index=False)[ideology_columns].mean()
print(mean_ideology)

- **Relate the % of the wings mentioned with the mean sentiment score isolated by ideology**

In [None]:
# Drop missing values
clean_df = df[['sentiment_score_compound', '% left', '% right']].dropna()

# Compute correlations
left_corr = clean_df['sentiment_score_compound'].corr(clean_df['% left'])
right_corr = clean_df['sentiment_score_compound'].corr(clean_df['% right'])

print(f"Correlation between sentiment score and % left: {left_corr:.3f}")
print(f"Correlation between sentiment score and % right: {right_corr:.3f}")


fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
# Left subplot: % Left
sns.regplot(data=clean_df, x='% left', y='sentiment_score_compound', ax=axes[0], scatter_kws={'s': 50}, line_kws={'color': 'purple'})
axes[0].set_title('Sentiment vs % Left')
axes[0].set_xlabel('% Left')
axes[0].set_ylabel('Sentiment Score')
axes[0].grid(True)

# Right subplot: % Right
sns.regplot(data=clean_df, x='% right', y='sentiment_score_compound', ax=axes[1], scatter_kws={'s': 50}, line_kws={'color': 'green'})
axes[1].set_title('Sentiment vs % Right')
axes[1].set_xlabel('% Right')
axes[1].grid(True)

plt.tight_layout()
plt.show()

- **Sentiment Score for Abortion content grouped by political wings**

In [None]:
# Rows whose topic is "abortion", compared case-insensitively.
is_abortion = df['topic'].str.lower() == 'abortion'
abortion_df = df[is_abortion]
abortion_clean = abortion_df[['sentiment_score_compound', '% right', '% left']].dropna()
abortion_sentiment = abortion_clean['sentiment_score_compound']

# Correlation of the score with each wing share, abortion rows only.
corr_right = abortion_sentiment.corr(abortion_clean['% right'])
corr_left = abortion_sentiment.corr(abortion_clean['% left'])

print(f"Correlation between sentiment score and % right (abortion): {corr_right:.3f}")
print(f"Correlation between sentiment score and % left (abortion):  {corr_left:.3f}")

# Scatter + fitted regression line, one panel per wing (right wing first).
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
panels = (
    ('% right', '% Right', 'red'),
    ('% left', '% Left', 'purple'),
)
for ax, (column, label, line_color) in zip(axes, panels):
    sns.regplot(data=abortion_clean, x=column, y='sentiment_score_compound', ax=ax,
                scatter_kws={'s': 50}, line_kws={'color': line_color})
    ax.set_title(f'Sentiment Score vs {label} (Abortion)')
    ax.set_xlabel(label)
    ax.grid(True)

# Only the first panel gets the custom y label.
axes[0].set_ylabel('Sentiment Score')

plt.tight_layout()
plt.show()


- **Sentiment Score for War content grouped by political wings**

In [None]:
# Rows whose topic is "war", compared case-insensitively.
is_war = df['topic'].str.lower() == 'war'
war_df = df[is_war]
war_clean = war_df[['sentiment_score_compound', '% right', '% left']].dropna()
war_sentiment = war_clean['sentiment_score_compound']

# Correlation of the score with each wing share, war rows only.
corr_right = war_sentiment.corr(war_clean['% right'])
corr_left = war_sentiment.corr(war_clean['% left'])

print(f"Correlation between sentiment score and % right (war): {corr_right:.3f}")
print(f"Correlation between sentiment score and % left (war):  {corr_left:.3f}")

# Scatter + fitted regression line, one panel per wing (right wing first).
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
panels = (
    ('% right', '% Right', 'red'),
    ('% left', '% Left', 'purple'),
)
for ax, (column, label, line_color) in zip(axes, panels):
    sns.regplot(data=war_clean, x=column, y='sentiment_score_compound', ax=ax,
                scatter_kws={'s': 50}, line_kws={'color': line_color})
    ax.set_title(f'Sentiment Score vs {label} (War)')
    ax.set_xlabel(label)
    ax.grid(True)

# Only the first panel gets the custom y label.
axes[0].set_ylabel('Sentiment Score')

plt.tight_layout()
plt.show()

- **Most mentioned political entities by country**

In [None]:
def _parse_party_list(raw):
    """Return the list of parties stored in one ``parties_mentioned`` cell.

    The CSV stores each list as its Python literal text. ``ast.literal_eval``
    replaces the previous ``eval`` call: it only accepts literals, so a crafted
    cell cannot execute arbitrary code. Values that are not strings (the column
    was already parsed by an earlier run of this cell) are returned unchanged,
    which makes the cell safe to re-run.
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw


df['parties_mentioned'] = df['parties_mentioned'].apply(_parse_party_list)

# Five most-mentioned parties for every country, as (party, mentions) pairs.
top_parties_by_country = {}

# sort=False keeps the countries in order of first appearance in the data.
for country, country_df in df.groupby('countries', sort=False):
    # Count straight from the nested lists; sum(lists, []) copies the growing
    # accumulator on every addition and is quadratic in the number of rows.
    party_counts = Counter(
        party
        for parties in country_df['parties_mentioned']
        for party in parties
    )
    top_parties_by_country[country] = party_counts.most_common(5)

# Print results
for country, top_parties in top_parties_by_country.items():
    print(f"\nTop 5 parties in {country}:")
    for party, count in top_parties:
        print(f"  {party}: {count} mentions")

- **From which wing are the most mentioned parties?**

In [None]:
# Party / parliamentary-group names classified as left-leaning. Names are
# matched verbatim against the entries of `parties_mentioned`.
# The S&D group is listed under both its abbreviation and its full name so the
# two spellings always receive the same wing.
left = [
    "Die Linke", "IU", "Podemos", "PCE", "PCF", "LFI", "PRC", "SI", "Syriza",
    "BE", "Vänsterpartiet", "Vasemmistoliitto", "AKEL", "PTB", "KPÖ",
    "SP", "Enhedslisten", "Rødt", "PST/POP", "PIE", "The Left", "Razem", "EFA",
    "S&D", "Progressive Alliance of Socialists & Democrats", "Renew Europe",
    "PSOE", "Sumar", "PES", "PS", "APSD", "SD", "SAP",
    "Labour", "SPÖ", "Vooruit", "SPD", "NL", "PvdA", "Socialist Party",
    "Democratic Party", "PASOK", "SLD", "Nouvelle Donne", "PRG",
    "Inicjatywa Polska", "Grüne", "Greens"
]

# Party / parliamentary-group names classified as right-leaning.
right = [
    "EPP", "ECR", "PiS", "VOX", "ID", "RN", "Lega", "FPÖ", "Fidesz", "Patriots",
    "ESN", "AfD", "Republika", "Reconquête", "NOWA NADZIEJA", "Mi Hazánk",
    "PP", "Partido Popular", "CDU", "Agir", "MoDem", "Ensemble", "LFA", "RE",
    "LR", "CDA", "NSC", "IDP", "CSU", "FDP", "FW", "Junts", "ZP", "NPD", "PVV",
    "FvD", "European People's Party"
]

In [None]:
# Every party mention in the dataset, flattened from the per-row lists
# (a comprehension is linear, unlike sum(lists, [])).
all_parties = [party for parties in df['parties_mentioned'] for party in parties]
party_counts = Counter(all_parties)

# Get the 20 most mentioned parties
top_20_parties = party_counts.most_common(20)
# Built once so the loops below do O(1) membership tests.
top_20_names = {party for party, _ in top_20_parties}
left_parties = set(left)
right_parties = set(right)

# party -> (country with the most mentions of it, mentions in that country)
party_country_dict = {}

# sort=False keeps the countries in order of first appearance, so on a tie the
# first country encountered keeps the title (strict `>` below).
for country, country_df in df.groupby('countries', sort=False):
    country_party_counts = Counter(
        party
        for parties in country_df['parties_mentioned']
        for party in parties
    )

    # Only the top-20 parties that are actually mentioned in this country.
    for party in top_20_names & country_party_counts.keys():
        mentions = country_party_counts[party]
        if party not in party_country_dict or mentions > party_country_dict[party][1]:
            party_country_dict[party] = (country, mentions)

# party -> 'left' / 'right' / 'Unknown', decided once per top-20 party
party_wing_dict = {}
for party in top_20_names:
    if party in left_parties:
        party_wing_dict[party] = 'left'
    elif party in right_parties:
        party_wing_dict[party] = 'right'
    else:
        party_wing_dict[party] = 'Unknown'  # If the party is not in either list

# Print the top 20 parties, their mentions, political wing, and the country that named them the most
print("Top 20 Most Mentioned Political Parties:")

for party, count in top_20_parties:
    # A party can be missing here if it only appears in rows without a country.
    most_mentioned_country, mentions_in_country = party_country_dict.get(party, ('Unknown', 0))
    political_wing = party_wing_dict.get(party, 'Unknown')  # Default to 'Unknown' if no wing found
    print(f"{party} ({political_wing}): {count} mentions, most mentioned in {most_mentioned_country} ({mentions_in_country} mentions)")