# Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from scipy.stats import pearsonr
import ast
from matplotlib.patches import Patch
from sklearn.preprocessing import MinMaxScaler

# Load Dataset

In [None]:
# Load the dataset into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv("../dataset/Final_TikTok.csv")

In [None]:
# Print the columns, dtypes and non-null counts of the loaded frame.
df.info()

# Sentiment Score By Country

## General Study

- **Mean General Sentiment Score Compound** 

In [None]:
# Bootstrap the mean of 'sentiment_score_compound' over all entries and plot
# the distribution of the resampled means with a 95% percentile interval.

# Hyperparameters
n_iterations = 1000
# Seeded generator so the interval is identical on every Restart & Run All.
boot_rng = np.random.RandomState(42)

# Resample only the column of interest instead of the whole frame, and drop
# missing scores first so each resample has as many values as there are
# observed scores.
compound_scores = df['sentiment_score_compound'].dropna()
sample_size = len(compound_scores)
boot_means = []

# Bootstrapping loop: resample with replacement, record the mean.
for i in range(n_iterations):
    sample = compound_scores.sample(n=sample_size, replace=True, random_state=boot_rng)
    mean = sample.mean()
    boot_means.append(mean)

# Percentile confidence interval of the bootstrapped means.
ci_lower = np.percentile(boot_means, 2.5)
ci_upper = np.percentile(boot_means, 97.5)

# 95% of the resampled means fall between these two points.
print(f"Bootstrapped 95% CI for 'sentiment_score_compound': [{ci_lower:.3f}, {ci_upper:.3f}]")
sns.histplot(boot_means, bins=50, kde=True, color='skyblue')
plt.axvline(ci_lower, color='red', linestyle='--', label=f'2.5% ({ci_lower:.3f})')
plt.axvline(ci_upper, color='red', linestyle='--', label=f'97.5% ({ci_upper:.3f})')
plt.axvline(np.mean(boot_means), color='green', linestyle='-', label=f'Mean ({np.mean(boot_means):.3f})')
plt.title("Bootstrapped means of compound sentiment score")
plt.xlabel("Mean compound sentiment score")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.show()


- **Count the number of entries per country**

In [None]:
# Number of rows for each distinct value of the 'countries' column.
country_counts = df['countries'].value_counts()
print(country_counts)

- **Bootstrapped mean for compound sentiment score by country**

In [None]:
# Reduce each 'countries' value to the first single-quoted word it contains
# (presumably the raw values look like "{'es'}" — TODO confirm against the CSV).
# Values that contain no quoted word are kept unchanged instead of becoming
# NaN, so re-running this cell on an already-cleaned column is a no-op.
_extracted_code = df['countries'].astype(str).str.extract(r"'(\w+)'")[0]
df['countries'] = _extracted_code.where(_extracted_code.notna(), df['countries'])

In [None]:
# Bar/marker colour used for each country, keyed by display name.
color_map = dict(
    Spain="#FD6A6A",
    Netherlands="#854CB3",
    France="#4769FF",
    Poland="#319B3F",
    Germany="#E9D045",
)

# Two-letter country code -> display name.
country_map = dict(zip(
    ('es', 'nl', 'fr', 'pl', 'de'),
    ('Spain', 'Netherlands', 'France', 'Poland', 'Germany'),
))

In [None]:
# Left-to-right order of the countries in the per-country bar chart.
ordered_countries = "Netherlands Poland France Germany Spain".split()

In [None]:
# Bootstrap the mean compound sentiment score separately for every country and
# draw one bar per country with its 95% percentile interval as error bars.
n_iterations = 1000
# Seeded generator so means and intervals are identical on every run.
boot_rng = np.random.RandomState(42)
boot_data = []

for country_code, group in df.groupby('countries'):
    scores = group['sentiment_score_compound'].dropna()
    # Resample this country's scores with replacement and keep each mean.
    means = [
        scores.sample(n=len(scores), replace=True, random_state=boot_rng).mean()
        for _ in range(n_iterations)
    ]

    boot_data.append({
        'country_code': country_code,
        # Fall back to the raw code when it has no display name.
        'country': country_map.get(country_code, country_code),
        'mean': np.mean(means),
        'ci_lower': np.percentile(means, 2.5),
        'ci_upper': np.percentile(means, 97.5)
    })

# Explicit columns keep the frame well-formed even when no group was found.
boot_df = pd.DataFrame(
    boot_data,
    columns=['country_code', 'country', 'mean', 'ci_lower', 'ci_upper']
)

# Plot in the predefined order, skipping countries that are not in the data so
# a missing one does not raise KeyError.
available_countries = set(boot_df['country'])
plot_order = [name for name in ordered_countries if name in available_countries]
boot_df = boot_df.set_index('country').loc[plot_order].reset_index()
bar_colors = [color_map[country] for country in boot_df['country']]

plt.figure(figsize=(12, 6))
plt.bar(
    boot_df['country'],
    boot_df['mean'],
    # Asymmetric error bars: distance from the mean down to / up to the CI bounds.
    yerr=[boot_df['mean'] - boot_df['ci_lower'], boot_df['ci_upper'] - boot_df['mean']],
    capsize=5,
    color=bar_colors,
    edgecolor='black'
)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Bootstrapped mean compound sentiment score distribution')
plt.title('Mean compound sentiment score country-wise with 95% CI')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


- **Compound sentiment score distribution by country**

In [None]:
# Box plot of the compound sentiment score, one box per value of 'countries'.
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, x='countries', y='sentiment_score_compound')
plt.xticks(rotation=45)
plt.title('Distribution of sentiment_score_compound by country')
plt.show()
# Median compound score per country, printed from highest to lowest.
medians = df.groupby('countries')['sentiment_score_compound'].median().sort_values(ascending=False)
print(medians)

- **Amount of Pos and Neg entries given the Binary Sentiment Score classification**

In [None]:
# Grouped count plot: entries per country, split by 'sentiment_score_binary'.
plt.figure(figsize=(14, 8))
sns.countplot(data=df, x='countries', hue='sentiment_score_binary')
plt.xticks(rotation=45)
plt.title('Distribution of Binary Sentiment Score per Country')
plt.show()

- **Ratio between Pos and Neg entries per country**

In [None]:
# Cross-tabulate entries: one row per country, one column per binary label.
counts = (
    df.groupby(['countries', 'sentiment_score_binary'])
    .size()
    .unstack(fill_value=0)
)
# Label-1 count over label-(-1) count per country. A missing 1 column counts
# as 0 and a missing -1 column falls back to a divisor of 1.
counts = counts.assign(pos_neg_ratio=counts.get(1, 0) / counts.get(-1, 1))
print(counts.loc[:, ['pos_neg_ratio']])


- **Histogram of the sentiment score compound to see the entries intensity**

In [None]:
# Normalized histogram of the compound sentiment score, one panel per country.
df['country'] = df['countries'].map(country_map)

bins = 20
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 12))
axes = axes.flatten()

# Codes missing from country_map become NaN in df['country']; they have no
# entry in color_map, so they are left out of the panels.
plotted_countries = list(df['country'].dropna().unique())

# Plot each country
for i, country in enumerate(plotted_countries):
    ax = axes[i]
    data = df[df['country'] == country]['sentiment_score_compound'].dropna()

    # A separate name keeps the DataFrame `counts` built by other cells intact.
    hist_counts, bin_edges = np.histogram(data, bins=bins)
    hist_total = hist_counts.sum()
    # Turn counts into proportions; all-zero bars when there is no data
    # rather than a 0/0 division.
    proportions = hist_counts / hist_total if hist_total > 0 else np.zeros(len(hist_counts))
    bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])
    ax.bar(bin_centers, proportions, width=(bin_edges[1] - bin_edges[0]),
           color=color_map[country], edgecolor='black')

    ax.set_title(country)
    ax.set_ylim(0, 0.4)
    ax.set_ylabel("Proportion")
    ax.set_xlabel("Sentiment Compound Score")
    ax.grid(True, linestyle='--', alpha=0.5)

# Remove any unused subplots (computed from the number of panels drawn, not
# from the leaked loop index, so it also works when nothing was plotted).
for unused_ax in axes[len(plotted_countries):]:
    fig.delaxes(unused_ax)

fig.suptitle("Normalized histogram of compound sentiment score per country", fontsize=16, y=1.02)
plt.tight_layout()
plt.show()


In [None]:
# Share of each country's entries whose compound score lies beyond +/-0.75.
total_counts = df['countries'].value_counts()
extreme_df = df[df['sentiment_score_compound'].abs() > 0.75]
extreme_counts = extreme_df['countries'].value_counts()
# Countries without any extreme entry get a ratio of 0; largest share first.
extreme_ratios = (
    extreme_counts.div(total_counts)
    .fillna(0)
    .sort_values(ascending=False)
)
print(extreme_ratios)

In [None]:
# For every (country, topic) pair compute the share of entries with pos > 0
# and the share with an extreme compound score (beyond +/-0.75), then plot one
# against the other together with their Pearson correlation.
df_filtered = df.dropna(subset=['sentiment_score_compound'])
data = []

for topic in ['war', 'abortion']:
    topic_df = df_filtered[df_filtered['topic'] == topic]
    total_counts = topic_df['countries'].value_counts()
    positive_counts = topic_df[topic_df['pos'] > 0]['countries'].value_counts()
    extreme_counts = topic_df[
        (topic_df['sentiment_score_compound'] > 0.75) |
        (topic_df['sentiment_score_compound'] < -0.75)
    ]['countries'].value_counts()

    # Countries with no positive / extreme entries get a proportion of 0.
    proportion_df = pd.DataFrame({
        'positive_language': positive_counts / total_counts,
        'extreme_sentiment': extreme_counts / total_counts
    }).fillna(0)

    # Mapping through the dict yields NaN for codes without a display name, so
    # the dropna below actually removes them; otherwise they would reach
    # color_map[...] further down and raise KeyError.
    proportion_df['country'] = proportion_df.index.map(country_map)
    proportion_df = proportion_df.dropna(subset=['country'])
    proportion_df['topic'] = topic

    data.append(proportion_df)

combined_df = pd.concat(data)
corr_value, p_value = pearsonr(
    combined_df['positive_language'],
    combined_df['extreme_sentiment']
)

plt.figure(figsize=(10, 7))

# One point per (country, topic): colour encodes the country, marker the topic.
for _, row in combined_df.iterrows():
    label = f"{row['country']} - {row['topic']}"
    marker = 'o' if row['topic'] == 'war' else 's'
    plt.scatter(
        row['positive_language'],
        row['extreme_sentiment'],
        color=color_map[row['country']],
        label=label,
        s=100,
        edgecolor='k',
        marker=marker
    )

# Regression line only; the points were already drawn above.
sns.regplot(
    data=combined_df,
    x='positive_language',
    y='extreme_sentiment',
    scatter=False,
    color='black',
    line_kws={'linestyle': '--'}
)

country_handles = [Patch(color=color_map[c], label=c) for c in combined_df['country'].unique()]
topic_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label='war', markerfacecolor='grey', markersize=10, markeredgecolor='k'),
    plt.Line2D([0], [0], marker='s', color='w', label='abortion', markerfacecolor='grey', markersize=10, markeredgecolor='k'),
]

# The second plt.legend call replaces the first legend on the axes, so the
# first one is added back as a separate artist.
legend1 = plt.legend(handles=country_handles, title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')
legend2 = plt.legend(handles=topic_handles, title='Topic', bbox_to_anchor=(1.05, 0.7), loc='upper left')
plt.gca().add_artist(legend1)

plt.text(
    0.05, 0.95,
    f'Pearson r = {corr_value:.2f}\nP-value = {p_value:.3g}',
    transform=plt.gca().transAxes,
    verticalalignment='top',
    bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8)
)

plt.xlabel('Proportion of positive-language entries')
plt.ylabel('Proportion of extreme sentiment entries')
plt.title('Proportion of positive-language vs. extreme sentiment per country and topic')
# NOTE(review): fixed axis windows; points outside these ranges are not shown.
plt.xlim(0.8, 1)
plt.ylim(0.4, 0.8)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

- **Histogram of subjectivity score by country**

In [None]:
# One subjectivity-score histogram per country on a 2x3 grid, with bar heights
# rescaled so that the tallest bar of every panel equals 1.
fig, axes = plt.subplots(2, 3, figsize=(15, 8), constrained_layout=True)
axes = axes.flatten()

# Panel order, as country codes.
countries_order = ['nl', 'pl', 'fr', 'de', 'es']

for i, country_code in enumerate(countries_order):
    country_name = country_map.get(country_code, country_code)
    # NOTE(review): missing values are not dropped here — confirm that
    # 'subjectivity_score' has no NaN, since no explicit range is passed to hist.
    country_data = df[df['countries'] == country_code]['subjectivity_score']

    # Draw the raw-count histogram first; the bars are rescaled just below.
    counts, bins, patches = axes[i].hist(
        country_data,
        bins=20,
        color=color_map[country_name],
        edgecolor='black',
        alpha=0.7
    )
    
    # Divide by the largest bin count and overwrite each bar's height in place.
    counts_normalized = counts / counts.max()
    for count, patch in zip(counts_normalized, patches):
        patch.set_height(count)

    mean_val = country_data.mean()
    axes[i].set_title(f"{country_name}")
    axes[i].set_xlabel('Subjectivity Score')
    axes[i].set_ylabel('Normalized Frequency')
    axes[i].set_xlim(0, 1)  # fix x-axis range
    # The y-limits must match the rescaled bar heights (0..1), not the raw counts.
    axes[i].set_ylim(0, 1.05)
    # Annotate the panel with the country's mean score, in axes coordinates.
    axes[i].text(
        0.95, 0.85,
        f"Avg: {mean_val:.3f}",
        horizontalalignment='right',
        verticalalignment='center',
        transform=axes[i].transAxes,
        fontsize=12,
        bbox=dict(facecolor='white', alpha=0.7, edgecolor='none')
    )

# Remove the grid cells that did not receive a country.
if len(countries_order) < len(axes):
    for j in range(len(countries_order), len(axes)):
        fig.delaxes(axes[j])

plt.suptitle('Subjectivity Score Distribution per Country (Max Frequency Normalized)', fontsize=16)
plt.show()


- **Correlating the pos/neg ratio with the share of extreme compound sentiment scores**

In [None]:
# Per country: ratio of positive to negative binary labels, share of extreme
# compound scores (beyond +/-0.75), and the Pearson correlation between the two.
binary_label = df['sentiment_score_binary']
positive_entries = df[binary_label > 0]
negative_entries = df[binary_label < 0]

pos_counts = positive_entries.groupby('countries').size()
neg_counts = negative_entries.groupby('countries').size()
# A zero negative count is turned into NaN so the division cannot yield inf.
pos_neg_ratio = (pos_counts / neg_counts.replace(0, np.nan)).rename('pos_neg_ratio')

extreme_entries = df[df['sentiment_score_compound'].abs() > 0.75]
extreme_counts = extreme_entries.groupby('countries').size()
total_counts = df.groupby('countries').size()
extreme_ratios = (extreme_counts / total_counts).rename('extreme_ratio')

# Keep only the countries for which both measures are defined.
aligned = pd.concat([extreme_ratios, pos_neg_ratio], axis=1).dropna()
print("Positive to Negative Ratios per country:", pos_neg_ratio, sep="\n")
print("\nExtreme Ratios per country:", extreme_ratios, sep="\n")
r, p_value = pearsonr(aligned['pos_neg_ratio'], aligned['extreme_ratio'])
print(f"\nPearson correlation coefficient: {r:.3f}, p-value: {p_value:.3f}")

In [None]:
# Compare entries labelled 'left' and 'right' in the 'ideology' column:
# normalized subjectivity histograms side by side, each annotated with the mean
# subjectivity and the Pearson correlation between subjectivity and compound
# sentiment within that group.
df_filtered = df[df['ideology'].isin(['left', 'right'])].copy()

left_subj = df_filtered[df_filtered['ideology'] == 'left']['subjectivity_score']
right_subj = df_filtered[df_filtered['ideology'] == 'right']['subjectivity_score']

# NOTE(review): left_sentiment / right_sentiment are not used again in this cell.
left_sentiment = df_filtered[df_filtered['ideology'] == 'left']['sentiment_score_compound'].dropna()
right_sentiment = df_filtered[df_filtered['ideology'] == 'right']['sentiment_score_compound'].dropna()

# Rows with both values present, so pearsonr receives equally long inputs.
left_corr_df = df_filtered[df_filtered['ideology'] == 'left'][['subjectivity_score', 'sentiment_score_compound']].dropna()
right_corr_df = df_filtered[df_filtered['ideology'] == 'right'][['subjectivity_score', 'sentiment_score_compound']].dropna()

r_left, p_left = pearsonr(left_corr_df['subjectivity_score'], left_corr_df['sentiment_score_compound'])
r_right, p_right = pearsonr(right_corr_df['subjectivity_score'], right_corr_df['sentiment_score_compound'])

# Same 20 bins over [0, 1] for both groups so the panels are comparable.
bins = 20
counts_left, bins_left = np.histogram(left_subj, bins=bins, range=(0,1))
counts_right, bins_right = np.histogram(right_subj, bins=bins, range=(0,1))

# Convert counts to proportions of each group's binned entries.
counts_left_norm = counts_left / counts_left.sum()
counts_right_norm = counts_right / counts_right.sum()

mean_left = left_subj.mean()
mean_right = right_subj.mean()

bin_centers_left = (bins_left[:-1] + bins_left[1:]) / 2
bin_centers_right = (bins_right[:-1] + bins_right[1:]) / 2

fig, axes = plt.subplots(1, 2, figsize=(14, 5), constrained_layout=True)

# Bars are 0.045 wide, slightly narrower than the 0.05 bin width.
axes[0].bar(bin_centers_left, counts_left_norm, width=0.045, color='red', edgecolor='black', alpha=0.7)
axes[0].set_title('Higher left mentions subjectivity score')
axes[0].set_xlabel('Subjectivity score')
axes[0].set_ylabel('Normalized frequency')
axes[0].set_xlim(0, 1)
axes[0].text(0.6, max(counts_left_norm)*0.9, f'Avg subjectivity: {mean_left:.3f}', fontsize=10)
# The legend entry is used to display the correlation text.
axes[0].legend([f'Corr with sentiment: r={r_left:.3f}, p={p_left:.3f}'])

axes[1].bar(bin_centers_right, counts_right_norm, width=0.045, color='blue', edgecolor='black', alpha=0.7)
axes[1].set_title('Right Mentions Subjectivity Score')
axes[1].set_xlabel('Subjectivity Score')
axes[1].set_ylabel('Normalized Frequency')
axes[1].set_xlim(0, 1)
axes[1].text(0.6, max(counts_right_norm)*0.9, f'Avg Subjectivity: {mean_right:.3f}', fontsize=10)
axes[1].legend([f'Corr with Sentiment: r={r_right:.3f}, p={p_right:.3f}'])

plt.show()

## Study by topic

In [None]:
# Entries per topic, counted case-insensitively (topics are lower-cased first).
topic_counts = df['topic'].str.lower().value_counts()
print("Number of entries per topic:")
print(topic_counts)

- **Mean sentiment score compound by topic**

In [None]:
# Bootstrap the mean compound sentiment score for each topic of interest and
# compare the two distributions of resampled means in a violin plot.
n_iterations = 1000
boot_results = []
topics_of_interest = ['war', 'abortion']
topic_boot_means = {}
# Seeded generator so the bootstrapped distributions are reproducible.
boot_rng = np.random.RandomState(42)
# Lower-case the topic column once instead of once per topic.
topic_lower = df['topic'].str.lower()

for topic in topics_of_interest:
    group = df[topic_lower == topic]
    scores = group['sentiment_score_compound'].dropna()

    boot_means = []
    for _ in range(n_iterations):
        # Bootstrap sampling with replacement
        sample = scores.sample(n=len(scores), replace=True, random_state=boot_rng)
        boot_means.append(sample.mean())

    topic_boot_means[topic] = boot_means

# One column of n_iterations bootstrapped means per topic.
boot_df = pd.DataFrame(topic_boot_means)
print (boot_df.describe())
# Plot
plt.figure(figsize=(12, 6))
sns.violinplot(data=boot_df, palette='Set2', inner='quartile')
plt.title("Bootstrapped Mean Sentiment Score for War and Abortion Topics")
plt.xlabel("Topic")
plt.ylabel("Bootstrapped Mean Sentiment Score")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.grid(True)
plt.show()

- **Histogram of the sentiment score compound per topic**

In [None]:
# Plot colour for each topic.
topic_colors = dict(abortion='#fc8d62', war='#66c2a5')

In [None]:
# Overlaid, normalized histograms of the compound sentiment score: one panel
# per country, one semi-transparent bar series per topic.

# Country codes and topics to plot. They are defined in this cell so that it
# runs on a fresh kernel instead of relying on names left over from an earlier
# session.
countries = ['nl', 'pl', 'fr', 'de', 'es']
topics = ['war', 'abortion']

n_cols = 3
n_rows = int(np.ceil(len(countries) / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(6.5 * n_cols, 5.5 * n_rows), sharex=False, sharey=False)
axes = axes.flatten()

# Shared bin edges over [-1, 1] so the two topics overlay bin by bin.
bins = 20
bin_edges = np.linspace(-1, 1, bins + 1)
bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])

# Case-insensitive topic match, as in the per-topic cells of this notebook.
topic_lower = df['topic'].str.lower()

for idx, country in enumerate(countries):
    ax = axes[idx]
    for topic in topics:
        subset = df[(df['countries'] == country) & (topic_lower == topic)]['sentiment_score_compound'].dropna()
        counts, _ = np.histogram(subset, bins=bin_edges)
        # Proportions per bin; all zeros when the country has no entry for the topic.
        proportions = counts / counts.sum() if counts.sum() > 0 else np.zeros_like(counts)
        ax.bar(
            bin_centers,
            proportions,
            width=(bin_edges[1] - bin_edges[0]),
            color=topic_colors.get(topic, 'gray'),
            edgecolor='black',
            alpha=0.6,
            label=topic.capitalize()
        )

    ax.set_title(f"{country_map.get(country, country)}", fontsize=14)
    ax.set_xlabel('Sentiment score', fontsize=12)
    ax.set_ylabel('Proportion', fontsize=12)
    ax.set_xticks(np.round(np.linspace(-1, 1, 5), 2))
    ax.set_yticks(np.linspace(0, 0.4, 5))
    ax.tick_params(axis='x', rotation=45, labelsize=11)
    ax.tick_params(axis='y', labelsize=11)

    handles, labels = ax.get_legend_handles_labels()
    if len(handles) > 1:
        # Extra legend swatch for the region where the translucent bars overlap.
        # facecolor (not color) is used so the black edge is not overridden.
        handles.append(plt.Rectangle((0, 0), 1, 1, facecolor='brown', edgecolor='black', alpha=0.6))
        labels.append('Overlap')
    ax.legend(handles, labels, fontsize=11)

# Remove unused axes
for ax in axes[len(countries):]:
    fig.delaxes(ax)

plt.suptitle("Compound sentiment score distribution by country and topic", fontsize=20, y=1.03)
plt.tight_layout()
plt.show()

In [None]:
# Count positive and negative per country-topic
ratio_df = df.groupby(['countries', 'topic', 'sentiment_score_binary']).size().unstack(fill_value=0)
ratio_df['pos_neg_ratio'] = ratio_df.get(1, 0) / ratio_df.get(-1, 1)

print(ratio_df[['pos_neg_ratio']])


- **Do the "polarized" countries have a greater distinction between war and abortion?**

In [None]:
# Pos/neg ratio for every (country, topic) pair.
counts = (
    df.groupby(['countries', 'topic', 'sentiment_score_binary'])
    .size()
    .unstack(fill_value=0)
)
counts['pos_neg_ratio'] = counts.get(1, 0) / counts.get(-1, 1)
ratio_df = counts[['pos_neg_ratio']]

# Topics become columns; the gap is the absolute abortion-minus-war difference.
pivot = ratio_df['pos_neg_ratio'].unstack()
pivot['ratio_diff'] = pivot['abortion'].sub(pivot['war']).abs()

# Share of entries per country whose compound score lies beyond +/-0.75.
total_counts = df['countries'].value_counts()
extreme_df = df[df['sentiment_score_compound'].abs() > 0.75]
extreme_counts = extreme_df['countries'].value_counts()
extreme_ratios = (extreme_counts / total_counts).fillna(0)
# Any index label that is a set is replaced by its first listed element.
extreme_ratios.index = [
    list(lbl)[0] if isinstance(lbl, set) else lbl
    for lbl in extreme_ratios.index
]

# Align both measures on the country and keep the complete rows.
correlation_df = pd.DataFrame({
    'extreme_ratio': extreme_ratios,
    'ratio_diff': pivot['ratio_diff']
}).dropna()

r, p_value = pearsonr(correlation_df['ratio_diff'], correlation_df['extreme_ratio'])
print(f"Pearson correlation coefficient: {r:.3f}")
print(f"P-value: {p_value:.3e}")


In [None]:
# Scatter plus fitted regression line (no confidence band, ci=None) of the
# extreme-sentiment ratio against the abortion-vs-war ratio gap in correlation_df.
plt.figure(figsize=(8, 6))
sns.regplot(data=correlation_df, x='ratio_diff', y='extreme_ratio', ci=None, scatter_kws={'s': 70})
plt.title('Correlation between Topic Sentiment Gap and Sentiment Polarization')
plt.xlabel('Abs Difference in Pos/Neg Ratio (Abortion vs. War)')
plt.ylabel('Extreme Sentiment Ratio')
plt.grid(True)
plt.tight_layout()
plt.show()

- **% of strongly positive (compound ≥ 0.75) and strongly negative (compound ≤ -0.75) entries per country and topic**

In [None]:
# Percentage of strongly positive (>= 0.75) and strongly negative (<= -0.75)
# entries for every (country, topic) pair.
country_topic_keys = ['countries', 'topic']
compound_score = df['sentiment_score_compound']

total_counts = df.groupby(country_topic_keys).size().reset_index(name='total_count')
strong_pos = df[compound_score >= 0.75]
pos_counts = strong_pos.groupby(country_topic_keys).size().reset_index(name='pos_count')
strong_neg = df[compound_score <= -0.75]
neg_counts = strong_neg.groupby(country_topic_keys).size().reset_index(name='neg_count')

# Left-join both counts onto the totals; pairs without such entries count as 0.
merged = (
    total_counts
    .merge(pos_counts, on=country_topic_keys, how='left')
    .merge(neg_counts, on=country_topic_keys, how='left')
)
for _count_col in ('pos_count', 'neg_count'):
    merged[_count_col] = merged[_count_col].fillna(0)

# Percentages relative to all entries of the pair, highest positive share first.
merged['pos_percent'] = 100 * merged['pos_count'] / merged['total_count']
merged['neg_percent'] = 100 * merged['neg_count'] / merged['total_count']
merged = merged.sort_values(by='pos_percent', ascending=False)
print(merged[['countries', 'topic', 'pos_percent', 'neg_percent']])


- **Are the extreme sentiments related to the percent of positiveness?**

In [None]:
# Per country: share of extreme entries (compound beyond +/-0.75) next to the
# percentage of strongly positive entries (compound >= 0.75).
total_country_counts = df['countries'].value_counts()
extreme_df = df[(df['sentiment_score_compound'] > 0.75) | (df['sentiment_score_compound'] < -0.75)]
extreme_country_counts = extreme_df['countries'].value_counts()
extreme_ratios = (extreme_country_counts / total_country_counts).fillna(0).reset_index()
extreme_ratios.columns = ['countries', 'extreme_ratio']

# Strongly-positive percentage computed directly at country level, so that
# relation_df has exactly one row per country. Taking it from a
# per-(country, topic) table would repeat every country's extreme_ratio once
# per topic and inflate the sample behind the per-country correlation.
strong_pos_mask = df['sentiment_score_compound'] >= 0.75
country_pos_percent = (
    strong_pos_mask.groupby(df['countries']).mean()
    .mul(100)
    .rename('pos_percent')
    .rename_axis('countries')
    .reset_index()
)

# Merge the two metrics
relation_df = country_pos_percent.merge(extreme_ratios, on='countries', how='inner')

relation_df = relation_df.sort_values(by='extreme_ratio', ascending=False)
print(relation_df)


In [None]:
# Plotting
plt.figure(figsize=(10, 6))
sns.regplot(data=relation_df, x='pos_percent', y='extreme_ratio', scatter_kws={'s': 60}, line_kws={'color': 'red'})
plt.title('Correlation between Positive Sentiment Percentage and Extreme Opinion Ratio')
plt.xlabel('Average Positive Sentiment Percentage (per country)')
plt.ylabel('Extreme Sentiment Ratio (per country)')
plt.grid(True)
plt.tight_layout()
plt.show()

# Compute Pearson correlation
corr_coef, p_value = pearsonr(relation_df['pos_percent'], relation_df['extreme_ratio'])
print(f"Pearson correlation coefficient: {corr_coef:.3f} (p-value: {p_value:.4f})")

- **Are extreme sentiments related to subjectivity?**

In [None]:
# Per (country, topic): percentage of non-extreme entries (compound strictly
# between -0.75 and 0.75) correlated with the mean subjectivity score.
total_counts = df.groupby(['countries', 'topic']).size().reset_index(name='total_count')

neutral_sentiment = df[df['sentiment_score_compound'].abs() < 0.75]
neutral_counts = (
    neutral_sentiment.groupby(['countries', 'topic'])
    .size()
    .reset_index(name='neutral_count')
)

# Pairs without any non-extreme entry count as 0.
merged_neutral = total_counts.merge(neutral_counts, on=['countries', 'topic'], how='left')
merged_neutral['neutral_count'] = merged_neutral['neutral_count'].fillna(0)
merged_neutral['neutral_percent'] = 100 * merged_neutral['neutral_count'] / merged_neutral['total_count']

# Mean subjectivity of every (country, topic) pair.
subjectivity_means = (
    df.groupby(['countries', 'topic'])['subjectivity_score']
    .mean()
    .reset_index()
    .rename(columns={'subjectivity_score': 'mean_subjectivity'})
)

merged_neutral = merged_neutral.merge(subjectivity_means, on=['countries', 'topic'], how='left')
merged_neutral_clean = merged_neutral.dropna(subset=['mean_subjectivity', 'neutral_percent'])
corr_neutral, p_neutral = pearsonr(
    merged_neutral_clean['mean_subjectivity'],
    merged_neutral_clean['neutral_percent'],
)

# Report the correlation.
print(f"Correlation between mean subjectivity and % non-extreme sentiment: {corr_neutral:.3f} (p = {p_neutral:.3f})")


- **Relating the topics with the intensity of each sentiment**

In [None]:
# Define the list of sentiment columns
sentiment = [
    'pain', 'movement', 'negative_emotion', 'religion',
    'violence', 'government', 'independence', 'fear', 'trust', 'leader',
    'pro_stance', 'moral_dilemma', 'misinformation', 'human_rights',
    'abortion_rights', 'war_justification', 'womens_rights', 'disagreement'
]

abortion_df = df[df['topic'].str.lower().str.contains('abortion')]
war_df = df[df['topic'].str.lower().str.contains('war')]
abortion_means = abortion_df[sentiment].mean()
war_means = war_df[sentiment].mean()
average_intensity_df = pd.DataFrame({
    'Abortion': abortion_means,
    'War': war_means
})

# Plot
average_intensity_df.plot(kind='bar', figsize=(15, 6), colormap='Set2')
plt.title("Average Sentiment Intensity: Abortion vs. War Videos")
plt.ylabel("Average Intensity (0 to 1)")
plt.xticks(rotation=45, ha='right')
plt.legend(title='Topic')
plt.tight_layout()
plt.show()

In [None]:
# Columns compared between the two topics.
sentiment = [
    'pain', 'movement', 'negative_emotion', 'religion', 'violence', 'government',
    'independence', 'fear', 'trust', 'leader', 'pro_stance', 'moral_dilemma',
    'misinformation', 'human_rights', 'abortion_rights', 'war_justification',
    'womens_rights', 'disagreement',
]

# Rows whose lower-cased topic contains 'abortion' / 'war'.
topic_lower = df['topic'].str.lower()
abortion_df = df.loc[topic_lower.str.contains('abortion')]
war_df = df.loc[topic_lower.str.contains('war')]
n_abortion = len(abortion_df)
n_war = len(war_df)
# Fraction of each topic's rows in which the column is greater than zero.
abortion_counts = abortion_df[sentiment].gt(0).sum() / n_abortion
war_counts = war_df[sentiment].gt(0).sum() / n_war
sentiment_normalized_df = pd.DataFrame({'Abortion': abortion_counts, 'War': war_counts})

# Grouped bar chart of the two fractions per column.
sentiment_normalized_df.plot.bar(figsize=(15, 6), colormap='Set2')
plt.title("Normalized Sentiment Mentions in Abortion vs. War Topics")
plt.ylabel("Proportion of Entries with Sentiment (Value > 0)")
plt.xticks(rotation=45, ha='right')
plt.legend(title='Topic')
plt.tight_layout()
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.show()


- **Relating the topics with the amount of times each sentiment appears**

In [None]:
# Columns compared between the two topics.
sentiment = [
    'pain', 'movement', 'negative_emotion', 'religion', 'violence', 'government',
    'independence', 'fear', 'trust', 'leader', 'pro_stance', 'moral_dilemma',
    'misinformation', 'human_rights', 'abortion_rights', 'war_justification',
    'womens_rights', 'disagreement',
]

# Rows whose lower-cased topic contains 'abortion' / 'war'.
topic_lower = df['topic'].str.lower()
abortion_df = df.loc[topic_lower.str.contains('abortion')]
war_df = df.loc[topic_lower.str.contains('war')]
# Absolute number of rows per topic in which the column is greater than zero.
abortion_counts = abortion_df[sentiment].gt(0).sum()
war_counts = war_df[sentiment].gt(0).sum()
sentiment_counts_df = pd.DataFrame({'Abortion': abortion_counts, 'War': war_counts})

# Grouped bar chart of the raw counts.
sentiment_counts_df.plot.bar(figsize=(15, 6), colormap='Set3')
plt.title("Sentiment Occurrences in Abortion vs. War Topics")
plt.ylabel("Number of Mentions (Value > 0)")
plt.xticks(rotation=45, ha='right')
plt.legend(title='Topic')
plt.tight_layout()
plt.show()


- **Ratio of appearance of each sentiment**

In [None]:
# Columns compared between the two topics.
sentiment = [
    'pain', 'movement', 'negative_emotion', 'religion', 'violence', 'government',
    'independence', 'fear', 'trust', 'leader', 'pro_stance', 'moral_dilemma',
    'misinformation', 'human_rights', 'abortion_rights', 'war_justification',
    'womens_rights', 'disagreement',
]

# Rows whose lower-cased topic contains 'abortion' / 'war'.
topic_lower = df['topic'].str.lower()
abortion_df = df.loc[topic_lower.str.contains('abortion')]
war_df = df.loc[topic_lower.str.contains('war')]

abortion_total = len(abortion_df)
war_total = len(war_df)

# Fraction of each topic's rows in which the column is greater than zero.
abortion_ratios = abortion_df[sentiment].gt(0).sum() / abortion_total
war_ratios = war_df[sentiment].gt(0).sum() / war_total

sentiment_ratio_df = pd.DataFrame({'Abortion': abortion_ratios, 'War': war_ratios})

# Order the bars by the war ratio, largest first, and use the topic colours.
war_descending = war_ratios.sort_values(ascending=False).index
sentiment_ratio_df = sentiment_ratio_df.loc[war_descending]
colors = [topic_colors[name] for name in ('abortion', 'war')]
ax = sentiment_ratio_df.plot.bar(figsize=(15, 6), color=colors)
plt.title("Ratio of Sentiment Appearance: Abortion vs. War Videos")
plt.ylabel("Proportion of Mentions (0 to 1)")
plt.xticks(rotation=45, ha='right')
plt.legend(title='Topic')
plt.tight_layout()
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Columns whose appearance ratio is tabulated per country and topic.
sentiment = [
    'pain', 'movement', 'negative_emotion', 'religion', 'violence', 'government',
    'independence', 'fear', 'trust', 'leader', 'pro_stance', 'moral_dilemma',
    'misinformation', 'human_rights', 'abortion_rights', 'war_justification',
    'womens_rights', 'disagreement',
]

# One record per (country, topic) pair that has at least one entry.
records = []
for country in df['countries'].unique():
    country_df = df[df['countries'] == country]

    for topic in ['abortion', 'war']:
        topic_df = country_df[country_df['topic'].str.lower().str.contains(topic)]
        total = len(topic_df)
        if total > 0:
            # Fraction of the pair's rows in which each column is greater than zero.
            ratios = topic_df[sentiment].gt(0).sum() / total
            record = {'country': country, 'topic': topic, **ratios.to_dict()}
            records.append(record)

country_sentiment_df = pd.DataFrame(records)
print(country_sentiment_df)


- **Which sentiments appear the most per country?**

In [None]:
# Columns counted per country and topic in the faceted bar chart below.
sentiment = [
    'pain', 'negative_emotion', 'religion',
    'independence', 'fear','pro_stance', 'moral_dilemma', 'human_rights',
    'abortion_rights', 'war_justification', 'womens_rights', 'disagreement'
]

# Number of rows per (country, topic) in which each column is greater than zero.
grouped_counts = (
    df.groupby(['countries', 'topic'])[sentiment]
    .apply(lambda g: (g > 0).sum())
    .reset_index()
)

# Long format: one row per (country, topic, sentiment) with its count.
melted = grouped_counts.melt(id_vars=['countries', 'topic'], var_name='sentiment', value_name='count')

# Plot. catplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
grid = sns.catplot(
    data=melted,
    x='sentiment',
    y='count',
    hue='topic',
    col='countries',
    kind='bar',
    col_wrap=4,
    height=4,
    aspect=1.2,
    sharey=False
)

# Rotate the tick labels on every facet, not only on the last-drawn axes.
grid.set_xticklabels(rotation=90)
grid.figure.suptitle("Sentiment Occurrences by Country and Topic", fontsize=16)
# Lay out first, then reserve the top margin for the title; the reverse order
# would let tight_layout discard the margin again.
grid.tight_layout()
grid.figure.subplots_adjust(top=0.9)
plt.show()


In [None]:

# Sentiment columns common to both groups, preceded by the topic-specific one.
_shared_group = ['disagreement', 'moral_dilemma', 'religion', 'pro_stance']
abortion_topics_group = ['abortion_rights'] + _shared_group
war_topics_group = ['war_justification'] + _shared_group

# Display name for every row's country code (NaN when the code is not mapped).
melted = melted.assign(country=melted['countries'].map(country_map))

def filter_and_normalize(df, topics_filter):
    """Keep the requested sentiment categories and add per-country shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format counts with at least the columns 'sentiment', 'count'
        and 'country'.
    topics_filter : list-like
        Values of the 'sentiment' column to keep.

    Returns
    -------
    pandas.DataFrame
        The kept rows plus a 'norm_count' column equal to 'count' divided by
        the sum of 'count' over all kept rows of the same country. Rows
        without a country are dropped, rows are ordered by country (original
        order within a country) and the index is reset.
    """
    df_filtered = df[df['sentiment'].isin(topics_filter)]

    # transform() broadcasts each country's total back onto its rows. Unlike
    # groupby(...).apply(...), it never removes the grouping column, so
    # 'country' is still present in the result for the palette lookup and hue.
    country_totals = df_filtered.groupby('country')['count'].transform('sum')

    return (
        df_filtered
        .assign(norm_count=df_filtered['count'] / country_totals)
        .dropna(subset=['country'])                 # groupby ignored these rows as well
        .sort_values('country', kind='mergesort')   # stable: keeps within-country order
        .reset_index(drop=True)
    )

# NOTE(review): filter_and_normalize selects rows by sentiment category only;
# the `topic` column is never filtered, so each chart pools rows from every
# topic - confirm this is what the titles are meant to convey.
abortion_data = filter_and_normalize(melted, abortion_topics_group)
war_data = filter_and_normalize(melted, war_topics_group)

# Restrict the palette to the countries actually present in each dataset.
colors_abortion = {c: color_map[c] for c in abortion_data['country'].unique()}
colors_war = {c: color_map[c] for c in war_data['country'].unique()}

# Same grouped bar chart for both datasets; only data, palette and title differ.
chart_specs = (
    (abortion_data, colors_abortion, "Normalized Sentiment Activations for Abortion-related Topics"),
    (war_data, colors_war, "Normalized Sentiment Activations for War-related Topics"),
)
for chart_data, chart_palette, chart_title in chart_specs:
    plt.figure(figsize=(14, 7))
    sns.barplot(
        data=chart_data,
        x='sentiment',
        y='norm_count',
        hue='country',
        palette=chart_palette
    )
    plt.title(chart_title)
    plt.ylabel("Normalized Count (Proportion)")
    plt.xlabel("Sentiment")
    plt.xticks(rotation=45)
    plt.legend(title='Country')
    plt.tight_layout()
    plt.show()


# Political-Wing Analysis (Presence and topic relation)

- **Number of entries for each predominant ideology**

In [None]:
# Rows per value of the `ideology` column.
ideology_counts = df['ideology'].value_counts()

print("Ideology Counts:")
# (printed heading, value looked up in the counts); absent values print as 0.
for heading, ideology_value in (
    ("Left", 'left'),
    ("Right", 'right'),
    ("Mixed", 'mixed'),
    ("No mention", 'no mention'),
):
    print(f"{heading}: {ideology_counts.get(ideology_value, 0)}")

In [None]:
# Keep every row whose ideology is anything other than 'no mention'.
has_mention = df['ideology'].ne('no mention')
mentioned_df = df[has_mention]

mentioned_countries = mentioned_df['countries']
countries_with_mentions = mentioned_countries.unique()
mentions_count = mentioned_countries.value_counts()

print("Countries that mention at least one political party:")
print(countries_with_mentions)
print("\nNumber of mentions per country:")
print(mentions_count)


- **Mean % left and % right by country**

In [None]:
wing_columns = ['% left', '% right']

# Only rows where at least one of the two wing shares is strictly positive.
filtered = df[df[wing_columns].gt(0).any(axis=1)]
countries = filtered['countries'].unique()

# Per-country mean of both columns, multiplied by 100 for display.
mean_by_country = filtered.groupby('countries')[wing_columns].mean() * 100

print("Mean % Left and Right by Country (based on bar values):\n")
for country, left_mean, right_mean in mean_by_country.itertuples(name=None):
    print(f"{country}: Left = {left_mean:.2f}%, Right = {right_mean:.2f}%")


- **% of right and left parties mentioned by country**

In [None]:
# Per-country mean of both wing columns over all rows (no filtering, no scaling),
# with `countries` kept as a regular column.
mean_ideology = df.groupby('countries', as_index=False)[['% left', '% right']].mean()
print(mean_ideology)

- **Relate the % of the wings mentioned with the mean sentiment score isolated by ideology**

In [None]:
# Rows with all three values present.
clean_df = df[['sentiment_score_compound', '% left', '% right']].dropna()
compound_scores = clean_df['sentiment_score_compound']

# Pearson correlation of the compound score with each wing share.
corr_coef_left, p_value = pearsonr(compound_scores, clean_df['% left'])
print(f"Pearson correlation coefficient for % left: {corr_coef_left:.3f} (p-value: {p_value:.4f})")
corr_coef_right, p_value = pearsonr(compound_scores, clean_df['% right'])
print(f"Pearson correlation coefficient for % right: {corr_coef_right:.3f} (p-value: {p_value:.4f})")


fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# (x column, panel title, x label, regression line colour) for each panel.
panels = (
    ('% left', 'Sentiment vs % Left', '% Left', 'purple'),
    ('% right', 'Sentiment vs % Right', '% Right', 'green'),
)
for panel_ax, (wing_column, panel_title, x_label, line_color) in zip(axes, panels):
    sns.regplot(data=clean_df, x=wing_column, y='sentiment_score_compound', ax=panel_ax,
                scatter_kws={'s': 50}, line_kws={'color': line_color})
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.grid(True)

# Only the left panel gets the custom y label.
axes[0].set_ylabel('Sentiment Score')

plt.tight_layout()
plt.show()

In [None]:
# Signed bias: positive when the left share exceeds the right share.
df['political_bias'] = df['% left'].sub(df['% right'])
clean_df = df[['political_bias', 'sentiment_score_compound']].dropna()

r, p_value = pearsonr(clean_df['political_bias'], clean_df['sentiment_score_compound'])
print(f"Pearson correlation coefficient: {r:.3f} (p-value: {p_value:.4f})")

# Plot
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(data=clean_df, x='political_bias', y='sentiment_score_compound', color='teal', ax=ax)
ax.set_title("Sentiment Score vs Political Bias (% Left - % Right)")
ax.set_xlabel("Political Bias (% Left - % Right)")
ax.set_ylabel("Sentiment Compound Score")
ax.grid(True)
fig.tight_layout()
plt.show()


- **Relate the % of the wings mentioned with the mean sentiment score isolated by ideology and country**

In [None]:
analysis_columns = ['sentiment_score_compound', '% left', '% right']
countries = df['countries'].dropna().unique()
results = []

for country in countries:
    country_df = df.loc[df['countries'] == country, analysis_columns].dropna()

    # Countries with two or fewer complete rows are skipped.
    if len(country_df) <= 2:
        continue

    country_scores = country_df['sentiment_score_compound']
    corr_left, p_left = pearsonr(country_scores, country_df['% left'])
    corr_right, p_right = pearsonr(country_scores, country_df['% right'])

    results.append(dict(
        country=country,
        corr_left=corr_left,
        p_left=p_left,
        corr_right=corr_right,
        p_right=p_right,
    ))

correlation_df = pd.DataFrame(results)
print(correlation_df)

# Plot
# DataFrame.plot opens its own figure and returns the axes it drew on.
corr_ax = (
    correlation_df
    .set_index('country')[['corr_left', 'corr_right']]
    .plot(kind='bar', figsize=(12,6), color=['purple', 'green'])
)
corr_ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
corr_ax.set_title('Sentiment vs Political Leaning Correlations by Country')
corr_ax.set_ylabel('Pearson Correlation')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
countries = df['countries'].dropna().unique()
results = []

# Per country: mean of (% left - % right) and its Pearson correlation with the
# compound sentiment score. Countries with two or fewer complete rows are skipped.
for country in countries:
    country_df = df[df['countries'] == country][['sentiment_score_compound', '% left', '% right']].dropna()
    if len(country_df) > 2:
        diff = country_df['% left'] - country_df['% right']
        corr, p_value = pearsonr(diff, country_df['sentiment_score_compound'])
        results.append({
            # Use the mapped name when there is one, otherwise the raw value.
            'country': country_map.get(country, country),
            'diff_mean': diff.mean(),
            'correlation': corr,
            'p_value': p_value
        })

correlation_df = pd.DataFrame(results)

fig, ax1 = plt.subplots(figsize=(12, 6))
color = [color_map.get(c, 'gray') for c in correlation_df['country']]
bars = ax1.bar(correlation_df['country'], correlation_df['diff_mean'], color=color, alpha=0.6)
ax1.set_ylabel('Mean (% Left - % Right)')
ax1.set_title('Difference in % Mentions of Political Wings and Sentiment Correlation')
ax1.axhline(0, linestyle='--', color='black')

# Second y-axis sharing the same x-axis for the correlation line.
ax2 = ax1.twinx()
ax2.plot(correlation_df['country'], correlation_df['correlation'], color='black', marker='o', label='Correlation with Sentiment')
ax2.set_ylabel('Pearson Correlation with Sentiment')
# The line carries a label, so draw the legend that shows it.
ax2.legend(loc='best')

# Rotate the labels on ax1 explicitly: after twinx() the current axes is ax2,
# whose x-axis is hidden, so plt.xticks(rotation=...) would have no visible effect.
plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()


- **Sentiment Score for Abortion content grouped by political wings**

In [None]:
# Rows whose topic equals 'abortion' (case-insensitive), then only complete rows.
abortion_df = df[df['topic'].str.lower().eq('abortion')]
abortion_clean = abortion_df[['sentiment_score_compound', '% right', '% left']].dropna()
abortion_scores = abortion_clean['sentiment_score_compound']

corr_right, p_right = pearsonr(abortion_scores, abortion_clean['% right'])
corr_left, p_left = pearsonr(abortion_scores, abortion_clean['% left'])
print(f"Pearson correlation (Abortion) with % Right: {corr_right:.3f} (p = {p_right:.4f})")
print(f"Pearson correlation (Abortion) with % Left:  {corr_left:.3f} (p = {p_left:.4f})")

fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# (x column, panel title, x label, regression line colour): right wing first.
abortion_panels = (
    ('% right', 'Sentiment Score vs % Right (Abortion)', '% Right', 'red'),
    ('% left', 'Sentiment Score vs % Left (Abortion)', '% Left', 'purple'),
)
for panel_ax, (wing_column, panel_title, x_label, line_color) in zip(axes, abortion_panels):
    sns.regplot(data=abortion_clean, x=wing_column, y='sentiment_score_compound', ax=panel_ax,
                scatter_kws={'s': 50}, line_kws={'color': line_color})
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.grid(True)

# Only the first panel gets the custom y label.
axes[0].set_ylabel('Sentiment Score')

plt.tight_layout()
plt.show()


- **Sentiment Score for War content grouped by political wings**

In [None]:
# Rows whose topic equals 'war' (case-insensitive), then only complete rows.
is_war = df['topic'].str.lower().eq('war')
war_df = df[is_war]
war_clean = war_df[['sentiment_score_compound', '% right', '% left']].dropna()
war_scores = war_clean['sentiment_score_compound']

corr_right, p_right = pearsonr(war_scores, war_clean['% right'])
corr_left, p_left = pearsonr(war_scores, war_clean['% left'])
print(f"Pearson correlation (War) with % Right: {corr_right:.3f} (p = {p_right:.4f})")
print(f"Pearson correlation (War) with % Left:  {corr_left:.3f} (p = {p_left:.4f})")

fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
right_ax, left_ax = axes

# Plot for % right
sns.regplot(data=war_clean, x='% right', y='sentiment_score_compound', ax=right_ax,
            scatter_kws={'s': 50}, line_kws={'color': 'red'})
right_ax.set(title='Sentiment Score vs % Right (War)', xlabel='% Right', ylabel='Sentiment Score')
right_ax.grid(True)

# Plot for % left (keeps the default y label)
sns.regplot(data=war_clean, x='% left', y='sentiment_score_compound', ax=left_ax,
            scatter_kws={'s': 50}, line_kws={'color': 'purple'})
left_ax.set(title='Sentiment Score vs % Left (War)', xlabel='% Left')
left_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Signed bias: positive when the left share exceeds the right share.
df['political_bias'] = df['% left'] - df['% right']
# Topic -> colour of its scatter and regression line.
topics = {'abortion': 'orange', 'war': 'green'}

# Compare topics case-insensitively, as the per-topic cells above do, so that
# differently capitalised labels are not silently dropped.
topic_lower = df['topic'].str.lower()
topic_filter = df[topic_lower.isin(list(topics))]

plt.figure(figsize=(10, 6))

for topic, color in topics.items():
    subset = topic_filter.loc[
        topic_filter['topic'].str.lower() == topic,
        ['political_bias', 'sentiment_score_compound']
    ].dropna()

    # pearsonr needs at least two observations; skip instead of aborting the cell.
    if len(subset) < 2:
        print(f"Skipping '{topic}': only {len(subset)} complete row(s) available.")
        continue

    r, p = pearsonr(subset['political_bias'], subset['sentiment_score_compound'])
    sns.regplot(data=subset, x='political_bias', y='sentiment_score_compound', label=f'{topic.title()} (r={r:.2f})', color=color)

plt.title("Sentiment Score vs Political Bias by Topic")
plt.xlabel("Political Bias (% Left - % Right)")
plt.ylabel("Sentiment Compound Score")
# Reference lines at zero sentiment and zero bias.
plt.axhline(0, color='black', linewidth=0.8, linestyle='--')
plt.axvline(0, color='black', linewidth=0.8, linestyle='--')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


- **From which wing are the most mentioned parties?**

In [None]:
# Party / group names treated as left-wing. Later cells test membership with
# `name in left`, so spelling and capitalisation must match the data exactly.
left = [
    "Die Linke", "IU", "Podemos", "PCE", "PCF", "LFI", "PRC", "Syriza", 
    "Vänsterpartiet", "Vasemmistoliitto", "AKEL", "PTB", "KPÖ", "Enhedslisten", 
    "Rødt", "PST/POP", "PIE", "The Left", "Razem", "EFA", "S&D", 
    # Full name of S&D; kept next to its abbreviation so both get the same wing.
    "Progressive Alliance of Socialists & Democrats", "Renew Europe", 
    "PSOE", "Partido Socialista Obrero Español", "Sumar", "PES", "PS", 
    "Parti Socialiste", "APSD", "SD", "SAP", "Labour", "SPÖ", "Vooruit", "SPD", 
    "Sozialdemokratische Partei Deutschlands", "NL", "Nowa Lewica", "PvdA", 
    "Partij van de Arbeid", "Socialist Party", "Democratic Party", "Labour Party", 
    "PASOK", "SLD", "Nouvelle Donne", "PRG", "Inicjatywa Polska", "Grüne", 
    "Greens", "ERC", "Esquerra Republicana de Catalunya", "EGP", "The Greens", 
    "BNG", "Bloque Nacionalista Galego", "LE", "Les Écologistes", "The greens", 
    "GL", "GroenLinks", "SMR", "Bildu", "Euskal Herria Bildu", "Left Party", "PvdD", 
    "Partij voor de Dieren"
]

# Party / group names treated as right-wing (same exact-match rule as above).
# Each name appears once; the list shares no entry with `left`.
right = [
    "EPP", "European People's Party", "ECR", "PiS", "Law and Justice", "VOX", 
    "RN", "National Rally", "FPÖ", "Fidesz", "Patriots", "ESN", 
    "Europe of Sovereign Nations", "AfD", "Alternative für Deutschland", "Republika", 
    "Reconquête", "NOWA NADZIEJA", "New Hope", "Mi Hazánk", "PP", "Partido Popular", 
    "CDU", "Christlich Demokratische Union Deutschlands", "Agir", "MoDem", 
    "Mouvement Démocrate", "Ensemble", "LFA", "RE", "Renaissance", "LR", 
    "Les Républicains", "CDA", "Christen-Democratisch Appèl", "NSC", 
    "New Social Contract", "IDP", "CSU", "Christlich-Soziale Union in Bayern", 
    "FDP", "Freie Demokratische Partei", "FW", "Freie Wähler", "Junts", "ZP", "NPD", 
    "PVV", "Partij voor de Vrijheid", "FvD", 
    "PO", "Platforma Obywatelska", 
    "PSL", "Polskie Stronnictwo Ludowe", "BBB", "BoerBurgerBeweging", "Familie", 
    "ÖDP", "Ökologisch-Demokratische Partei", "UDR", "Union des Démocrates et Indépendants", 
    "PfE", "Patriots of Europe", "D66", "Democraten 66", "PL2050", "Polska 2050", 
    "RECONQUÊTE", "R!", "NN", "Nieuwe Nationale Partij"
]

In [None]:
def _parse_party_cell(cell):
    """Evaluate a string cell as a Python literal; return any other value untouched."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# Strings are parsed, everything else passes through, so re-running this cell is harmless.
df['parties_mentioned'] = df['parties_mentioned'].apply(_parse_party_cell)

In [None]:
# explode() emits NaN for empty lists and missing cells; drop those so NaN is
# never counted (or ranked) as if it were a party.
all_parties = df['parties_mentioned'].explode().dropna()
party_counts = Counter(all_parties)

top_20_parties = party_counts.most_common(20)
# Built once for O(1) membership tests inside the loops below.
top_20_names = {party for party, _ in top_20_parties}

# party -> (country with the most mentions of it, mentions in that country)
party_country_dict = {}
# party -> 'left' / 'right' / 'Unknown'
party_wing_dict = {}

for country in df['countries'].dropna().unique():
    country_df = df[df['countries'] == country]
    country_parties = country_df['parties_mentioned'].explode().dropna()
    country_party_counts = Counter(country_parties)

    for party, mentions in country_party_counts.items():
        if party not in top_20_names:
            continue
        # Keep the first country seen unless a later one has strictly more mentions.
        if party not in party_country_dict or mentions > party_country_dict[party][1]:
            party_country_dict[party] = (country, mentions)

# The wing depends only on the party name, so resolve it once per party.
for party in top_20_names:
    if party in left:
        party_wing_dict[party] = 'left'
    elif party in right:
        party_wing_dict[party] = 'right'
    else:
        party_wing_dict[party] = 'Unknown'  # If the party is not in either list

print("Top 20 Most Mentioned Political Parties:")

for party, count in top_20_parties:
    # A party mentioned only in rows without a country has no per-country entry.
    most_mentioned_country, mentions_in_country = party_country_dict.get(party, ('Unknown', 0))
    political_wing = party_wing_dict.get(party, 'Unknown')
    print(f"{party} ({political_wing}): {count} mentions, most mentioned in {most_mentioned_country} ({mentions_in_country} mentions)")

- **Relate the parties mentioned with the results of the elections from 2024**

In [None]:
# Group label -> seat count, in the order the groups are listed.
# A later cell looks these up to print "Seats" next to each mentioned party.
# NOTE(review): presumably the 2024 European Parliament seat distribution -
# confirm the source of the figures.
election_results = dict([
    ('EPP', 188),
    ('S&D', 136),
    ('PfE', 84),
    ('ECR', 78),
    ('Renew Europe', 77),
    ('EFA', 53),
    ('Greens', 53),
    ('The Left', 46),
    ('ESN', 25),
    ('NI', 33),
])

In [None]:
# Party name as it appears in `parties_mentioned` -> parliamentary group label
# (the group labels are the keys of `election_results`, except 'EFA', which is
# folded into 'Greens' here). Lookups are exact-match, so alternative spellings
# need their own entry. Every key must appear only once: in a dict literal a
# repeated key silently keeps the last value.
party_group_mapping = {
    'EPP' : 'EPP',                      # -------- European People's Party --------
    'PP': 'EPP',                        # Spain - Partido Popular
    'PSL': 'EPP',                       # Poland
    'PO': 'EPP',                        # Poland
    'BBB': 'EPP',                       # Netherlands
    'CDA': 'EPP',                       # Netherlands
    'CDU': 'EPP',                       # Germany
    'ÖDP': 'EPP',                       # Germany
    'CSU': 'EPP',                       # Germany
    'FAMILIE': 'EPP',                   # Germany
    'Familie': 'EPP',                   # Germany - spelling used in the `right` list
    'LR': 'EPP',                        # France
    'UDR' : 'EPP',                      # France
    'NSC' : 'EPP',                      # Netherlands - New Social Contract
    'S&D': 'S&D',                       # -------- European --------
    'PSOE': 'S&D',                      # Spain
    'SPD': 'S&D',                       # Germany
    'NL' : 'S&D',                       # Poland - New Left
    'PS' : 'S&D',                       # France - Parti Socialiste
    'PvdA': 'S&D',                      # Netherlands - Labour Party
    'PES' : 'S&D',                      # -------- Party of European Socialists --------
    'PfE' : 'PfE',                      # -------- European Patriots of Europe --------
    'VOX' : 'PfE',                      # Spain - Vox
    'RN' : 'PfE',                       # France - National Rally || Poland - National Movement
    'PVV' : 'PfE',                      # Netherlands - Party for Freedom (was also listed under EPP; this value was the effective one)
    'ECR' : 'ECR',                      # -------- European Conservatives and Reformists --------
    'SALF' : 'ECR',                     # Spain - Se acabo la fiesta
    'IDL' : 'ECR',                      # France - Identity and Liberty
    'PiS' : 'ECR',                      # Poland - Law and Justice
    'SGP' : 'ECR',                      # Netherlands - Reformed Political Party
    'Renew Europe' : 'Renew Europe',    # -------- Renew Europe --------
    'PNV' : 'Renew Europe',             # Spain - Basque Nationalist Party
    'MoDem' : 'Renew Europe',           # France - Democratic Movement
    'RE' : 'Renew Europe',              # France - Renaissance
    'UDI' : 'Renew Europe',             # France - Union of Democrats and Independents
    'FDP' : 'Renew Europe',             # Germany - Free Democratic Party
    'FW' : 'Renew Europe',              # Germany - Free Voters
    'VVD' : 'Renew Europe',             # Netherlands - People's Party for Freedom and Democracy
    'D66' : 'Renew Europe',             # Netherlands - Democrats 66
    'PL2050' : 'Renew Europe',          # Poland - Poland 2050
    'Greens' : 'Greens',                # -------- The Greens --------
    'EFA' : 'Greens',                   # -------- European Free Alliance --------
    'ERC' : 'Greens',                   # Spain - Republican Left of Catalonia
    'BNG' : 'Greens',                   # Spain - Galician Nationalist Bloc
    'EGP' : 'Greens',                   # European Green Party 
    'LE' : 'Greens',                    # France - Les Écologistes - The Greens 
    'The greens' : 'Greens',            # Germany - The Greens
    'GL' : 'Greens',                    # Netherlands - GroenLinks
    'The Left' : 'The Left',            # -------- The Left --------
    'Podemos' : 'The Left',             # Spain - Podemos
    'IU' : 'The Left',                  # Spain - United Left
    'Sumar' : 'The Left',               # Spain - Sumar
    'SMR' : 'The Left',                 # Spain - Sumar
    'Bildu' : 'The Left',               # Spain - Bildu
    'LFI' : 'The Left',                 # France - La France Insoumise
    'Die Linke' : 'The Left',           # Germany - The Left
    'Left party' : 'The Left',          # Germany - The Left
    'Left Party' : 'The Left',          # Germany - The Left - spelling used in the `left` list
    'PvdD' : 'The Left',                # Netherlands - Party for the Animals
    'ESN' : 'ESN',                      # -------- Europe of Sovereign Nations --------
    'RECONQUÊTE' : 'ESN',               # France - Reconquête
    'R!' : 'ESN',                       # France - Reconquête
    'AfD' : 'ESN',                      # Germany - Alternative for Germany
    'NN' : 'ESN',                       # Netherlands - New Hope
}

In [None]:
# Flatten all mentions from the whole dataframe.
# A single pass replaces sum(lists, []), which copies the growing list on every
# step and raises TypeError as soon as one cell is not a list (e.g. NaN).
all_mentions_flat = [
    party
    for parties in df['parties_mentioned']
    if isinstance(parties, list)
    for party in parties
]
all_party_mentions = Counter(all_mentions_flat)

sorted_party_mentions = all_party_mentions.most_common()

for party, mentions in sorted_party_mentions[:20]:
    # A mention that is itself a group label maps to that group; otherwise fall
    # back to the party -> group table (None when the party is not listed).
    if party in election_results:
        group = party
    else:
        group = party_group_mapping.get(party)

    if group:
        seats = election_results.get(group, "No data")
        print(f"{party} - Mentions: {mentions}, Group: {group}, Seats: {seats}")
    else:
        print(f"{party} - Mentions: {mentions}, Group: Unknown, Seats: No data")


In [None]:
# country -> Counter of party mentions, one increment per list element.
country_party_counts = defaultdict(Counter)
for _, row in df.iterrows():
    parties = row['parties_mentioned']
    # Non-list cells add nothing; empty lists are skipped too so that they do
    # not create an (empty) entry for the country.
    if not isinstance(parties, list) or not parties:
        continue
    country_party_counts[row['countries']].update(parties)

for country, party_counter in country_party_counts.items():
    ranking_lines = [
        f"  {party}: {count} mentions"
        for party, count in party_counter.most_common(10)
    ]
    print(f"\nTop 10 parties in {country}:", *ranking_lines, sep="\n")

In [None]:
# country -> Counter of party mentions.
country_party_counts = defaultdict(Counter)

for _, row in df.iterrows():
    countries = row['countries']
    parties = row['parties_mentioned']

    # Skip rows without a usable country string or party list; iterating over a
    # missing (NaN) country would raise TypeError.
    if not isinstance(countries, str) or not isinstance(parties, list):
        continue

    # A cell may hold several comma-separated countries; credit each of them.
    for country in (c.strip() for c in countries.split(',')):
        for party in parties:
            country_party_counts[country][party] += 1

if country_party_counts:
    num_countries = len(country_party_counts)
    num_cols = 3
    num_rows = (num_countries + num_cols - 1) // num_cols  # ceiling division

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 5 * num_rows))
    axes = axes.flatten()

    # `country_counter` (not `party_counts`) so the notebook-level Counter of
    # all mentions built in an earlier cell is not overwritten.
    for ax, (country, country_counter) in zip(axes, country_party_counts.items()):
        top_parties = country_counter.most_common(10)
        labels = [p[0] for p in top_parties]
        sizes = [p[1] for p in top_parties]

        ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, textprops={'fontsize': 9})
        ax.axis('equal')  # Equal aspect ratio keeps the pie circular
        ax.set_title(f"{country}", fontsize=12)

    # Remove the grid cells that received no country.
    for unused_ax in axes[num_countries:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() cannot build a grid with zero rows.
    print("No party data available to plot.")

In [None]:
# Counter and defaultdict come from the notebook's import cell; no re-import here.
# NOTE(review): country_map must already exist when this cell runs; the only
# definition visible in this part of the notebook is in a later cell - confirm
# an earlier cell provides it.

# Display name of the country -> Counter of party mentions.
country_party_counts = defaultdict(Counter)

for _, row in df.iterrows():
    countries = row['countries']
    parties = row['parties_mentioned']

    # Skip rows without a usable country string or party list; iterating over a
    # missing (NaN) country would raise TypeError.
    if not isinstance(countries, str) or not isinstance(parties, list):
        continue

    for country_code in (c.strip() for c in countries.split(',')):
        # Use the mapped name when there is one, otherwise the raw code.
        country = country_map.get(country_code, country_code)
        for party in parties:
            country_party_counts[country][party] += 1

if country_party_counts:
    num_countries = len(country_party_counts)
    num_cols = 3
    num_rows = (num_countries + num_cols - 1) // num_cols  # ceiling division

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 5 * num_rows))
    axes = axes.flatten()

    # `country_counter` (not `party_counts`) so the notebook-level Counter of
    # all mentions built in an earlier cell is not overwritten.
    for ax, (country, country_counter) in zip(axes, country_party_counts.items()):
        top_parties = country_counter.most_common(10)
        labels = [p[0] for p in top_parties]
        sizes = [p[1] for p in top_parties]
        color = color_map.get(country, 'gray')
        # Reversed so the most-mentioned party is drawn last, i.e. at the top.
        ax.barh(labels[::-1], sizes[::-1], color=color)
        ax.set_title(country, fontsize=12)
        ax.set_xlabel('Mentions')
        ax.grid(axis='x', linestyle='--', alpha=0.7)

    # Remove the grid cells that received no country.
    for unused_ax in axes[num_countries:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() cannot build a grid with zero rows.
    print("No party data available to plot.")

In [None]:
# Two-letter country code -> display name (the display names are the keys of color_map).
country_map = dict(
    es='Spain',
    nl='Netherlands',
    fr='France',
    pl='Poland',
    de='Germany',
)

In [None]:
# Lower-cased spelling variant -> canonical party name.
party_name_map = {
    **dict.fromkeys(['the greens', 'greens'], 'The Greens'),
    **dict.fromkeys(['law and justice', 'pis', 'law and justice and pis'], 'Law and Justice'),
}

def normalize_party_name(name):
    """Return the canonical spelling of a party name.

    The lookup is case-insensitive; a name with no entry in party_name_map is
    returned exactly as given (original capitalisation included).
    """
    canonical = party_name_map.get(name.lower())
    if canonical is None:
        return name
    return canonical

# Display name of the country -> Counter of normalized party names.
country_party_counts = defaultdict(Counter)

for _, row in df.iterrows():
    parties = row['parties_mentioned']
    if not isinstance(parties, list):
        continue

    country_code = row['countries']
    # Use the mapped name when there is one, otherwise the raw value.
    country = country_map.get(country_code, country_code)
    for party in parties:
        country_party_counts[country][normalize_party_name(party)] += 1

# Plot
if not country_party_counts:
    print("No party data available to plot.")
else:
    num_countries = len(country_party_counts)
    num_cols = 3
    num_rows = -(-num_countries // num_cols)  # ceiling division

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 6 * num_rows))
    axes = axes.flatten()

    for ax, (country, party_counts) in zip(axes, country_party_counts.items()):
        # Percentages are relative to all of the country's mentions, not just the top 10.
        total_mentions = sum(party_counts.values())

        # Reversed so the most-mentioned party is drawn last, i.e. at the top.
        ranked = party_counts.most_common(10)[::-1]
        labels = [name for name, _ in ranked]
        sizes = [mentions for _, mentions in ranked]

        # Red for the left list, blue for the right list, gray for neither.
        colors = [
            'red' if name in left else 'blue' if name in right else 'gray'
            for name in labels
        ]

        ax.barh(labels, sizes, color=colors)
        ax.set_title(country, fontsize=16)
        ax.set_xlabel('Mentions', fontsize=16)
        ax.set_ylabel('Political Parties', fontsize=16)
        ax.tick_params(axis='both', which='major', labelsize=14)
        ax.grid(axis='x', linestyle='--', alpha=0.7)

        # Percentage label just past the end of each bar.
        for bar_position, count in enumerate(sizes):
            percent = (count / total_mentions) * 100 if total_mentions else 0
            ax.text(count + 0.5, bar_position, f'{percent:.1f}%', va='center', fontsize=12, color='black')

    # Remove the grid cells that received no country.
    for spare_ax in axes[num_countries:]:
        fig.delaxes(spare_ax)

    fig.suptitle('Most mentioned political entities per country', fontsize=22)
    # Leave the top 4% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

- **Which political wings are mentioned the most depending on the topic?**

In [None]:
# Rows where at least one of the two wing shares is strictly positive.
mentions_any_wing = df['% left'].gt(0) | df['% right'].gt(0)
filtered_df = df.loc[mentions_any_wing]

# Per-topic mean of both shares, largest left share first.
mean_ideology_by_topic = (
    filtered_df
    .groupby('topic')[['% left', '% right']]
    .mean()
    .sort_values('% left', ascending=False)
)

print("Average percentage of left and right mentions by topic (excluding no-mention entries):")
print(mean_ideology_by_topic)

- **Conclusions of this part**

# Individual sentiments analysis

- **Correlation between sentiments**

In [None]:
# Columns entering the pairwise correlation matrix.
topics = [
    'subjectivity_score', 'pain', 'movement', 'negative_emotion', 'religion',
    'violence', 'government', 'independence', 'fear', 'trust', 'leader',
    'pro_stance', 'moral_dilemma', 'misinformation', 'human_rights',
    'abortion_rights', 'womens_rights'
]

# Rows with any missing value in these columns are left out.
topic_data = df[topics].dropna()
corr_matrix = topic_data.corr()

# Plot 
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    cbar_kws={"label": "Correlation Coefficient"},
    ax=ax,
)
ax.set_title("Correlation Matrix Between Topics")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

In [None]:
group_keys = ['countries', 'topic']

# Rows per (country, topic), and the rows among them with a positive pro_stance.
total_counts = df.groupby(group_keys).size().reset_index(name='total')
pro_stance_counts = (
    df[df['pro_stance'] > 0]
    .groupby(group_keys)
    .size()
    .reset_index(name='pro_stance_positive')
)

# Left join keeps pairs with no positive row; their missing count becomes 0.
merged = total_counts.merge(pro_stance_counts, on=group_keys, how='left')
merged['pro_stance_positive'] = merged['pro_stance_positive'].fillna(0)
merged['pro_stance_ratio'] = merged['pro_stance_positive'].div(merged['total'])

# Plot
pivot_ratio = merged.pivot(index='countries', columns='topic', values='pro_stance_ratio')
fig, ax = plt.subplots(figsize=(12, 6))
# Pairs absent from the data are drawn as 0.
sns.heatmap(pivot_ratio.fillna(0), annot=True, cmap='Greens', fmt=".2f", vmin=0, vmax=1, ax=ax)
ax.set_title("Ratio of Pro Stance Appearance by Country and Topic")
ax.set_xlabel("Topic")
ax.set_ylabel("Country")
fig.tight_layout()
plt.show()

- **Top 5 sentiments by country**

In [None]:
# Columns ranked by get_top_5_emotions.
emotion_columns = [
    "negative_emotion", "fear", "trust", "pain", "movement", "religion", "violence",
    "government", "independence", "leader", "pro_stance", "moral_dilemma", "misinformation",
    "human_rights", "abortion_rights", "war_justification", "womens_rights"
]

def get_top_5_emotions(group):
    """Rank the emotion columns of `group` by how many rows are strictly positive.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain every column listed in emotion_columns.

    Returns
    -------
    pandas.Series
        'top_5_emotions': names of the five columns with the highest counts,
        highest first; 'top_5_values': the matching counts as a plain list.
    """
    positive_rows = group[emotion_columns].gt(0).sum()
    leaders = positive_rows.sort_values(ascending=False).iloc[:5]

    return pd.Series({
        'top_5_emotions': leaders.index.tolist(),
        'top_5_values': leaders.values.tolist(),
    })

# Rank the emotion columns inside every (country, topic) group. Selecting the
# emotion columns first keeps the grouping columns out of the frames handed to
# the function.
top_emotions = (
    df.groupby(['countries', 'topic'])[emotion_columns]
    .apply(get_top_5_emotions)
    .reset_index()
)

# Join each ranking into one string before pivoting. The pivot leaves NaN for
# every country/topic pair that does not occur in the data, and ", ".join(NaN)
# would raise, so those cells are filled with an empty string afterwards.
top_emotions_table = (
    top_emotions
    .assign(top_5_label=top_emotions['top_5_emotions'].apply(", ".join))
    .pivot(index="countries", columns="topic", values="top_5_label")
    .fillna("")
)

# Plot
# Render the table as a figure: no axes, only the table and a title.
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('tight')
ax.axis('off')
table = ax.table(cellText=top_emotions_table.values,
                rowLabels=top_emotions_table.index,
                colLabels=top_emotions_table.columns,
                loc='center', cellLoc='center')
# Fixed font size, then enlarge the cells so the joined names fit.
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1.5, 1.5)
plt.title("Top 5 Emotions by Country and Topic")
plt.show()
