# Imports

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr

# Load dataset

In [None]:
# Load Final_TikTok.csv; the relative path is resolved against the current
# working directory of the kernel.
dataset_path = "../dataset/Final_TikTok.csv"
df = pd.read_csv(dataset_path)

# Engagement analysis

- **Sentiment Score vs Engagement: All data**

In [None]:
# Keep only rows where the sentiment score and all three engagement metrics
# are present, so every panel is drawn from the same set of rows.
cols = ['sentiment_score_compound', 'likes', 'followers', 'average_order']
subset = df[cols].dropna()

x_vars = ['average_order', 'likes', 'followers']
colors = ['teal', 'orange', 'purple']
titles = ['Average Order', 'Likes', 'Followers']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One regression panel per engagement metric; the title carries Pearson r and p.
for i, (ax, x) in enumerate(zip(axes, x_vars)):
    r, p = pearsonr(subset[x], subset['sentiment_score_compound'])
    panel_color = colors[i]
    sns.regplot(
        data=subset,
        x=x,
        y='sentiment_score_compound',
        ax=ax,
        color=panel_color,
        scatter_kws={'alpha': 0.3},
        line_kws={'color': panel_color},
    )
    ax.set_title(f'Sentiment vs. {titles[i]}\nPearson r = {r:.2f}, p = {p:.4f}')
    ax.set_xlabel(titles[i])
    # Only the left-most panel gets a y-axis label.
    ax.set_ylabel('Sentiment Score' if i == 0 else '')

plt.tight_layout()
plt.show()


In [None]:
# Same complete-case subset as the plots: sentiment plus the three metrics.
cols = ['sentiment_score_compound', 'likes', 'followers', 'average_order']
subset = df[cols].dropna()

# Pearson r and p-value of the sentiment score against each engagement metric.
sentiment = subset['sentiment_score_compound']
correlation_results = {}
for col in ('likes', 'followers', 'average_order'):
    r, p = pearsonr(sentiment, subset[col])
    correlation_results[col] = dict(r=r, p_value=p)

# Report the results, one metric per line.
print("Pearson Correlation Results with 'sentiment_score_compound':")
report_lines = [
    f"{feature}: r = {result['r']:.3f}, p = {result['p_value']:.4f}"
    for feature, result in correlation_results.items()
]
print("\n".join(report_lines))


- **Sentiment Score vs Engagement: outliers removed with the 1.5 × IQR rule (Q1 = 25th percentile, Q3 = 75th percentile)**

In [None]:
# Complete-case rows for the sentiment score and the three engagement metrics.
cols = ['sentiment_score_compound', 'likes', 'followers', 'average_order']
subset = df.loc[:, cols].dropna()

# Function to remove outliers using IQR
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive), where Q1/Q3 are the 25th/75th
    percentiles. Columns are processed in the order given and each filter is
    applied to the rows that survived the previous one, so the quartiles of
    later columns are computed on already-filtered data.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    columns : iterable of str
        Names of the numeric columns to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences. The default keeps
        the behaviour of the original two-argument call.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that lie inside the fences of every listed column.
    """
    filtered_df = df.copy()
    for col in columns:
        q1, q3 = filtered_df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        # `between` is inclusive on both ends, matching `>= lower & <= upper`.
        inside_fences = filtered_df[col].between(q1 - factor * iqr, q3 + factor * iqr)
        filtered_df = filtered_df[inside_fences]
    return filtered_df

# Drop IQR outliers on the three engagement metrics (sentiment is left as is).
outlier_cols = ['likes', 'followers', 'average_order']
filtered_subset = remove_outliers_iqr(subset, outlier_cols)

# Pearson r / p-value of sentiment against each metric on the filtered rows.
filtered_sentiment = filtered_subset['sentiment_score_compound']
correlation_results = {}
for col in outlier_cols:
    r, p = pearsonr(filtered_sentiment, filtered_subset[col])
    correlation_results[col] = dict(r=r, p_value=p)

print("Pearson Correlation Results with 'sentiment_score_compound' (after outlier removal):")
for feature, result in correlation_results.items():
    print(f"{feature}: r = {result['r']:.3f}, p = {result['p_value']:.4f}")

# Plot: one regression panel per metric, in the order average_order, likes, followers.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = [
    ('average_order', 'Sentiment vs. Average Order'),
    ('likes', 'Sentiment vs. Likes'),
    ('followers', 'Sentiment vs. Followers'),
]
for ax, (x_col, panel_title) in zip(axes, panels):
    sns.regplot(
        data=filtered_subset,
        x=x_col,
        y='sentiment_score_compound',
        ax=ax,
        scatter_kws={'alpha':0.3},
    )
    ax.set_title(panel_title)
plt.tight_layout()
plt.show()


- **Relationship between political-wing mentions and engagement**

In [None]:
# Mean likes and followers for every distinct value of '% left'.
mean_by_ideology = df.groupby('% left')[['likes', 'followers']].mean().reset_index()
pct_left = mean_by_ideology['% left']

# Two side-by-side line plots: likes on the left, followers on the right.
fig, (ax_likes, ax_followers) = plt.subplots(1, 2, figsize=(12, 5))

ax_likes.plot(pct_left, mean_by_ideology['likes'], color='blue')
ax_likes.set_title('Mean Likes by % Left')
ax_likes.set_xlabel('% Left')
ax_likes.set_ylabel('Mean Likes')

ax_followers.plot(pct_left, mean_by_ideology['followers'], color='green')
ax_followers.set_title('Mean Followers by % Left')
ax_followers.set_xlabel('% Left')
ax_followers.set_ylabel('Mean Followers')

# NOTE(review): these correlations are computed on the per-group means, not on
# the individual rows of `df` -- confirm that this is the intended statistic.
r_likes, p_likes = pearsonr(pct_left, mean_by_ideology['likes'])
r_followers, p_followers = pearsonr(pct_left, mean_by_ideology['followers'])

print("Correlation with % left:")
print(f"Likes     → r = {r_likes:.3f}, p = {p_likes:.4f}")
print(f"Followers → r = {r_followers:.3f}, p = {p_followers:.4f}")
plt.tight_layout()
plt.show()

In [None]:
# Mean likes and followers for every distinct value of '% right'.
mean_by_ideology = df.groupby('% right')[['likes', 'followers']].mean().reset_index()
pct_right = mean_by_ideology['% right']

# Two side-by-side line plots: likes on the left, followers on the right.
fig, (ax_likes, ax_followers) = plt.subplots(1, 2, figsize=(12, 5))

ax_likes.plot(pct_right, mean_by_ideology['likes'], color='blue')
ax_likes.set_title('Mean Likes by % right')
ax_likes.set_xlabel('% right')
ax_likes.set_ylabel('Mean Likes')

ax_followers.plot(pct_right, mean_by_ideology['followers'], color='green')
ax_followers.set_title('Mean Followers by % right')
ax_followers.set_xlabel('% right')
ax_followers.set_ylabel('Mean Followers')

# NOTE(review): these correlations are computed on the per-group means, not on
# the individual rows of `df` -- confirm that this is the intended statistic.
r_likes, p_likes = pearsonr(pct_right, mean_by_ideology['likes'])
r_followers, p_followers = pearsonr(pct_right, mean_by_ideology['followers'])

print("Correlation with % right:")
print(f"Likes     → r = {r_likes:.3f}, p = {p_likes:.4f}")
print(f"Followers → r = {r_followers:.3f}, p = {p_followers:.4f}")
plt.tight_layout()
plt.show()

In [None]:
# Rows that have an ideology label and all three engagement metrics.
engagement_cols = ['ideology', 'likes', 'followers', 'average_order']
engagement_df = df.loc[:, engagement_cols].dropna()

def bootstrap_ci(data, n_iterations=1000, ci=95, random_state=None):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : pandas.Series
        Observations to resample (``.sample`` and ``.mean`` are called on it).
    n_iterations : int, default 1000
        Number of resamples drawn with replacement, each of size ``len(data)``.
    ci : float, default 95
        Confidence level in percent; the interval spans the
        ``(100 - ci) / 2`` and ``100 - (100 - ci) / 2`` percentiles.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for reproducible resampling. ``None`` keeps the
        original behaviour of drawing from pandas' default random source.

    Returns
    -------
    tuple
        ``(mean of the bootstrap means, lower bound, upper bound)``.
        An empty input yields ``(nan, nan, nan)``.
    """
    n = len(data)
    if n == 0:
        # Nothing to resample: report NaN for the estimate and both bounds.
        return np.nan, np.nan, np.nan

    # A single generator is shared by all iterations; handing the same integer
    # seed to every `.sample` call would produce identical resamples.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    boot_means = [
        data.sample(n=n, replace=True, random_state=rng).mean()
        for _ in range(n_iterations)
    ]
    tail = (100 - ci) / 2
    lower, upper = np.percentile(boot_means, [tail, 100 - tail])
    return np.mean(boot_means), lower, upper

metrics = ['likes', 'followers', 'average_order']

# bootstrap_ci resamples through pandas' default random source (NumPy's global
# state); seeding it makes the intervals identical on every "Run All".
np.random.seed(42)

# Bootstrap the mean of every metric within every ideology group.
# sort=False keeps the groups in order of first appearance.
boot_results = []
for ideology, ideology_rows in engagement_df.groupby('ideology', sort=False):
    for metric in metrics:
        mean_val, lower_ci, upper_ci = bootstrap_ci(ideology_rows[metric])
        boot_results.append({
            'ideology': ideology,
            'metric': metric,
            'mean': mean_val,
            'lower_ci': lower_ci,
            'upper_ci': upper_ci
        })

boot_df = pd.DataFrame(boot_results)

# Plot: one bar chart per metric with the bootstrap interval as error bars.
fig, axes = plt.subplots(1, 3, figsize=(20, 7), sharey=False)
colors = ['#66c2a5', '#fc8d62', '#8da0cb']
titles = ['Mean Likes', 'Mean Followers', 'Mean Average Order']

for i, metric in enumerate(metrics):
    ax = axes[i]
    data_to_plot = boot_df[boot_df['metric'] == metric]

    # Fix the bar order explicitly so bar k sits at x = k, and use a single
    # `color` (passing `palette` without `hue` is deprecated in seaborn).
    ideology_order = data_to_plot['ideology'].tolist()
    sns.barplot(data=data_to_plot, x='ideology', y='mean',
                order=ideology_order, color=colors[i], ax=ax)

    # Asymmetric error bars drawn at the numeric bar positions.
    means = data_to_plot['mean'].to_numpy()
    below = means - data_to_plot['lower_ci'].to_numpy()
    above = data_to_plot['upper_ci'].to_numpy() - means
    ax.errorbar(x=np.arange(len(ideology_order)), y=means,
                yerr=[below, above],
                fmt='none', c='black', capsize=5, capthick=1.5)

    ax.set_title(titles[i], fontsize=18)
    ax.set_xlabel('Ideology', fontsize=14)
    ax.set_ylabel(metric, fontsize=14)
    ax.tick_params(axis='both', which='major', labelsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Two-letter codes used in the 'countries' column -> country names.
country_map = dict(
    es='Spain',
    nl='Netherlands',
    fr='France',
    pl='Poland',
    de='Germany',
)

# Trim whitespace and lower-case the codes before the lookup; codes that are
# not in the map become NaN in the new 'country' column.
normalised_codes = df['countries'].str.strip().str.lower()
df['country'] = normalised_codes.map(country_map)

In [None]:
# Rows with an ideology label, a mapped country and all engagement metrics.
engagement_cols = ['ideology', 'likes', 'followers', 'average_order', 'country']
engagement_df = df[engagement_cols].dropna()
metrics = ['likes', 'followers', 'average_order']
titles = ['Likes', 'Followers', 'Average Order']
num_metrics = len(metrics)

fig, axes = plt.subplots(1, num_metrics, figsize=(6 * num_metrics, 6), sharey=False)
# plt.subplots returns a bare Axes when there is a single panel; wrap it so
# the loop can index uniformly.
panel_axes = np.atleast_1d(axes)

# One violin panel per metric, split by country within each ideology.
for i, metric in enumerate(metrics):
    ax = panel_axes[i]
    # NOTE(review): `split=True` with more than two hue levels may depend on
    # the installed seaborn version -- confirm against the number of
    # countries actually present in `engagement_df`.
    violin_options = dict(x='ideology', y=metric, hue='country', split=True, palette='Set2')
    sns.violinplot(data=engagement_df, ax=ax, **violin_options)
    ax.set_title(f'{titles[i]} by Ideology and Country')
    ax.set_xlabel('Ideology')
    ax.set_ylabel(metric)
    # Place the legend outside the plotting area, to the right of the panel.
    ax.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

- **Relating subjectivity score to engagement**

In [None]:
# Complete-case rows for the subjectivity score and the engagement metrics.
subjectivity_cols = ['subjectivity_score', 'average_order', 'likes', 'followers']
df_clean = df[subjectivity_cols].dropna()

# Series.corr defaults to the Pearson coefficient.
subjectivity = df_clean['subjectivity_score']
correlation_order = subjectivity.corr(df_clean['average_order'])
correlation_likes = subjectivity.corr(df_clean['likes'])
correlation_followers = subjectivity.corr(df_clean['followers'])

print(f"Correlation between subjectivity and order of appearance: {correlation_order:.3f}")
print(f"Correlation between subjectivity and likes: {correlation_likes:.3f}")
print(f"Correlation between subjectivity and followers: {correlation_followers:.3f}")

# One regression panel per metric: (column, title, x-axis label).
subj_panels = [
    ('average_order', 'Subjectivity vs Order of Appearance', 'Order of Appearance'),
    ('likes', 'Subjectivity vs Likes', 'Likes'),
    ('followers', 'Subjectivity vs Followers', 'Followers'),
]
subj_fig, subj_axes = plt.subplots(1, 3, figsize=(14, 6))
for subj_ax, (x_col, panel_title, x_label) in zip(subj_axes, subj_panels):
    sns.regplot(
        data=df_clean,
        x=x_col,
        y='subjectivity_score',
        ax=subj_ax,
        scatter_kws={'alpha':0.3},
        line_kws={'color': 'red'},
    )
    subj_ax.set_title(panel_title)
    subj_ax.set_xlabel(x_label)
    subj_ax.set_ylabel('Subjectivity')

plt.tight_layout()
plt.show()

- **Relating subjectivity score to engagement without outliers**

In [None]:
# Complete-case rows for the subjectivity score and the engagement metrics.
df_clean = df.loc[:, ['subjectivity_score', 'average_order', 'likes', 'followers']].dropna()

# NOTE: a helper with this name is also defined in an earlier cell of this
# notebook; this definition shadows it, so keep the two in sync.
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive), where Q1/Q3 are the 25th/75th
    percentiles. Columns are processed in the order given and each filter is
    applied to the rows that survived the previous one, so the quartiles of
    later columns are computed on already-filtered data.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    columns : iterable of str
        Names of the numeric columns to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences. The default keeps
        the behaviour of the original two-argument call.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that lie inside the fences of every listed column.
    """
    filtered_df = df.copy()
    for col in columns:
        q1, q3 = filtered_df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        # `between` is inclusive on both ends, matching `>= lower & <= upper`.
        inside_fences = filtered_df[col].between(q1 - factor * iqr, q3 + factor * iqr)
        filtered_df = filtered_df[inside_fences]
    return filtered_df

# Drop IQR outliers on the subjectivity score and on every engagement metric.
outlier_columns = ['subjectivity_score', 'average_order', 'likes', 'followers']
df_clean = remove_outliers_iqr(df_clean, outlier_columns)

# Pearson r / p-value of subjectivity against each metric on the filtered rows.
subjectivity = df_clean['subjectivity_score']
r_order, p_order = pearsonr(subjectivity, df_clean['average_order'])
r_likes, p_likes = pearsonr(subjectivity, df_clean['likes'])
r_followers, p_followers = pearsonr(subjectivity, df_clean['followers'])

# Print
print("Pearson correlation results:")
print(f"Order of Appearance → r = {r_order:.3f}, p = {p_order:.4f}")
print(f"Likes               → r = {r_likes:.3f}, p = {p_likes:.4f}")
print(f"Followers           → r = {r_followers:.3f}, p = {p_followers:.4f}")

# One regression panel per metric: (column, title, x-axis label).
subj_panels = [
    ('average_order', 'Subjectivity vs Order of Appearance', 'Order of Appearance'),
    ('likes', 'Subjectivity vs Likes', 'Likes'),
    ('followers', 'Subjectivity vs Followers', 'Followers'),
]
subj_fig, subj_axes = plt.subplots(1, 3, figsize=(14, 6))
for subj_ax, (x_col, panel_title, x_label) in zip(subj_axes, subj_panels):
    sns.regplot(
        data=df_clean,
        x=x_col,
        y='subjectivity_score',
        ax=subj_ax,
        scatter_kws={'alpha':0.3},
        line_kws={'color': 'red'},
    )
    subj_ax.set_title(panel_title)
    subj_ax.set_xlabel(x_label)
    subj_ax.set_ylabel('Subjectivity')

plt.tight_layout()
plt.show()