Import the dataset

In [None]:
import pandas as pd
# Read the dataset (path is relative to the working directory) and preview it.
DATASET_PATH = "final_dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

In [None]:
# Print a summary of the loaded frame: column names, dtypes and non-null counts.
df.info()

# Correlation Matrix

Discard the non-numeric columns before generating the correlation matrix.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Restrict the frame to its numeric columns before correlating
numeric_df = df.select_dtypes(include=['number'])

# Pairwise Spearman (rank) correlation is used here instead of the default Pearson
correlation_matrix = numeric_df.corr(method="spearman")

# Render the matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix Heatmap")
plt.show()


# Correlate each feature with the political ideology of the content

In [None]:
import pandas as pd

# Encode ideology numerically so it can be correlated: left = -1, right = 1.
# Labels other than exactly "left"/"right" become NaN, and .corr() ignores
# NaN values pairwise, so such rows simply do not contribute.
df["ideology_encoded"] = df["ideology"].map({"left": -1, "right": 1})

# Numeric feature columns to correlate against the encoded ideology
features = [
    "sentiment_score", "subjectivity", "pain", "movement", "negative_emotion", 
    "religion", "violence", "government", "independence", "fear", "trust", 
    "leader", "pro_stance", "moral_dilemma", "misinformation", "human_rights", 
    "abortion_rights", "war_justification", "womens_rights", "likes", "followers"
]

# Correlation of every feature with ideology, strongest "right" association first.
# The target's correlation with itself is dropped: it is trivially 1.0, carries
# no information, and would otherwise dominate the ranking and any chart scale.
correlation_with_ideology = (
    df[features + ["ideology_encoded"]]
    .corr()["ideology_encoded"]
    .drop("ideology_encoded")
    .sort_values(ascending=False)
)

# Display correlation results
print(correlation_with_ideology)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bar chart of each feature's correlation with the encoded ideology
fig, ax = plt.subplots(figsize=(10, 8))

sns.barplot(
    x=correlation_with_ideology.values,
    y=correlation_with_ideology.index,
    hue=correlation_with_ideology.index,  # colour each bar by its feature
    palette="coolwarm",
    dodge=False,  # keep one bar per row instead of offsetting by hue level
    ax=ax,
)

# Title, axis labels, and a dashed zero line separating negative from positive values
ax.set_title("Correlation Between Features and Ideology (Left = -1, Right = 1)", fontsize=14)
ax.set_xlabel("Correlation Coefficient", fontsize=12)
ax.set_ylabel("Features", fontsize=12)
ax.axvline(x=0, color="black", linestyle="--")
ax.grid(axis="x", linestyle="--", alpha=0.6)

plt.show()


# Visibility order and topics

In [None]:
# Re-check the frame's columns and dtypes before the visibility-order analysis.
df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# List of features to correlate with average_order
values_to_relate = [
    "sentiment_score", "subjectivity", "pain", "movement", "negative_emotion", 
    "religion", "violence", "government", "independence", "fear", "trust", 
    "leader", "pro_stance", "moral_dilemma", "misinformation", "human_rights", 
    "abortion_rights", "war_justification", "womens_rights", "likes", "followers"
]

# Correlation of every feature with average_order, sorted from most positive
# to most negative. The target's self-correlation (trivially 1.0) is dropped
# so it does not appear as an uninformative bar that stretches the x-axis.
correlation_with_average_order = (
    df[values_to_relate + ["average_order"]]
    .corr()["average_order"]
    .drop("average_order")
    .sort_values(ascending=False)
)

# Display correlation results
print(correlation_with_average_order)

# Plot correlations with average_order as a horizontal bar chart.
# `hue` is assigned explicitly (with dodge=False), matching the ideology chart
# earlier in the notebook: passing `palette` without `hue` is deprecated in
# recent seaborn releases.
plt.figure(figsize=(10, 8))
sns.barplot(
    x=correlation_with_average_order.values, 
    y=correlation_with_average_order.index, 
    hue=correlation_with_average_order.index,
    palette="coolwarm",
    dodge=False
)

# Add title and labels
plt.title("Correlation Between Features and Average Order", fontsize=14)
plt.xlabel("Correlation Coefficient", fontsize=12)
plt.ylabel("Features", fontsize=12)
plt.axvline(x=0, color="black", linestyle="--")  # Reference line at 0
plt.grid(axis="x", linestyle="--", alpha=0.6)

# Show the plot
plt.show()

# TODO: Remove the non-significant ones (presumably features whose correlation is not significant)


# How does each country talk about each topic?

Top 1 per country and topic

In [None]:
# Score columns that are compared against each other within a group.
# The order matters: ties in the mean are resolved in favour of the earlier column.
emotion_columns = [
    "negative_emotion",
    "fear",
    "trust",
    "pain",
    "movement",
    "religion",
    "violence",
    "government",
    "independence",
    "leader",
    "pro_stance",
    "moral_dilemma",
    "misinformation",
    "human_rights",
    "abortion_rights",
    "war_justification",
    "womens_rights",
]


def get_most_common_emotion(group):
    """Return the emotion column with the highest mean within ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain every column in ``emotion_columns``.

    Returns
    -------
    pandas.Series
        ``most_common_emotion`` holds the name of the column with the largest
        mean, ``emotion_value`` holds that mean.
    """
    averages = group.loc[:, emotion_columns].mean()
    return pd.Series({
        'most_common_emotion': averages.idxmax(),
        'emotion_value': averages.max(),
    })

# Find the dominant emotion for every (country, topic) pair.
# The emotion columns are selected on the groupby itself so the grouping keys
# are never handed to the function; applying to the full frame triggers a
# deprecation warning in recent pandas releases.
common_emotions = (
    df.groupby(['countries', 'topic'])[emotion_columns]
    .apply(get_most_common_emotion)
    .reset_index()
)

# Country x topic table of emotion names, used as the heatmap's cell annotations
pivot_table = common_emotions.pivot(index="countries", columns="topic", values="most_common_emotion")

# Create a numeric mapping for the emotions (only for heatmap coloring purposes;
# the codes are just positions in emotion_columns and have no ordinal meaning)
emotion_map = {emotion: idx for idx, emotion in enumerate(emotion_columns)}

# Map the emotions in the common_emotions DataFrame to their numeric values for visualization
common_emotions['emotion_numeric'] = common_emotions['most_common_emotion'].map(emotion_map)

# Country x topic table of numeric codes, used as the heatmap's data
pivot_table_numeric = common_emotions.pivot(index="countries", columns="topic", values="emotion_numeric")

# Visualize as a heatmap: colors come from the numeric codes, labels from the names.
# fmt="s" is required because the annotations are strings rather than numbers.
plt.figure(figsize=(12, 8))
sns.heatmap(pivot_table_numeric, annot=pivot_table, cmap="coolwarm", cbar=True, fmt="s", 
            linewidths=0.5, annot_kws={"size": 8})
plt.title("Most Common Emotion by Country and Topic")
plt.xlabel("Topic")
plt.ylabel("Country")
plt.show()


Top 5 per country and topic

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Score columns that are ranked against each other within a group
emotion_columns = [
    "negative_emotion",
    "fear",
    "trust",
    "pain",
    "movement",
    "religion",
    "violence",
    "government",
    "independence",
    "leader",
    "pro_stance",
    "moral_dilemma",
    "misinformation",
    "human_rights",
    "abortion_rights",
    "war_justification",
    "womens_rights",
]


def get_top_5_emotions(group):
    """Return the five emotion columns with the highest mean within ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain every column in ``emotion_columns``.

    Returns
    -------
    pandas.Series
        ``top_5_emotions`` is a list of column names ordered by descending
        mean, ``top_5_values`` is the list of the corresponding means.
    """
    averages = group.loc[:, emotion_columns].mean()
    ranked = averages.sort_values(ascending=False).iloc[:5]
    return pd.Series({
        'top_5_emotions': list(ranked.index),
        'top_5_values': ranked.tolist(),
    })

# Rank the emotions for every (country, topic) pair.
# The emotion columns are selected on the groupby itself so the grouping keys
# are never handed to the function; applying to the full frame triggers a
# deprecation warning in recent pandas releases.
top_emotions = (
    df.groupby(['countries', 'topic'])[emotion_columns]
    .apply(get_top_5_emotions)
    .reset_index()
)

# Build the country x topic table of display strings.
# Each list is joined into a comma-separated string *before* pivoting, and
# pairs with no data are shown as empty cells. Joining after the pivot would
# fail on the NaN cells created for missing (country, topic) combinations,
# and the element-wise `applymap` is deprecated in recent pandas.
top_emotions_table = (
    top_emotions
    .assign(top_5_label=top_emotions["top_5_emotions"].str.join(", "))
    .pivot(index="countries", columns="topic", values="top_5_label")
    .fillna("")
)

# Render the table on an otherwise empty Axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('tight')
ax.axis('off')

# Create a table within the plot
table = ax.table(cellText=top_emotions_table.values,
                rowLabels=top_emotions_table.index,
                colLabels=top_emotions_table.columns,
                loc='center', cellLoc='center')

# Fix the font size manually (auto-sizing is turned off) and enlarge the cells
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1.5, 1.5)

plt.title("Top 5 Emotions by Country and Topic")
plt.show()
