In [19]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [20]:
# Load the CSV that provides the "journal" and "topic_distribution" columns
# used throughout the notebook.
topic_csv_path = 'topic_dis.csv'
df = pd.read_csv(topic_csv_path)

In [21]:
def convert_string_to_list(string):
    """Parse a stringified numeric vector into a list of floats.

    Accepts text such as ``"[0.1 0.2  0.7]"``: square brackets are stripped and
    the remaining tokens, separated by any run of whitespace (including
    newlines) and/or commas, are converted with ``float``.

    Parameters
    ----------
    string : str | list | tuple
        The text to parse. A list/tuple is accepted as well and is simply
        normalised to floats, so applying this function twice to the same
        column (e.g. re-running the cell) does not fail.

    Returns
    -------
    list[float]
        The parsed values; an empty list for ``"[]"`` or blank input.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    if isinstance(string, (list, tuple)):
        # Already parsed: keep the function idempotent instead of crashing on .replace().
        return [float(value) for value in string]

    cleaned = string.replace("[", "").replace("]", "").replace(",", " ")
    # split() with no argument collapses whitespace runs and returns [] for blank
    # input, whereas split(" ") would return [''] and make float('') raise.
    return [float(token) for token in cleaned.split()]

# Replace each stringified vector in the column with its parsed list of floats.
# NOTE(review): the column is overwritten in place, so running this cell a second
# time feeds already-parsed lists (not strings) back into the parser.
df['topic_distribution'] = df['topic_distribution'].map(convert_string_to_list)

In [23]:
# Split the rows into one DataFrame per value of the "journal" column.
# The labels are matched literally (note the trailing space in "Asharq Alawsat ").
df_ny_times = df.loc[df["journal"].eq("NY Times")]
df_jerusalem = df.loc[df["journal"].eq("Jerusalem")]
df_guardian = df.loc[df["journal"].eq("Files(The Guardian_ UK)")]
df_asharq_alawsat = df.loc[df["journal"].eq("Asharq Alawsat ")]

In [24]:
# Pull each journal's "topic_distribution" column out as a plain Python list
# (one entry per row of the corresponding per-journal DataFrame).
topic_distribution_ny_times = df_ny_times["topic_distribution"].tolist()
topic_distribution_jerusalem = df_jerusalem["topic_distribution"].tolist()
topic_distribution_guardian = df_guardian["topic_distribution"].tolist()
topic_distribution_asharq_alawsat = df_asharq_alawsat["topic_distribution"].tolist()

In [25]:
# One-way ANOVA across the four journal samples, run separately for each of the
# three positions in the topic-distribution vectors.
from scipy.stats import f_oneway


def _topic_share(articles, position):
    """Return the value at `position` from every topic distribution in `articles`."""
    return [distribution[position] for distribution in articles]


# Topic 1 (position 0)
topic_1_ny_times = _topic_share(topic_distribution_ny_times, 0)
topic_1_jerusalem = _topic_share(topic_distribution_jerusalem, 0)
topic_1_guardian = _topic_share(topic_distribution_guardian, 0)
topic_1_asharq_alawsat = _topic_share(topic_distribution_asharq_alawsat, 0)

f_value_topic_1, p_value_topic_1 = f_oneway(
    topic_1_ny_times,
    topic_1_jerusalem,
    topic_1_guardian,
    topic_1_asharq_alawsat,
)

# Topic 2 (position 1)
topic_2_ny_times = _topic_share(topic_distribution_ny_times, 1)
topic_2_jerusalem = _topic_share(topic_distribution_jerusalem, 1)
topic_2_guardian = _topic_share(topic_distribution_guardian, 1)
topic_2_asharq_alawsat = _topic_share(topic_distribution_asharq_alawsat, 1)

f_value_topic_2, p_value_topic_2 = f_oneway(
    topic_2_ny_times,
    topic_2_jerusalem,
    topic_2_guardian,
    topic_2_asharq_alawsat,
)

# Topic 3 (position 2)
topic_3_ny_times = _topic_share(topic_distribution_ny_times, 2)
topic_3_jerusalem = _topic_share(topic_distribution_jerusalem, 2)
topic_3_guardian = _topic_share(topic_distribution_guardian, 2)
topic_3_asharq_alawsat = _topic_share(topic_distribution_asharq_alawsat, 2)

f_value_topic_3, p_value_topic_3 = f_oneway(
    topic_3_ny_times,
    topic_3_jerusalem,
    topic_3_guardian,
    topic_3_asharq_alawsat,
)

# Report the F statistic and p-value for each topic.
print("Topic 1: F-value: ", f_value_topic_1, "P-value: ", p_value_topic_1)
print("Topic 2: F-value: ", f_value_topic_2, "P-value: ", p_value_topic_2)
print("Topic 3: F-value: ", f_value_topic_3, "P-value: ", p_value_topic_3)

Topic 1: F-value:  13.103472887745056 P-value:  3.628524576174375e-08
Topic 2: F-value:  43.541369016140884 P-value:  2.8918653134817846e-24
Topic 3: F-value:  13.028132595527358 P-value:  4.0106820740696105e-08


In [26]:
import numpy as np
from scipy.stats import f_oneway
import pandas as pd

def calculate_means(topic_distribution, index):
    """Compute the mean value of one topic for every group of distributions.

    The previous implementation ignored ``topic_distribution`` and read four
    module-level lists instead, so the result never depended on the argument.
    The groups are now taken from the argument itself.

    Parameters
    ----------
    topic_distribution : sequence of sequences
        One entry per group (journal); each group is a sequence of topic
        distributions, and each distribution is indexable by ``index``.
    index : int
        Position of the topic inside each distribution.

    Returns
    -------
    list
        One mean per group, in the same order as ``topic_distribution``.
    """
    return [
        np.mean([distribution[index] for distribution in group])
        for group in topic_distribution
    ]

def run_anova(topic_distributions, index):
    """Run a one-way ANOVA on a single topic across the supplied groups.

    The previous implementation ignored ``topic_distributions`` and always
    tested four module-level lists; the samples are now built from the argument.

    Parameters
    ----------
    topic_distributions : sequence of sequences
        One entry per group (journal); each group is a sequence of topic
        distributions, and each distribution is indexable by ``index``.
    index : int
        Position of the topic inside each distribution.

    Returns
    -------
    The result object of ``scipy.stats.f_oneway``, which unpacks as
    ``(F statistic, p-value)``.
    """
    samples = [
        [distribution[index] for distribution in group]
        for group in topic_distributions
    ]
    # f_oneway takes each sample as a separate positional argument.
    return f_oneway(*samples)

def generate_results_table():
    """Assemble the per-topic ANOVA summary as a DataFrame.

    For each of the three topics this calls ``run_anova`` and
    ``calculate_means`` on the module-level ``topic_distributions`` and records
    the F statistic, the p-value, whether the p-value is below 0.05, and the
    four per-journal means.

    Returns
    -------
    pandas.DataFrame
        One row per topic with the columns 'Topic', 'F-value', 'P-value',
        'Is statistically significant' and one 'Mean ...' column per journal.
    """
    topics = ['Humanitarianism', 'Politics', 'Military']
    mean_columns = [
        'Mean NY Times',
        'Mean Jerusalem',
        'Mean Guardian',
        'Mean Asharq Alawsat',
    ]

    # Column order here is the column order of the resulting table.
    columns = {
        'Topic': topics,
        'F-value': [],
        'P-value': [],
        'Is statistically significant': [],
    }
    for column_name in mean_columns:
        columns[column_name] = []

    for topic_index in range(len(topics)):
        f_stat, p_val = run_anova(topic_distributions, topic_index)
        group_means = calculate_means(topic_distributions, topic_index)

        columns['F-value'].append(f_stat)
        columns['P-value'].append(p_val)
        columns['Is statistically significant'].append(p_val < 0.05)
        # Means are positional: entry k belongs to the k-th journal column.
        for position, column_name in enumerate(mean_columns):
            columns[column_name].append(group_means[position])

    return pd.DataFrame(columns)

# Per-journal samples in a fixed order: NY Times, Jerusalem, Guardian, Asharq Alawsat.
topic_distributions = [
    topic_distribution_ny_times,
    topic_distribution_jerusalem,
    topic_distribution_guardian,
    topic_distribution_asharq_alawsat,
]

# Build and show the ANOVA summary table.
# NOTE(review): this rebinds `df`, which previously held the data loaded from the
# CSV; cells that filter `df` by "journal" will not work if re-run after this one.
df = generate_results_table()
display(df)

Unnamed: 0,Topic,F-value,P-value,Is statistically significant,Mean NY Times,Mean Jerusalem,Mean Guardian,Mean Asharq Alawsat
0,Humanitarianism,13.103473,3.628525e-08,True,0.35692,0.228443,0.394785,0.445626
1,Politics,43.541369,2.891865e-24,True,0.358665,0.641296,0.421414,0.257064
2,Military,13.028133,4.010682e-08,True,0.284415,0.130261,0.183801,0.29731


# Post-Hoc Analysis

In [39]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Post-hoc helper: Tukey HSD across the journals for a single topic
def run_tukey_hsd_for_topic(topic):
    """Run a pairwise Tukey HSD test comparing the journals on one topic.

    Values are read from the module-level ``topic_distributions``; the k-th
    group there is labelled with the k-th journal name below.

    Parameters
    ----------
    topic : str
        One of 'Humanitarianism', 'Politics' or 'Military'.

    Returns
    -------
    The result of ``pairwise_tukeyhsd`` called with ``alpha=0.05``.

    Raises
    ------
    ValueError
        If ``topic`` is not one of the three known topic names.
    """
    topic_names = ['Humanitarianism', 'Politics', 'Military']
    journal_names = ["NY Times", "Jerusalem", "Guardian", "Asharq Alawsat"]
    column = topic_names.index(topic)

    observations = []
    group_labels = []
    for position, articles in enumerate(topic_distributions):
        values = [article[column] for article in articles]
        observations.extend(values)
        # One label per observation so that endog and groups stay aligned.
        group_labels.extend(journal_names[position] for _ in values)

    return pairwise_tukeyhsd(endog=observations, groups=group_labels, alpha=0.05)


Tukey HSD results for Humanitarianism:
      Multiple Comparison of Means - Tukey HSD, FWER=0.05      
    group1       group2  meandiff p-adj   lower   upper  reject
---------------------------------------------------------------
Asharq Alawsat  Guardian  -0.0508 0.5309 -0.1479  0.0463  False
Asharq Alawsat Jerusalem  -0.2172    0.0 -0.3134 -0.1209   True
Asharq Alawsat  NY Times  -0.0887 0.0802 -0.1844   0.007  False
      Guardian Jerusalem  -0.1663    0.0  -0.257 -0.0757   True
      Guardian  NY Times  -0.0379 0.6988 -0.1279  0.0522  False
     Jerusalem  NY Times   0.1285 0.0013  0.0393  0.2176   True
---------------------------------------------------------------


In [None]:
# Post-hoc Tukey HSD report for each topic, printed in the order
# "Humanitarianism", "Politics", "Military".
for topic in ['Humanitarianism', 'Politics', 'Military']:
    tukey_result = run_tukey_hsd_for_topic(topic)
    print(f"\nTukey HSD results for {topic}:")
    print(tukey_result)