In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_1samp


In [3]:
subdirectory = 'merged_by_country_bert_processed_tweets'
merged_df = pd.DataFrame()

for filename in os.listdir(subdirectory):
    
    if filename.endswith('.csv'):
        file_path = os.path.join(subdirectory, filename)

        # Read the CSV file with UTF-8 encoding
        df = pd.read_csv(file_path, encoding='UTF-8', low_memory=False)

        merged_df = pd.concat([merged_df, df], ignore_index=True)

In [4]:
#BEGIN[ChatGPT][https://chat.openai.com/auth/login]

# I copy and pasted the ChatGPT to make a skeleton code and then I revised the code which did not work or weird

# command: We have a DataFrame (df) containing the Twitter users' stance on the Russia-Ukraine war. 
# The DataFrame has columns such as 'country' and 'sentiment' related to the war. 
# We want to identify the top 20 countries with the most users in the dataset and test the null hypothesis that people in these countries are not negatively biased towards the Russia-Ukraine war. 
# There will be a total of 20 null hypotheses.
# Also, in the 'sentiment' column, negative emotions are expressed as NEGATIVE and positive emotions as POSITIVE. Please convert negative sentiment to -1 and positive sentiment to 1

# explaination : This code analyzes the stance of Twitter users on the Russia-Ukraine war in a given DataFrame (df) containing columns like 'country' and 'sentiment'. 
# It identifies the top 20 countries with the most users in the dataset and tests the null hypothesis that people in these countries are not negatively biased towards the war.
# First, it converts the sentiments in the 'sentiment' column to -1 (negative) and 1 (positive). 
# Then, it calculates the number of users by country and selects the top 20 countries. 
# For each country, the code performs a null hypothesis test, calculates the negative sentiment ratio, and uses a t-test to verify the null hypothesis. 
# The results indicate whether users in each country are negative or not negative towards the war



# Convert sentiment values to -1 for 'NEGATIVE' and 1 for 'POSITIVE'
df['sentiment'] = df['sentiment'].map({'NEGATIVE': -1, 'POSITIVE': 1})

# Calculate the number of users by country and select the top 20 countries
top_20_countries = df['country'].value_counts().head(20).index.tolist()

# Initialize a counter for rejected null hypothesis
rejected_count = 0

# Perform null hypothesis testing for each country
for country in top_20_countries:
    country_df = df[df['country'] == country]
    negative_sentiment_ratio = country_df[country_df['sentiment'] == -1].shape[0] / country_df.shape[0]
    
    # Null hypothesis: The ratio of negative sentiment is less than 0.5 (not negative towards the war)
    # Alternative hypothesis: The ratio of negative sentiment is greater or equal to 0.5 (negative towards the war)
    t_statistic, p_value = ttest_1samp(country_df['sentiment'], 0.5)
    
    # Print results
    print(f"Country: {country}")
    print(f"Negative sentiment ratio: {negative_sentiment_ratio:.2f}")
    print(f"t-statistic: {t_statistic:.2f}, p-value: {p_value:.2f}")
    if p_value < 0.05:
        print("Reject null hypothesis: Users in this country are negative towards the war.\n")
        rejected_count += 1
    else:
        print("Accept null hypothesis: Users in this country are not negative towards the war.\n")

# Calculate the ratio of countries that rejected null hypothesis
rejected_ratio = rejected_count / len(top_20_countries)
print(f"The ratio of countries rejecting null hypothesis: {rejected_ratio:.2f}")

#END[ChatGPT]


Country: United States
Negative sentiment ratio: 0.74
t-statistic: -18.33, p-value: 0.00
Reject null hypothesis: Users in this country are negative towards the war.

Country: Ukraine
Negative sentiment ratio: 0.61
t-statistic: -10.20, p-value: 0.00
Reject null hypothesis: Users in this country are negative towards the war.

Country: Pakistan
Negative sentiment ratio: 0.04
t-statistic: 12.16, p-value: 0.00
Reject null hypothesis: Users in this country are negative towards the war.

Country: India
Negative sentiment ratio: 0.63
t-statistic: -8.81, p-value: 0.00
Reject null hypothesis: Users in this country are negative towards the war.

Country: Japan
Negative sentiment ratio: 0.61
t-statistic: -6.42, p-value: 0.00
Reject null hypothesis: Users in this country are negative towards the war.

Country: Denmark
Negative sentiment ratio: 0.76
t-statistic: -9.27, p-value: 0.00
Reject null hypothesis: Users in this country are negative towards the war.

Country: United Kingdom
Negative sentimen