In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from scipy.stats import mannwhitneyu
from scipy.stats import kruskal
import datetime
from datetime import datetime as datetime_1

In [None]:
# Seed NumPy's global RNG. Note: this does not seed the stdlib `random` module.
np.random.seed(1)

# Load the grouped conversation data. The two id columns are read as strings
# instead of being parsed as numbers.
df_bellingcat = pd.read_excel(
    "../../DataSources/bellingcat_grouped_conversation_inclu_warPeriod_Final_lang_mode_thread_mention.xlsx",
    dtype={'conversation_id': str, "id": str},
)
# (A bare `df_bellingcat` expression used to sit here; it was not the last
# statement of the cell and therefore displayed nothing.)
print(df_bellingcat.head())


In [None]:
# Keep English-language rows only. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the loaded frame (pandas SettingWithCopyWarning).
df_bellingcat = df_bellingcat[df_bellingcat['lang'] == "en"].copy()
# Calendar keys used later to join the monthly follower counts.
df_bellingcat['year'] = df_bellingcat['date'].dt.year
df_bellingcat['month'] = df_bellingcat['date'].dt.month
print(df_bellingcat.head())

## Followers

In [None]:
# Follower history: read the raw sheet, then keep the maximum follower count
# observed inside each monthly bucket of the "Date" column.
df_followers = pd.read_excel("../../DataSources/Followers_final.xlsx")
df_followers = (
    df_followers[["Date", "followers"]]
    .groupby([pd.Grouper(key="Date", freq="M")])["followers"]
    .max()
    .reset_index()
)
print(df_followers.head())


In [None]:
# Add year and month columns so the monthly follower counts can be joined
# onto the tweet frame by (year, month).
df_followers = df_followers.assign(
    year=df_followers["Date"].dt.year,
    month=df_followers["Date"].dt.month,
)
print(df_followers.head())

In [None]:
# Attach the monthly follower count to every row (left join keeps rows with
# no matching month), then discard the helper join keys.
df_bellingcat = (
    df_bellingcat
    .merge(df_followers, how='left', on=["year", "month"])
    .drop(columns=['month', 'year', 'Date'])
)

In [None]:
# Inspect the first rows after the follower merge.
print(df_bellingcat.head())

In [None]:
# Keep rows dated 2014-07-01 or later.
# pd.Timestamp is used instead of `datetime.datetime(...)`: a later cell runs
# `from datetime import datetime, ...`, which rebinds the name `datetime` to the
# class, so `datetime.datetime` raises AttributeError when this cell is re-run.
df_bellingcat = df_bellingcat[df_bellingcat['date'] >= pd.Timestamp(year=2014, month=7, day=1)]

In [None]:
# Inspect the last rows after the date filter.
print(df_bellingcat.tail())

In [None]:
# Drop rows that have no follower count after the left merge.
# Reassigning an explicit copy (instead of inplace=True on a filtered slice)
# keeps the later column assignments free of SettingWithCopyWarning.
df_bellingcat = df_bellingcat.dropna(subset=['followers']).copy()

In [None]:
# Display the filtered frame (notebook rich repr).
df_bellingcat

In [None]:
# --- Engagement totals --------------------------------------------------
# Sum of the four interaction counters, and the same value divided by the
# follower count merged in above.
df_bellingcat["total_engagement"] = df_bellingcat["likes"]+df_bellingcat["replies"]+df_bellingcat["quotes"]+df_bellingcat["retweets"]
df_bellingcat["total_engagement_per_follower"] = df_bellingcat["total_engagement"] / df_bellingcat["followers"]
# --- Content flags ------------------------------------------------------
# Each label column holds either the feature name or "No <feature>"; the
# comparison loops further down rely on the positive label equalling the
# column name (e.g. column "Image" holds "Image" / "No Image").
df_bellingcat['Image'] = np.where(df_bellingcat['media_photo'] > 0, "Image", "No Image")
df_bellingcat['Video'] = np.where(df_bellingcat['media_videos'] > 0, "Video", "No Video")
df_bellingcat['Mentions'] = np.where(df_bellingcat['mentions'] > 0, "Mentions", "No Mentions")
df_bellingcat['media_animated_gif_Binary'] = np.where(df_bellingcat['media_animated_gif'] > 0, 1, 0)
df_bellingcat['media_review_image_url_Binary'] = np.where(df_bellingcat['media_review_image_url'] > 0, 1, 0)
# "Media" means at least one photo or video; GIFs are not part of this flag.
df_bellingcat["media_Binary"] = np.where((  (df_bellingcat['media_photo'] > 0) | (df_bellingcat['media_videos'] > 0)), "Media", "No Media")
df_bellingcat["media_Binary_numeric"] = np.where(((df_bellingcat['media_photo'] > 0) | (df_bellingcat['media_videos'] > 0)), 1, 0)
df_bellingcat["total_media"] =   df_bellingcat['media_photo'] + df_bellingcat['media_videos']
df_bellingcat['URL'] = np.where(df_bellingcat['urls'] > 0, "URL", "No URL")
df_bellingcat['Hashtags'] = np.where(df_bellingcat['hashtags'] > 0, "Hashtags", "No Hashtags")
# --- Normalised engagement ----------------------------------------------
# Values are additionally divided by "count"; presumably the number of posts
# in the conversation (see the Thread/Single flag below) - TODO confirm.
# NOTE(review): here the +1 is added to the per-follower ratio, whereas the
# per-metric logs below add +1 to the raw counter before dividing by followers.
df_bellingcat['log_engagement_per_post_per_follower']=np.log((df_bellingcat["total_engagement_per_follower"]+1)/df_bellingcat["count"])
df_bellingcat['engagement_per_post_per_follower']=(df_bellingcat["total_engagement_per_follower"])/df_bellingcat["count"]
df_bellingcat["likes_log"]=np.log(((df_bellingcat["likes"]+1)/ df_bellingcat["followers"])/df_bellingcat["count"])
# Rows with count > 1 are labelled "Thread", all others "Single".
df_bellingcat["conversation_binary"]=np.where(df_bellingcat['count'] > 1, "Thread", "Single")
df_bellingcat["replies_log"]=np.log(((df_bellingcat["replies"]+1)/ df_bellingcat["followers"])/df_bellingcat["count"])
df_bellingcat["quotes_log"]=np.log(((df_bellingcat["quotes"]+1)/ df_bellingcat["followers"])/df_bellingcat["count"])
df_bellingcat["retweets_log"]=np.log(((df_bellingcat["retweets"]+1)/ df_bellingcat["followers"])/df_bellingcat["count"])
# log(1 + engagement per follower), not divided by "count".
df_bellingcat["normlog_engagement_per_follower"]=np.log(df_bellingcat["total_engagement_per_follower"]+1)

In [None]:
list_attachments = ["Image","Video","URL","Hashtags","Mentions"]

# For every content flag, compare the median per-post/per-follower engagement
# of rows that carry the feature against rows that do not, and print
# (feature, n_with, n_without, relative difference of the medians).
# The loop variable is deliberately not called `type`: that name would shadow
# the builtin for the rest of the kernel session.
for attachment in list_attachments:
    # The positive label of each flag column equals the column name.
    has_attachment = df_bellingcat[attachment] == attachment
    with_values = df_bellingcat.loc[has_attachment, "engagement_per_post_per_follower"]
    without_values = df_bellingcat.loc[~has_attachment, "engagement_per_post_per_follower"]

    with_attachment = with_values.median()
    without_attachment = without_values.median()
    with_attachment_len = len(with_values)
    without_attachment_len = len(without_values)

    # Relative uplift of the median; inf/nan if the baseline median is 0.
    percentage = (with_attachment - without_attachment) / without_attachment
    print(attachment, with_attachment_len, without_attachment_len, percentage)

In [None]:
list_attachments = ["Image","Video","URL","Hashtags","Mentions"]

# One-sided Mann-Whitney U test per content flag.
# x = rows without the feature, y = rows with it; alternative="less" tests
# whether the distribution of x is stochastically smaller than that of y.
# The loop variable is not called `type` so the builtin stays usable.
for attachment in list_attachments:
    has_attachment = df_bellingcat[attachment] == attachment
    x = df_bellingcat.loc[~has_attachment, "engagement_per_post_per_follower"]
    y = df_bellingcat.loc[has_attachment, "engagement_per_post_per_follower"]

    U1, p = mannwhitneyu(x, y, method="auto", alternative="less")
    print(attachment, U1, p)

In [None]:
# Write the engineered frame to CCDF.csv. The DataFrame index is written as
# the first column because index=False is not passed.
df_bellingcat.to_csv("../../DataSources/Dataset_Graphs/RQ2_Content_Engagement/CCDF.csv")

## Sentiment

In [None]:
#import aggregation as aggregation
import pandas as pd
import numpy as np
#plotting
import matplotlib.pyplot as plt
import matplotlib.dates as mdates 
import matplotlib.cbook as cbook
import matplotlib.dates as mdates
import plotly.express as px




# Sentiment analysis - dictionary (lexicon) approach using VADER.
# One analyzer instance is created here and reused by the polarity-scoring
# cell further down.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

import nest_asyncio
import re
from matplotlib import pyplot as plt 
import seaborn as sns

#Date manipulation
import time
from datetime import datetime, date, time, timedelta
import dateutil.relativedelta

In [None]:
# Work on an independent copy so the sentiment columns added below do not
# modify df_bellingcat.
df1 = df_bellingcat.copy()

In [None]:
# Stop-word sets keyed by file path, so the file is read once and not once
# per tweet. Re-running this cell resets the cache.
_STOP_WORDS_CACHE = {}


def _load_stop_words(path="Stopwords-en.txt"):
    """Return the comma-separated stop words stored in `path` as a frozenset."""
    if path not in _STOP_WORDS_CACHE:
        # `with` guarantees the handle is closed; read-only access is sufficient.
        with open(path, "r") as fo:
            raw_entries = fo.read().split(',')
        # Same normalisation the previous implementation applied: apostrophes
        # (code point 39) are removed. Surrounding whitespace/newlines are
        # stripped as well so entries compare equal to whole tokens.
        translation = {39: None}
        cleaned_entries = (entry.translate(translation).strip() for entry in raw_entries)
        _STOP_WORDS_CACHE[path] = frozenset(entry for entry in cleaned_entries if entry)
    return _STOP_WORDS_CACHE[path]


def preprocess_tweets(text):
    """Strip handles, URLs, punctuation and stop words from one tweet.

    Parameters
    ----------
    text : str
        Tweet text. The caller lower-cases it first; no case folding happens here.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be an empty string).

    Notes
    -----
    Stop words are read from "Stopwords-en.txt" in the working directory.
    Tokens are compared against the stop-word *set*; the earlier version tested
    substring membership in the stringified list, which also removed any token
    that merely occurred inside a stop word (e.g. "he" because of "the").
    """
    # @handles, any character that is not alphanumeric/space/tab, and
    # scheme://... URLs are each replaced by a space.
    without_noise = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
    stop_words = _load_stop_words()
    return " ".join(word for word in without_noise.split() if word not in stop_words)

# Lower-case every tweet, then run it through preprocess_tweets.
df1['Processed Tweet'] = df1['text'].map(lambda tweet: preprocess_tweets(tweet.lower()))

In [None]:
# Alternative data-cleaning helpers (the notebook currently uses
# preprocess_tweets instead).
def remove_pattern(input_txt, pattern):
    """Return `input_txt` with every match of the regex `pattern` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        The text without the matched spans.

    Notes
    -----
    The pattern is substituted directly. The previous version fed each
    *matched text* back into ``re.sub`` as a new regex, so matches containing
    metacharacters (``?``, ``+``, ``(`` ... - common in URLs) were not removed
    or raised ``re.error``, and a short match such as "@a" also stripped the
    prefix of a longer one such as "@ab".
    """
    return re.sub(pattern, '', input_txt)

def clean_tweets(tweets):
    """Clean an array-like of tweet strings element-wise.

    Removes, in this order: retweet prefixes ("RT @xxx:"), handles ("@xxx"),
    http(s) links, and finally replaces every non-letter character by a space.

    Parameters
    ----------
    tweets : array-like of str
        Tweets to clean.

    Returns
    -------
    numpy.ndarray
        String array with the same shape as the input.

    Notes
    -----
    * The last step is a real regex substitution. The previous
      ``np.core.defchararray.replace`` call performed a *literal* replacement
      of the text "[^a-zA-Z]", so special characters were never removed.
    * NOTE(review): the original comment said "except for #", but the pattern
      ``[^a-zA-Z]`` does not spare '#'. The pattern is kept as written -
      confirm whether hashtags should survive.
    """
    def _substitute(values, pattern, replacement):
        # otypes=[str] lets np.vectorize handle empty input instead of
        # raising because it cannot infer the output type.
        substitute = np.vectorize(
            lambda tweet: re.sub(pattern, replacement, tweet), otypes=[str]
        )
        return substitute(values)

    # remove twitter retweet prefixes (RT @xxx:)
    tweets = _substitute(tweets, r"RT @[\w]*:", "")

    # remove twitter handles (@xxx)
    tweets = _substitute(tweets, r"@[\w]*", "")

    # remove URL links (httpxxx)
    tweets = _substitute(tweets, r"https?://[A-Za-z0-9./]*", "")

    # replace special characters, numbers and punctuation by a space
    tweets = _substitute(tweets, r"[^a-zA-Z]", " ")

    return tweets

In [None]:
def EmptyRows(dataset):
    """Drop rows whose 'Processed Tweet' is empty (e.g. the tweet was only a link).

    Works in place and returns None. Empty strings in *every* column are first
    turned into NaN; afterwards rows with NaN in 'Processed Tweet' are removed.
    """
    missing = float("NaN")
    dataset.replace(to_replace="", value=missing, inplace=True)
    dataset.dropna(inplace=True, subset=['Processed Tweet'])

In [None]:
def SplitDate(dataset):
    """Add string date-part columns derived from dataset['date'] (in place).

    Columns written: 'year' (%Y), 'month' (%m), 'day' (%A, i.e. the weekday
    name rather than the day of the month) and 'year-month' (%Y-%m).
    Returns None.
    """
    parsed = pd.to_datetime(dataset['date'])
    column_formats = (
        ('year', '%Y'),
        ('month', '%m'),
        ('day', '%A'),
        ('year-month', '%Y-%m'),
    )
    for column, fmt in column_formats:
        dataset[column] = parsed.dt.strftime(fmt)

In [None]:
# Remove rows whose processed tweet is empty (modifies df1 in place).
EmptyRows(df1)

In [None]:
# Score each processed tweet; every cell of 'polarity' holds the result of
# analyzer.polarity_scores for that tweet.
df1['polarity'] = df1['Processed Tweet'].apply(analyzer.polarity_scores)

In [None]:
def sentimentPredict(sentiment):
    """Map a polarity mapping to a label using its 'compound' entry.

    compound >= 0.05  -> "Positive"
    compound <= -0.05 -> "Negative"
    anything else     -> "Neutral"
    """
    if sentiment['compound'] >= 0.05:
        return "Positive"
    if sentiment['compound'] <= -0.05:
        return "Negative"
    return "Neutral"

# Label every row from its polarity scores.
df1['sentiment'] = df1['polarity'].apply(sentimentPredict)

In [None]:
# Replace the 'polarity' column by one column per entry of the polarity
# mapping, appended on the right-hand side of the frame.
df1 = pd.concat(
    [
        df1.drop(columns=['polarity']),
        df1['polarity'].apply(pd.Series),
    ],
    axis=1,
)

In [None]:
# Inspect the frame with the expanded polarity columns.
df1.head()

In [None]:
list_polarity = ["Negative","Neutral","Positive"]

# Median per-post/per-follower engagement for each sentiment label.
# The loop variable is not called `type`, so the builtin is not shadowed for
# the remainder of the kernel session.
for sentiment_label in list_polarity:
    median_eng = df1.loc[df1["sentiment"] == sentiment_label, "engagement_per_post_per_follower"].median()
    print(sentiment_label, median_eng)

In [None]:
# Median per-post/per-follower engagement per sentiment label, as a tidy frame.
median_sentiment = (
    df1
    .groupby("sentiment")["engagement_per_post_per_follower"]
    .median()
    .reset_index()
)
median_sentiment

In [None]:
# Save the per-sentiment medians (the index is written as the first column).
median_sentiment.to_csv("../../DataSources/Dataset_Graphs/RQ2_Content_Engagement/median_sentiment.csv")

In [None]:
# Total engagement summed per sentiment label, saved next to the medians.
total_sentiment = df1.groupby("sentiment")["total_engagement"].sum().reset_index()
total_sentiment.to_csv("../../DataSources/Dataset_Graphs/RQ2_Content_Engagement/total_sentiment.csv")
# Kept as the last expression so the notebook actually renders the table; in
# the middle of the cell the bare name displayed nothing.
total_sentiment

In [None]:
# Save the full frame including the sentiment label and the expanded polarity
# columns (the index is written as the first column).
df1.to_csv("../../DataSources/Dataset_Graphs/RQ2_Content_Engagement/CCDF_Sentiment.csv")

## Double check of results for video

In [None]:
from scipy import stats
import random
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Seed the stdlib RNG: random.sample() is not affected by np.random.seed(),
# so without this the histogram below differs on every run.
random.seed(1)

v_value_list = []

# The two populations do not change between iterations, so build them once.
is_video = df_bellingcat["Video"] == "Video"
video_engagement = df_bellingcat.loc[is_video, "engagement_per_post_per_follower"].tolist()
not_video_engagement = df_bellingcat.loc[~is_video, "engagement_per_post_per_follower"].tolist()

# Repeatedly draw 200 rows (without replacement) from each group and record
# the p-value of the two-sample Kolmogorov-Smirnov test.
# NOTE(review): range(1, 1000) yields 999 iterations - confirm 1000 was not intended.
for iteration in range(1, 1000):
    # random.sample raises ValueError if a group holds fewer than 200 rows.
    video = random.sample(video_engagement, 200)
    not_video = random.sample(not_video_engagement, 200)

    v_value_list.append(stats.kstest(video, not_video).pvalue)


plt.hist(v_value_list, density=True, bins=30)


In [None]:
# Quantile-quantile plot: one fresh sample of 200 engagement values per group,
# each sorted and plotted against the other (video on x, non-video on y).
video = random.sample(
    df_bellingcat.loc[df_bellingcat["Video"] == "Video", "engagement_per_post_per_follower"].tolist(),
    200,
)
not_video = random.sample(
    df_bellingcat.loc[df_bellingcat["Video"] != "Video", "engagement_per_post_per_follower"].tolist(),
    200,
)
plt.figure()
plt.scatter(np.sort(video), np.sort(not_video))
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
plt.close()