In [None]:

#import aggregation as aggregation
import pandas as pd
import numpy as np
#plotting
import matplotlib.pyplot as plt
import matplotlib.dates as mdates 
import matplotlib.cbook as cbook
import matplotlib.dates as mdates
import plotly.express as px

#Sentimentanalysis - Dictionary Approach
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single VADER analyzer instance; reused further down to score every processed tweet.
analyzer = SentimentIntensityAnalyzer()

import nest_asyncio
import re
from matplotlib import pyplot as plt 
import seaborn as sns

#Date manipulation
import time
from datetime import datetime, date, time, timedelta
import dateutil.relativedelta

In [None]:
# Bellingcat conversation export. The sentiment analysis in this notebook
# covers only "normal" tweets and threads.
conversations_xlsx = '../../DataSources/bellingcat_grouped_conversation_inclu_warPeriod_Final_lang_mode_thread_mention.xlsx'
df1 = pd.read_excel(conversations_xlsx)

# Sentiment analysis with VADER on Bellingcat's conversations


In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
df1.info()

In [None]:
# Analyse English tweets only. This is a limitation of the analysis: replies
# in other languages (e.g. Russian replies written in Cyrillic) are not taken
# into account.
# .copy() gives df1 its own data, so the column assignments and in-place
# operations in later cells do not act on a slice of the unfiltered frame
# (which triggers pandas' SettingWithCopyWarning).
df1 = df1[df1['lang'] == 'en'].copy()
df1.info()

In [None]:
# Stop-word sets that were already loaded, keyed by file path, so the file is
# read once per kernel session instead of once per tweet.
_STOP_WORD_CACHE = {}


def _load_stop_words(path):
    """Return the stop words of the comma-separated file ``path`` as a frozenset.

    Each entry is stored in two normalised forms so it can match tweet tokens
    after punctuation has been stripped from the tweet:
    the entry with apostrophes removed ("don't" -> "dont") and the pieces left
    when every non-alphanumeric character is treated as a separator
    ("don't" -> "don", "t").
    """
    if path not in _STOP_WORD_CACHE:
        # Context manager closes the handle; read-only mode is sufficient.
        with open(path, "r") as handle:
            entries = handle.read().split(',')
        words = set()
        for entry in entries:
            entry = entry.strip()
            if not entry:
                continue
            words.add(entry.replace("'", ""))
            words.update(re.sub(r"[^0-9A-Za-z]", " ", entry).split())
        _STOP_WORD_CACHE[path] = frozenset(words)
    return _STOP_WORD_CACHE[path]


def preprocess_tweets(text, stopwords_path="Stopwords-en.txt"):
    """Strip handles, URLs, punctuation and stop words from a tweet.

    Parameters
    ----------
    text : str
        Tweet text. Matching against the stop words is case-sensitive, so the
        caller is expected to pass text in the same case as the stop-word file.
    stopwords_path : str, optional
        Comma-separated stop-word file. Defaults to "Stopwords-en.txt" in the
        working directory. The file is cached after the first read.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    stop_words = _load_stop_words(stopwords_path)
    # One pass replaces, with a space: @handles (underscores included, so no
    # tail of the handle is left behind), any character that is not
    # alphanumeric / space / tab, and scheme://... URLs.
    cleaned = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text)
    # Whole-token membership test: a word is dropped only when it IS a stop
    # word, not when it merely occurs inside one (e.g. "war" in "towards").
    return " ".join(token for token in cleaned.split() if token not in stop_words)

# Lower-case every tweet first, then run it through preprocess_tweets.
lowercased_text = df1['text'].map(lambda raw_tweet: raw_tweet.lower())
df1['Processed Tweet'] = lowercased_text.map(preprocess_tweets)

In [None]:
# Second, alternative way of cleaning the data (the first one is the one currently in use)
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    The substitution is done with ``pattern`` itself. Re-using each matched
    text as a new regular expression would misinterpret characters such as
    ``?``, ``.``, ``+`` or ``(`` that occur in URLs and tweets (leaving the
    match in place or raising ``re.error``).

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing what to remove.

    Returns
    -------
    str
        The text without the matched parts.
    """
    return re.sub(pattern, '', input_txt)

def clean_tweets(tweets):
    """Clean a collection of tweets with a fixed sequence of regex substitutions.

    Steps, applied in this order to every tweet:
      1. remove retweet prefixes (``RT @xxx:``)
      2. remove Twitter handles (``@xxx``)
      3. remove URLs (``http://...`` / ``https://...``)
      4. replace every character that is not an ASCII letter with a space
         (this also replaces digits, punctuation and ``#``)

    Step 4 is applied as a regular expression; a plain string replace would
    only look for the literal text "[^a-zA-Z]" and change nothing.

    Parameters
    ----------
    tweets : array-like of str
        Tweets to clean (list, NumPy array, pandas Series, ...). May be empty.

    Returns
    -------
    numpy.ndarray
        String array with the same shape as the input.
    """
    steps = (
        (re.compile(r"RT @[\w]*:"), ""),
        (re.compile(r"@[\w]*"), ""),
        (re.compile(r"https?://[A-Za-z0-9./]*"), ""),
        (re.compile(r"[^a-zA-Z]"), " "),
    )

    def _clean_one(text):
        # Run a single tweet through all substitutions in order.
        for pattern, replacement in steps:
            text = pattern.sub(replacement, text)
        return text

    # otypes is given explicitly so that np.vectorize also accepts empty input.
    cleaned = np.vectorize(_clean_one, otypes=[object])(tweets)
    return np.asarray(cleaned, dtype=str)

In [None]:
# Drop rows whose processed tweet is empty (e.g. because the tweet was only a link)
def EmptyRows(dataset):
    """Remove, in place, the rows of ``dataset`` that have no processed tweet.

    Every empty string in ``dataset`` (in any column) is first turned into NaN;
    afterwards all rows with a missing 'Processed Tweet' value are dropped.
    The frame is modified in place and nothing is returned.
    """
    missing = float("NaN")
    dataset.replace(to_replace="", value=missing, inplace=True)
    dataset.dropna(axis=0, subset=['Processed Tweet'], inplace=True)

In [None]:
# Derive year, month, weekday and year-month string columns from the 'date' column
def SplitDate(dataset):
    """Add formatted date columns to ``dataset`` in place.

    The 'date' column is parsed with ``pd.to_datetime`` and written back as
    strings into four columns:
      'year'        -> '%Y'    (e.g. '2022')
      'month'       -> '%m'    (e.g. '02')
      'day'         -> '%A'    (weekday name, e.g. 'Thursday')
      'year-month'  -> '%Y-%m' (e.g. '2022-02')
    Nothing is returned.
    """
    parsed_dates = pd.to_datetime(dataset['date'])
    column_formats = (
        ('year', '%Y'),
        ('month', '%m'),
        ('day', '%A'),
        ('year-month', '%Y-%m'),
    )
    for column_name, date_format in column_formats:
        dataset[column_name] = parsed_dates.dt.strftime(date_format)

In [None]:
# Drop the tweets whose processed text is empty (EmptyRows modifies df1 in place).
EmptyRows(df1)

In [None]:
# Score every processed tweet with VADER; each 'polarity' entry is the dict
# returned by polarity_scores (its 'compound' value drives the labelling).
# NOTE(review): VADER's documentation describes punctuation, capitalisation and
# negation words as scoring cues; the preprocessing removes these, so scoring
# the raw text may be worth comparing - confirm with the analysis owner.
df1['polarity'] = df1['Processed Tweet'].map(analyzer.polarity_scores)

In [None]:
def sentimentPredict(sentiment):
    """Turn a polarity-score dict into a sentiment label.

    ``sentiment`` must provide a 'compound' value. A compound score of at
    least 0.05 yields "Positive", a score of at most -0.05 yields "Negative",
    and everything else yields "Neutral".
    """
    compound_score = sentiment['compound']
    if compound_score >= 0.05:
        return "Positive"
    if compound_score <= -0.05:
        return "Negative"
    return "Neutral"

# Label every tweet as Positive / Negative / Neutral from its polarity dict.
df1['sentiment'] = df1['polarity'].map(sentimentPredict)

In [None]:
# Turn every key of the polarity dicts into its own column and drop the dict column.
polarity_columns = df1['polarity'].apply(pd.Series)
df1 = pd.concat([df1.drop(columns=['polarity']), polarity_columns], axis=1)

In [None]:
# Preview the first rows, now including the sentiment label and score columns.
df1.head()

In [None]:
# Dataset for the engagement analysis: content flags, compound sentiment score
# and the 'quantiles' label of each row.
df_sentiment = df1[["media_photo_Binary", 'media_videos_Binary', 'urls_Binary', 'hashtags_Binary', "compound", "quantiles"]]
print(df_sentiment.head())

# Keep only the rows labelled Q1 or Q4 and encode them as 0 / 1.
# .copy() makes the filtered frame independent, so the assignment below does
# not write into a slice (SettingWithCopyWarning); the explicit mapping avoids
# the implicit object-to-int downcast of chained replace() calls.
df_sentiment = df_sentiment[df_sentiment["quantiles"].isin(["Q1", "Q4"])].copy()
df_sentiment['quantiles'] = df_sentiment['quantiles'].map({'Q1': 0, 'Q4': 1}).astype(int)
print(df_sentiment.head())
df_sentiment.to_csv("../../DataSources/Dataset_Graphs/RQ2_Content_Engagement/engagement_with_sentiment.csv")

In [None]:
# Share of each sentiment class in percent of all analysed tweets
tweet_count = len(df1)
sentiment_share = df1.sentiment.value_counts() / tweet_count * 100
percent_ticks = np.arange(0, 110, 10)

fig, ax = plt.subplots(figsize=(6, 5))
ax.set_title('Classification of Bot Replys into sentiment categories', fontsize=15)
ax.set_ylabel('Percentage [%]', fontsize=18)
sentiment_share.plot(kind="bar", rot=0, color=['#04407F', '#0656AC', '#0A73E1'], ax=ax)
ax.set_yticks(percent_ticks)
ax.grid(color='#95a5a6', linestyle='-.', linewidth=1, axis='y', alpha=0.7)

# Secondary y-axis: the same tick positions expressed as absolute tweet counts
ax2 = ax.twinx()
ax2.set_yticks(percent_ticks * tweet_count / 100)

# Write the percentage slightly above each bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate('{:.2f}%'.format(bar_height), (bar.get_x() + 0.15, bar_height + 1))

In [None]:
# Add the 'year', 'month', 'day' and 'year-month' columns (SplitDate modifies
# df1 in place), then preview the result.
SplitDate(df1)
df1.head()

In [None]:
# Number of tweets per sentiment label within each year
sentiment_by_year = df1.groupby('year')['sentiment']
grouped = sentiment_by_year.value_counts()
#grouped_class=df3.groupby(by='year')['type_of_tweet'].value_counts()

In [None]:
# Move the sentiment level (the last of the two index levels) into columns:
# one row per year, one column per sentiment label.
unstacked = grouped.unstack()
#grouped_class = grouped.unstack(level=1)
unstacked

In [None]:
# Grouped bar chart of the absolute sentiment counts per year
unstacked.plot(
    kind="bar",
    figsize=(18, 12),
    title="Absolute distribution of Sentiments in Sentiment Conversations during ukraine war",
)
plt.xticks(rotation=45)
#grouped_class.plot.bar(figsize=(18,12))

In [None]:
# Row-normalised cross table: share of each sentiment label within a year
# (every row sums to 1).
cross_tab_prop2 = pd.crosstab(df1['year'], df1['sentiment'], normalize='index')
cross_tab_prop2

In [None]:
# Stacked bar chart: one bar per year, split by the share of each sentiment.
ax = cross_tab_prop2.plot(kind='bar', stacked=True, figsize=(18, 12))
ax.legend(loc="upper left", ncol=2)
ax.set_title('Relative distribution of Sentiments in Bellingcats Conversations', fontsize=15)
ax.set_xlabel("Year")
ax.set_ylabel("Proportion")
plt.xticks(rotation=45)

# Write each percentage into the middle of its own segment. The cumulative sum
# is the top edge of a segment, so half the segment height is subtracted;
# anchoring the text at the top edge would draw it inside the next segment
# (and above the bar for the last one).
for bar_position, year in enumerate(cross_tab_prop2.index):
    shares = cross_tab_prop2.loc[year]
    for proportion, segment_top in zip(shares, shares.cumsum()):
        ax.text(x=bar_position,
                y=segment_top - proportion / 2,
                s=f'{np.round(proportion * 100, 1)}%',
                ha="center",
                va="center",
                color="black",
                fontsize=20,
                fontweight="bold")

plt.show()