In [None]:
import numpy as np
import pandas as pd
import math
from pandas import json_normalize
import json
from collections import Counter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud

import warnings
warnings.filterwarnings("ignore")

FUNCTIONS

In [None]:
def getTopMentions(series:pd.Series, times:int) -> list:
    """Return the `times` most frequent mentions in `series`, ignoring case.

    Parameters
    ----------
    series : pd.Series
        Each element is an iterable of mention strings (one iterable per
        tweet). Missing elements (NaN / None, e.g. tweets without mentions)
        are skipped instead of raising a TypeError.
    times : int
        Number of top entries to return.

    Returns
    -------
    list of (str, int)
        (lower-cased mention, count) pairs in descending order of count, as
        produced by `Counter.most_common` -- a list, not a Counter.
    """
    lowerCase = [
        mention.lower()
        for mentionsOfTweet in series.dropna()
        for mention in mentionsOfTweet
    ]
    # Report only the total; dumping every single mention floods the output.
    print("Mentions: ", len(lowerCase))
    return Counter(lowerCase).most_common(times)

def plotTopMentions(mostCommon:list, title:str):
    """Draw a labelled bar chart from (label, count) pairs and show it.

    Parameters
    ----------
    mostCommon : list of (str, int)
        Pairs as returned by `Counter.most_common`; the first item of each
        pair becomes the x tick label, the second the bar height.
    title : str
        Title of the plot.

    An empty `mostCommon` yields an empty, titled plot rather than the
    TypeError raised by calling `plt.bar()` without arguments.
    """
    if mostCommon:
        # Transpose [(label, count), ...] into (labels, counts) for plt.bar.
        labels, counts = zip(*mostCommon)
        bar = plt.bar(labels, counts)
        # Write each bar's count above it.
        plt.bar_label(bar)
    plt.xticks(rotation=90)
    plt.ylabel("Frequency")
    plt.title(title)
    plt.show()

ANALYSIS

In [None]:
# Load the raw tweet export and flatten the nested JSON into a DataFrame.
# The encoding is passed explicitly so that reading the file does not depend
# on the platform's default locale encoding (JSON interchange files are UTF-8).
with open('../Final.json', encoding='utf-8') as f:
    d = json.load(f)
df = json_normalize(d)

In [None]:
# Read the spreadsheet of grouped conversations; its first column becomes the
# row index.
used_conversation_ids = pd.read_excel(
    '../bellingcat_grouped_conversation_inclu_warPeriod_Final_lang_mode_thread.xlsx',
    index_col=0,
)

In [None]:
# Collect the ids (as strings) of all conversations whose 'lang' is 'en'.
# NOTE(review): according to the original comment the spreadsheet is meant to
# contain only tweets originally posted by bellingcat and bellingcat's replies
# to its own tweets (threads) -- confirm where the spreadsheet is produced.
conversation_is_english = used_conversation_ids['lang'] == 'en'
only_en_conversation_ids = (
    used_conversation_ids
    .loc[conversation_is_english, 'conversation_id']
    .astype('str')
)

In [None]:
# Restrict the tweet table to tweets that belong to an English conversation.
belongs_to_en_conversation = df['conversation_id'].isin(only_en_conversation_ids)
df_en = df.loc[belongs_to_en_conversation]

In [None]:
# Percentage of threads (conversations) that contain urls and/or media.

# Per conversation: number of non-null entries in every column.
test = df_en.groupby('conversation_id').count()

# 1.0 for each conversation with at least one tweet carrying urls or media,
# 0.0 otherwise. Vectorised, so no loop variable shadows the builtin `id`
# in the notebook's global namespace.
url_photo_media = (
    (test[['entities.urls', 'attachments.media']] > 0)
    .any(axis=1)
    .to_numpy(dtype=float)
)

# An empty selection would otherwise end in a ZeroDivisionError.
share_with_urls_or_media = (url_photo_media.sum() / len(test)) * 100 if len(test) else 0.0

print(f"{share_with_urls_or_media}% contains urls/photos/media in the threads")

In [None]:
# Report the number of tweets in the English-conversation subset.
english_tweet_total = len(df_en)
print(f"The complete dataset contain {english_tweet_total} tweets.")

In [None]:
# Flag every tweet that is NOT a retweet (True = keep, False = retweet).
# A tweet counts as a retweet when any entry of its 'referenced_tweets' list
# has type "retweeted". Looking at all entries instead of only the first one
# also copes with an empty list, where `value[0]` would raise an IndexError.
# Tweets whose 'referenced_tweets' value is not a list (e.g. NaN) are kept.
is_no_retweeted = [
    not (
        isinstance(references, list)
        and any(reference.get('type') == "retweeted" for reference in references)
    )
    for references in df_en['referenced_tweets']
]

In [None]:
# Every tweet is used except retweets and tweets that are not in English,
# so replies and quoted tweets are included in the analysis.
# reset_index() is called without drop=True, so the previous index is kept as
# an 'index' column.
df_en_no_RT = df_en[is_no_retweeted].reset_index()

In [None]:
# Number of tweets left after removing retweets.
df_en_no_RT.shape[0]

In [None]:
# Flag the tweets that count as part of a thread: tweets that are not a reply
# to anyone (missing 'in_reply_to_user_id') and replies to the account below.
# presumably bellingcat's own user id -- TODO confirm
THREAD_AUTHOR_USER_ID = "2315512764"

reply_targets = df_en_no_RT['in_reply_to_user_id']
# isna() covers NaN as well as None; float(None) would raise a TypeError.
tweet_is_thread = (
    reply_targets.isna() | (reply_targets == THREAD_AUTHOR_USER_ID)
).tolist()

In [None]:
# Keep only the rows of df_en_no_RT whose entry in tweet_is_thread is True.
df_en_no_RT_only_thread = df_en_no_RT[tweet_is_thread]

In [None]:
# Number of tweets left after the thread filter.
df_en_no_RT_only_thread.shape[0]

In [None]:
# Tally the media items attached to the thread tweets by their 'type' field
# and pick out photos, animated gifs and videos.
media_type_tally = Counter(
    media.get('type')
    for media_items in df_en_no_RT_only_thread['attachments.media'].dropna()
    for media in media_items
)
amount_photos = media_type_tally["photo"]
amount_animated_gifs = media_type_tally["animated_gif"]
amount_videos = media_type_tally["video"]


In [None]:
# Total number of url entries over all thread tweets.
# (Original author's note: mentions are urls as well.)
amount_urls = sum(
    len(url_entries)
    for url_entries in df_en_no_RT_only_thread['entities.urls'].dropna()
)

In [None]:
# Gather the totals per content type, order them from largest to smallest and
# show them as a labelled bar chart.
unsorted_totals = {
    'urls': amount_urls,
    'photos': amount_photos,
    'animated_gifs': amount_animated_gifs,
    'videos': amount_videos,
}
count_tweets = dict(
    sorted(unsorted_totals.items(), key=lambda pair: pair[1], reverse=True)
)

bar = plt.bar(count_tweets.keys(), count_tweets.values())
plt.bar_label(bar)  # write each total above its bar
plt.xticks(rotation=90)
plt.ylabel('Frequency')
plt.title('Amount of Tweets with Media')
plt.show()

In [None]:
# Collect the username of every account mentioned in the thread tweets.
mentions = [
    mention.get('username')
    for mention_entries in df_en_no_RT_only_thread['entities.mentions'].dropna()
    for mention in mention_entries
]

In [None]:
# Rank the mentioned accounts and plot the most frequent ones.
most_common_amount = 10  # length of the top list
mention_counts = Counter(mentions)
most_common_mentions = mention_counts.most_common(most_common_amount)
plotTopMentions(
    most_common_mentions,
    f'TOP_{most_common_amount}_MENTIONS -> total {len(mentions)} mentions',
)

In [None]:
# Collect the tag text of every hashtag used in the thread tweets.
hashtags = [
    hashtag.get('tag')
    for hashtag_entries in df_en_no_RT_only_thread['entities.hashtags'].dropna()
    for hashtag in hashtag_entries
]

In [None]:
# Rank the hashtags and plot the most frequent ones.
hashtag_counts = Counter(hashtags)
most_common_hashtags = hashtag_counts.most_common(most_common_amount)
plotTopMentions(
    most_common_hashtags,
    f'TOP_{most_common_amount}_HASHTAGS -> total {len(hashtags)} hashtags',
)

In [None]:
# Export both rankings as CSV files for plotting in R with ggplot2.
# NOTE(review): the mentions file is also written with a 'hashtag' column
# header; this looks like a copy-paste slip but is kept because the R side may
# rely on it -- confirm before renaming.
export_targets = {
    '../plotting_R/most_common_mentions.csv': most_common_mentions,
    '../plotting_R/most_common_hashtags.csv': most_common_hashtags,
}
for csv_path, ranking in export_targets.items():
    pd.DataFrame(data=ranking, columns=['hashtag', 'count']).to_csv(csv_path)