In [None]:
import numpy as np
import pandas as pd
from pandas import json_normalize
import json
import math
from collections import Counter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import warnings
warnings.filterwarnings("ignore")

FUNCTIONS

In [None]:
def generateTextDF(df):
    """Clean the ``text`` column of ``df`` for bag-of-words analysis.

    Processing order:
      1. remove URLs (``scheme://...``) and every character that is not a
         digit, an ASCII or German letter, a space or a tab,
      2. remove digit runs together with lowercase letters directly after them,
      3. lowercase and split on whitespace,
      4. drop NLTK English stop words,
      5. lemmatize each remaining token with the WordNet lemmatizer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. The column is overwritten on the frame
        that is passed in, so hand over a copy to keep the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``text`` now holds the space-joined tokens.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned once per token.
    stopWords = set(stopwords.words('english'))
    lemma = WordNetLemmatizer()

    def _normalize(rawText):
        # Stop words are filtered before lemmatization, as in the original order.
        tokens = rawText.lower().split()
        return ' '.join(lemma.lemmatize(word) for word in tokens if word not in stopWords)

    # Missing values would otherwise reach str.lower() as float NaN and raise.
    text = df['text'].fillna('')
    # Raw strings: "\w", "\/" and "\S" are invalid escapes in a plain literal.
    text = text.str.replace(r"([^0-9a-zA-ZäöüÄÖÜß \t])|(\w+:\/\/\S+)", "", regex=True)
    # Runs before lowercasing, so only lowercase suffixes are stripped with the digits.
    text = text.str.replace(r"[0-9]+[a-z]*", "", regex=True)
    df['text'] = text.apply(_normalize)
    return df
    
def generateVectorizedDF(df):
    """Build a dense document-term count matrix from ``df['text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``text`` column holds one whitespace-separated document
        per row.

    Returns
    -------
    pandas.DataFrame
        One row per document (default integer index, in the row order of
        ``df``) and one column per vocabulary term; cells are term counts.
    """
    vectorizer = CountVectorizer()
    corpus = df['text'].to_list()
    X = vectorizer.fit_transform(corpus)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; fall back to it only on releases that predate the replacement.
    if hasattr(vectorizer, 'get_feature_names_out'):
        columns_name = vectorizer.get_feature_names_out()
    else:
        columns_name = vectorizer.get_feature_names()
    # toarray() densifies the sparse matrix so it can back a regular DataFrame.
    return pd.DataFrame(data=X.toarray(), columns=columns_name)

EXTRACT DATA / PREPROCESSING

In [None]:
# Load the grouped-conversation export; the first sheet column becomes the index.
df = pd.read_excel(
    io='../bellingcat_grouped_conversation_inclu_warPeriod_Final_lang_mode_thread.xlsx',
    index_col=0,
)

In [None]:
# Keep only rows whose language tag is 'en' or 'und'.
df_en_no_RT_only_thread = df.loc[df['lang'].isin(['en', 'und'])]

In [None]:
# Clean the text column (character/number stripping, lowercasing, stop-word
# removal, lemmatization) on a copy so the filtered frame keeps its raw text.
df_text_analysis = generateTextDF(df=df_en_no_RT_only_thread.copy())
# Turn the cleaned text into a document-term count matrix for the LDA analysis.
df_vectorized = generateVectorizedDF(df=df_text_analysis.copy())

In [None]:
# Count how often each word occurs over all documents and store the totals as
# a final row labelled "sum" (later cells read it via iloc[-1] / iloc[:-1]).
# A "sum" row left over from a previous run is removed first, so re-executing
# this cell does not add the totals into themselves.
df_vectorized = df_vectorized.drop(index="sum", errors="ignore")
# Named word_totals rather than `sum` so the built-in sum() is not shadowed.
word_totals = df_vectorized.sum(axis=0)
word_totals.name = "sum"
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
df_vectorized = pd.concat([df_vectorized, word_totals.to_frame().T])
# Exclude these tokens from the word ranking and the topic model;
# errors="ignore" tolerates tokens that are absent from the vocabulary.
df_vectorized = df_vectorized.drop(columns=['bellingcat', 'via', 'bellingcats'], errors="ignore")

In [None]:
N = 30
# The last row of df_vectorized holds the per-word totals.
top_N_words = df_vectorized.iloc[-1].sort_values(ascending=False).head(N)

fig, ax = plt.subplots()
bar = ax.bar(top_N_words.index, top_N_words.values)
ax.set_title(f'Top_{N}_words')
ax.bar_label(bar, rotation='vertical', padding=5)
ax.set_ylabel('Frequency')
# Fixed upper limit: bars taller than 2500 would be clipped.
ax.set_ylim(top=2500)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

LDA-ANALYSIS

In [None]:
# Vocabulary in column order, and the count matrix without the trailing totals row.
vocab_cv = list(df_vectorized.columns)
cv_arr = df_vectorized.iloc[:-1].to_numpy()

In [None]:
# LDA with 10 topics; random_state fixes the seed so the run is repeatable.
lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=200,
    random_state=20,
)

# Fit on the count matrix and keep the per-document topic distribution.
X_topics = lda_model.fit_transform(cv_arr)

# Fitted per-topic word weights (one row per topic, one column per vocabulary term).
topic_words = lda_model.components_

In [None]:
n_top_words = 10
topics = []
# Build the vocabulary array once instead of once per topic.
vocab_array = np.array(vocab_cv)
for topic_number, word_weights in enumerate(topic_words, start=1):
    # argsort is ascending, so reverse it and keep the first n_top_words indices.
    top_indices = np.argsort(word_weights)[::-1][:n_top_words]
    print("Topic", str(topic_number), vocab_array[top_indices])

In [None]:
doc_topic = lda_model.transform(cv_arr)
# Highest-weight topic per document, shifted to 1-based topic numbers.
which_topic_for_which_tweet = list(doc_topic.argmax(axis=1) + 1)

In [None]:
# Share of documents per dominant topic. The loop bound comes from the fitted
# model instead of a literal 10, so it follows any change to n_components.
topic_counts = Counter(which_topic_for_which_tweet)  # one pass instead of a .count() scan per topic
total_assigned = len(which_topic_for_which_tweet)
for topic_number in range(1, lda_model.n_components + 1):
    # Guard against an empty assignment list, which would divide by zero.
    share = round((topic_counts[topic_number] * 100) / total_assigned, 2) if total_assigned else 0.0
    print(f"{share}% of the threads are assigned to topic {topic_number}")

In [None]:
# Number of topic assignments; the list is later inserted as a column into a
# selection of df_en_no_RT_only_thread, so it has to match that frame's row count.
len(which_topic_for_which_tweet)

In [None]:
# Explicit copy: a column is added to df_test afterwards, and that must not
# operate on a selection that may still be tied to df_en_no_RT_only_thread.
df_test = df_en_no_RT_only_thread[['text', 'conversation_id', 'id']].copy()

In [None]:
# assign() instead of insert(): re-running insert() raises ValueError because
# "Topic" already exists. The column still ends up last, i.e. at position 3
# of the three-column selection.
df_test = df_test.assign(Topic=which_topic_for_which_tweet)

In [None]:
# Write the labelled rows (index included) next to the input workbook.
df_test.to_excel(excel_writer='../bellingcat_tweets_with_topic.xlsx')