In [None]:
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from tqdm import tqdm
import numpy as np

tqdm.pandas()

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /homes/es1519/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_csv('/vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/training_data/secondary.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54043 entries, 0 to 54042
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               54043 non-null  object
 1   comment_text     54043 non-null  object
 2   toxicity         54043 non-null  int64 
 3   severe_toxicity  54043 non-null  int64 
 4   obscene          54043 non-null  int64 
 5   threat           54043 non-null  int64 
 6   insult           54043 non-null  int64 
 7   identity_attack  54043 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 3.3+ MB


In [4]:
STOPWORDS = stopwords.words('english')
CUSTOM_STOPWORDS = set(['ukraine', 'russia', 'putin', 'joe', 'biden', 'usa', 'america', 'ukrainian', 'russian', 'war'])

In [5]:
def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS and token not in CUSTOM_STOPWORDS:
            result.append(token)
    return result

# Apply the preprocessing function to the comment_text column
data['processed_text'] = data['comment_text'].progress_apply(preprocess)

100%|██████████| 54043/54043 [00:03<00:00, 14413.08it/s]


In [6]:
texts = data['processed_text']

In [17]:
NUM_TOPICS = 15
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]
lda_model = LdaModel(tqdm(corpus), num_topics=NUM_TOPICS,
                     id2word=dictionary,
                     passes=4, alpha=[0.01]*NUM_TOPICS,
                     eta=[0.01]*len(dictionary.keys()))
topic_words = {}
for i, topic in lda_model.show_topics(num_topics=NUM_TOPICS, formatted=False):
    topic_words[i] = topic
    print('Topic {}: {}'.format(i, ', '.join([word[0] for word in topic])))

100%|██████████| 54043/54043 [00:00<00:00, 81549.26it/s]
100%|██████████| 54043/54043 [00:14<00:00, 3622.57it/s]


Topic 0: border, us, nato, crisis, protect, missiles, attacked, military, kazakhstan, com
Topic 1: china, invade, taiwan, potus, destroy, iran, stand, crimea, billion, obama
Topic 2: oil, us, gas, asking, weapons, bidenisafailure, funding, millions, forces, energy
Topic 3: invade, going, get, afghanistan, go, taliban, president, corruption, withdrawal, bidens
Topic 4: border, troops, trump, ukraines, wake, invasion, moving, southern, deployment, alert
Topic 5: us, want, like, pushing, nato, one, people, get, wants, know
Topic 6: government, us, states, united, coup, nazi, puppet, elected, civil, since
Topic 7: trump, would, nato, invade, wanted, president, said, remember, us, join
Topic 8: false, us, flag, attack, start, occupied, fear, provoke, starting, pretext
Topic 9: invade, president, says, said, us, invasion, days, could, going, made
Topic 10: troops, send, americans, us, sent, american, help, potus, escalating, fight
Topic 11: troops, us, europe, sending, news, via, uk, poland,

In [23]:
topic_tweets = {}

for i, tweet in enumerate(tqdm(data['comment_text'])):
    topic_probs = lda_model.get_document_topics(corpus[i])
    topic, prob = max(topic_probs, key=lambda x: x[1])
    if topic not in topic_tweets:
        topic_tweets[topic] = [(prob, tweet)]
    else:
        topic_tweets[topic].append((prob, tweet))


100%|██████████| 54043/54043 [00:12<00:00, 4245.03it/s]


In [34]:
for topic, tweets in sorted(topic_tweets.items()):
    print(f"Topic {topic}: {', '.join([word[0] for word in topic_words[topic]])}")
    print(f"{len(tweets)} tweets in topic")
    tweets = sorted(tweets, key=lambda x: x[0], reverse=True)
    for (prob, tweet) in tweets[:20]:
        print(f"\t{round(float(prob), 3)}: {tweet}")

Topic 0: border, us, nato, crisis, protect, missiles, attacked, military, kazakhstan, com
3643 tweets in topic
	0.993: POTUS You people have created the war. Why do you need Ukraine to join NATO. Why do you want to put missiles at Ukraine border. Is Russia trying to put missiles. at your border, at Mexico or some other countries. It is you that is always trying to expand near to Russian border
	0.993: POTUS Why is Russia on Ukraines border? To overt WW3 which is what Joe is about to start. The only thing Putin wants is for Ukraine not to be brought into NATO. Joe wants them in so he can put missiles on their border. Putin doesnt want them in to protect Russias Sovereignty.
	0.99: USA were ready to go to war to stop the Soviet Union from putting missiles in Cuba (1962) Now the same USA wants to put Ukraine in Nato and put missiles on the Russian border.
	0.989: Every country has the right to defend its territory...why the US wants to add Ukraine to NATO...No one wants their adversaries 

# Topics

* Topic 4: Trump supporting Putin for his action against Ukraine
* Topic 6: US is pressuring a war in Ukraine
* Topic 7: Trump weakend NATO and Ukraine
* Topic 10: Biden/POTUS refuses to help Americans in Ukraine