In [34]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from tqdm import tqdm

tqdm.pandas()

In [35]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /homes/es1519/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
data = pd.read_csv('/vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/training_data/secondary.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54043 entries, 0 to 54042
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               54043 non-null  object
 1   comment_text     54043 non-null  object
 2   toxicity         54043 non-null  int64 
 3   severe_toxicity  54043 non-null  int64 
 4   obscene          54043 non-null  int64 
 5   threat           54043 non-null  int64 
 6   insult           54043 non-null  int64 
 7   identity_attack  54043 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 3.3+ MB


In [42]:
STOPWORDS = stopwords.words('english')
CUSTOM_STOPWORDS = set(['ukraine', 'russia', 'putin', 'joe', 'biden', 'usa', 'america', 'ukrainian', 'russian', 'war'])

In [43]:
def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS and token not in CUSTOM_STOPWORDS:
            result.append(token)
    return result

# Apply the preprocessing function to the comment_text column
data['processed_text'] = data['comment_text'].progress_apply(preprocess)

100%|██████████| 54043/54043 [00:03<00:00, 14534.77it/s]


In [44]:
texts = data['processed_text']

In [45]:
NUM_TOPICS = 15
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]
lda_model = LdaModel(tqdm(corpus), num_topics=NUM_TOPICS,
                     id2word=dictionary,
                     passes=4, alpha=[0.01]*NUM_TOPICS,
                     eta=[0.01]*len(dictionary.keys()))
for i, topic in lda_model.show_topics(num_topics=NUM_TOPICS, formatted=False):
    print('Topic {}: {}'.format(i, ', '.join([word[0] for word in topic])))

100%|██████████| 54043/54043 [00:00<00:00, 109809.84it/s]
100%|██████████| 54043/54043 [00:14<00:00, 3632.57it/s]


Topic 0: troops, us, europe, sending, eastern, border, poland, invasion, via, forces
Topic 1: oil, gas, sanctions, attacked, pipeline, crisis, missile, money, dollars, energy
Topic 2: us, uk, news, cia, nato, co, conflict, eu, john, overthrow
Topic 3: states, united, nato, military, withdrawal, reasons, possible, administration, backing, certain
Topic 4: us, government, coup, nazis, elected, puppet, civil, nazi, people, funding
Topic 5: china, invade, false, flag, obama, taiwan, remember, violent, son, president
Topic 6: invade, president, says, said, us, could, would, news, via, invasion
Topic 7: crime, stand, moves, white, afghanistan, defending, economy, house, gt, top
Topic 8: us, weapons, stop, want, like, seen, kyiv, arms, sell, secblinken
Topic 9: border, troops, us, get, country, protect, go, people, american, feel
Topic 10: nato, us, would, join, invade, part, put, also, wanted, want
Topic 11: nato, us, potus, back, china, iran, world, provoking, foreign, destroy
Topic 12: inv