In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import random
import spacy

from gensim import models, corpora
from gensim import similarities
from gensim.models.coherencemodel import CoherenceModel
from wordcloud import WordCloud

In [2]:
# Load the LDA topic model stored in "lda_model.gensim".
lda_model = models.LdaModel.load("lda_model.gensim")

In [3]:
# Load the gensim Dictionary stored in "lda_dictionary.gensim".
dictionary = corpora.Dictionary.load("lda_dictionary.gensim")

In [4]:
# Open the corpus stored in "lda_corpus.mm"; the cells below index it by
# article position (corpus_bow[article_idx]).
corpus_bow = corpora.MmCorpus("lda_corpus.mm")  # Load corpus

In [5]:
# Number of entries in the loaded dictionary.
len(dictionary)

31584

In [6]:
# Inspect the highest-weighted words of each topic.
lda_model.print_topics()

[(0,
  '0.027*"family" + 0.015*"child" + 0.013*"life" + 0.012*"tell" + 0.012*"mother" + 0.011*"father" + 0.011*"day" + 0.011*"old" + 0.011*"son" + 0.010*"die"'),
 (1,
  '0.023*"film" + 0.018*"movie" + 0.014*"star" + 0.014*"music" + 0.012*"fan" + 0.011*"play" + 0.010*"book" + 0.010*"character" + 0.009*"song" + 0.009*"tv"'),
 (2,
  '0.020*"water" + 0.011*"people" + 0.011*"report" + 0.011*"ship" + 0.011*"oil" + 0.010*"fire" + 0.010*"storm" + 0.009*"area" + 0.007*"gas" + 0.007*"state"'),
 (3,
  '0.013*"country" + 0.010*"military" + 0.010*"russian" + 0.010*"official" + 0.009*"government" + 0.009*"nuclear" + 0.008*"israeli" + 0.007*"international" + 0.007*"leader" + 0.006*"talk"'),
 (4,
  '0.021*"police" + 0.013*"charge" + 0.013*"court" + 0.013*"case" + 0.012*"tell" + 0.012*"report" + 0.010*"authority" + 0.010*"arrest" + 0.009*"accord" + 0.008*"officer"'),
 (5,
  '0.026*"think" + 0.023*"know" + 0.020*"people" + 0.016*"go" + 0.015*"want" + 0.014*"thing" + 0.012*"good" + 0.012*"get" + 0.011*"w

In [8]:
# Human-readable label for each LDA topic id (labels were extracted from an LLM).
# A label's position in the list is its topic id; the trailing comments spell
# the ids out so the mapping stays easy to read.
topic_names = dict(enumerate([
    "Family & Relationships",             # 0
    "Entertainment & Media",              # 1
    "Environmental & Natural Disasters",  # 2
    "International Affairs & Military",   # 3
    "Crime & Law Enforcement",            # 4
    "Thoughts & Opinions",                # 5
    "Health & Drugs Research",            # 6
    "Education & School Safety",          # 7
    "War & Terrorism",                    # 8
    "Elections & Politics",               # 9
    "Economy & Business",                 # 10
    "Transportation & Aviation",          # 11
    "Technology & Computing",             # 12
    "Art & Aesthetics",                   # 13
    "Law & Legal Issues",                 # 14
    "Urban Life & Infrastructure",        # 15
    "Healthcare & Medicine",              # 16
    "Social Movements & Protests",        # 17
    "Gender & Identity",                  # 18
    "Sports & Competitions",              # 19
]))

In [None]:
# Read the whole article dump as UTF-8 text and cut it into individual
# articles at every '@delimiter' marker.
with open('input_all_articles.txt', encoding='utf8') as f:
    articles = f.read().split('@delimiter')

In [21]:
# Peek at the first article produced by the split.
article = articles[0]
article

' -- Children in war-ravaged Afghanistan are safer than those growing up in London or New York, NATO\'s top civilian envoy says.\n\nMark Sedwill, the senior civilian representative for NATO, made the comments on an episode of CBBC\'s "Newsround," which is airing Monday.\n\nIn the show -- a BBC current-affairs program for children -- several youngsters in Kabul, Afghanistan, say they are afraid of daily violence and the frequent explosions in their war-torn country.\n\nIn response, Sedwill says: "Here in Kabul, and other big cities actually, there are very few of these bombs. The children are probably safer here than they would be in London, New York or Glasgow or many other cities.\n\n"Most children can go about their lives in safety. It\'s a very family-oriented society. So it is a little bit like a city of villages," he added.\n\nA U.N. report released earlier this year seems to contradict Sedwill\'s assessment.\n\nThe February report, by the special representative for children and a

In [25]:
def get_topic_id_from_topic_name(topic_name):
    """Reverse lookup in ``topic_names``: label -> topic id.

    Args:
        topic_name: Label to search for among the values of ``topic_names``.

    Returns:
        The first key (in dict order) whose value equals ``topic_name``,
        or ``None`` when no label matches.
    """
    matching_ids = (
        topic_id
        for topic_id, label in topic_names.items()
        if label == topic_name
    )
    return next(matching_ids, None)

In [None]:
def get_topics_of_article(article_idx, min_topic_prob=0):
    """Return the labelled topic distribution of one article, most probable first.

    Args:
        article_idx: Position of the article in ``corpus_bow``.
        min_topic_prob: Forwarded to ``lda_model.get_document_topics`` as
            ``minimum_probability``.

    Returns:
        A list of ``[topic_name, probability]`` pairs ordered by descending
        probability, with ids translated to labels through ``topic_names``.
    """
    doc_topics = lda_model.get_document_topics(
        corpus_bow[article_idx],
        minimum_probability=min_topic_prob,
    )
    # Stable ascending sort followed by a reversal (not reverse=True), so that
    # entries with equal probability keep the same relative order as before.
    ascending = sorted(doc_topics, key=lambda entry: entry[1])
    return [[topic_names[entry[0]], entry[1]] for entry in reversed(ascending)]

In [27]:
# Label every article with its most probable topic.
# Article ids are simply the positions 0..len(articles)-1.
column_article_ids = list(range(len(articles)))
column_topics = []
column_topic_ids = []
for i in column_article_ids:
    # get_topics_of_article returns [name, prob] pairs, most probable first,
    # so element [0][0] is the name of the dominant topic.
    topic_name = get_topics_of_article(i)[0][0]
    column_topics.append(topic_name)
    column_topic_ids.append(get_topic_id_from_topic_name(topic_name))

In [32]:
# Assemble the per-article labels into one table: one row per article with
# its id, the dominant topic's id, and that topic's label.
df = pd.DataFrame(
    data=dict(
        article_id=column_article_ids,
        topic_id=column_topic_ids,
        topic_name=column_topics,
    )
)

In [33]:
# Display the labelled table (one row per article).
df

Unnamed: 0,article_id,topic_id,topic_name
0,0,8,War & Terrorism
1,1,10,Economy & Business
2,2,2,Environmental & Natural Disasters
3,3,14,Law & Legal Issues
4,4,17,Social Movements & Protests
...,...,...,...
111420,111420,16,Healthcare & Medicine
111421,111421,13,Art & Aesthetics
111422,111422,13,Art & Aesthetics
111423,111423,5,Thoughts & Opinions


In [None]:
# Write the labelled table to CSV; index=False leaves the DataFrame index out of the file.
df.to_csv("output_labelled_dataset.csv",index=False)