In [11]:
#to read the file
import pandas as pd

# to preprocess the file
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# to model the topic
import gensim
from gensim import corpora
from gensim.models import LdaMulticore

In [3]:
df = pd.read_csv("news_dataset.csv")
df['text'] = df['text'].astype(str)

In [4]:
df.shape

(11314, 5)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2022-08-02 13:48:37.251043
1,17,I recently posted an article asking what kind ...,7,rec.autos,2022-08-02 13:48:37.251043
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2022-08-02 13:48:37.251043
3,56,an excellent automatic can be found in the sub...,7,rec.autos,2022-08-02 13:48:37.251043
4,64,: Ford and his automobile. I need information...,7,rec.autos,2022-08-02 13:48:37.251043


In [6]:
df.duplicated().sum()

0

In [7]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
#declare a var called stemmer to store stemming value in
stemmer = nltk.SnowballStemmer("english")

In [9]:
# preprocessing function 
def preprocess(text):
    text = text.lower()  
    text = nltk.word_tokenize(text)
    text = re.sub(r'\[.*?\]', '', str(text))  
    text = re.sub(r'http\S+\s*\S+', '', text)  
    text = re.sub(r'www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    text = ' '.join(stemmer.stem(word) for word in text.split(' ')) 
    return text

In [10]:
#apply the basic text preprocessing
df['text_clean'] = df['text'].apply(preprocess)

In [12]:
df['text_clean']

0                                                         
1         motorola cellular subscrib group   usa cc    ...
2                                                         
3                                                         
4                                                         
                               ...                        
11309                                                     
11310                                                     
11311                                                     
11312                                                     
11313                                                     
Name: text_clean, Length: 11314, dtype: object

In [13]:
#convert my dataframe to a list
documents = df['text_clean'].tolist()

# Tokenize the docments
tokenized_docs = [[token for token in doc.split()] for doc in documents]

# Create the dictionary
dictionary = corpora.Dictionary(tokenized_docs) 

# Convert the documents into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(text) for text in tokenized_docs]

In [14]:
# Train the LDA model with 4 topics
lda_model = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = dictionary,
                                       num_topics = 4)

In [15]:
# Print the topics
print(lda_model.print_topics())

[(0, '0.119*"ax" + 0.024*"x" + 0.006*"max" + 0.005*"file" + 0.005*"use" + 0.005*"nt" + 0.003*"would" + 0.003*"program" + 0.003*"one" + 0.003*"peopl"'), (1, '0.015*"ax" + 0.009*"nt" + 0.006*"one" + 0.006*"use" + 0.006*"x" + 0.004*"peopl" + 0.004*"would" + 0.003*"say" + 0.003*"time" + 0.003*"god"'), (2, '0.215*"ax" + 0.018*"max" + 0.007*"x" + 0.005*"use" + 0.004*"w" + 0.004*"nt" + 0.004*"q" + 0.004*"one" + 0.004*"p" + 0.003*"_"'), (3, '0.036*"ax" + 0.006*"nt" + 0.006*"one" + 0.006*"use" + 0.003*"like" + 0.003*"max" + 0.003*"get" + 0.003*"peopl" + 0.003*"would" + 0.003*"file"')]


In [17]:
# Iterate over each processed document
doc_topics = []
for doc_lda in lda_model[corpus]:
    
    # Convert the document to a box representation
    doc_box = [0] * lda_model.num_topics
    for topic_id, topic_prob in doc_lda:
        doc_box[topic_id] = topic_prob
    
    # Determine the topic with the highest probability
    dominant_topic = doc_box.index(max(doc_box))
    
    # Append the dominant topic to the list
    doc_topics.append(dominant_topic)

In [18]:
print(doc_topics)

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [19]:
# Create DataFrame
df = pd.DataFrame({"Article": documents, "Topic": doc_topics})

# Print the DataFrame
print("Table with Articles and Topic:")
print(df)
print()

Table with Articles and Topic:
                                                 Article  Topic
0                                                             0
1       motorola cellular subscrib group   usa cc    ...      1
2                                                             0
3                                                             0
4                                                             0
...                                                  ...    ...
11309                                                         0
11310                                                         0
11311                                                         0
11312                                                         0
11313                                                         0

[11314 rows x 2 columns]



In [21]:
# Print the top terms for each topic
print("Top terms for each topic:")
for topic_id in range(lda_model.num_topics):
    topic_terms = [term for term, _ in lda_model.show_topic(topic_id, topn=5)]
    print(f"Topic {topic_id}: {', '.join(topic_terms)}")
print()

Top terms for each topic:
Topic 0: ax, x, max, file, use
Topic 1: ax, nt, one, use, x
Topic 2: ax, max, x, use, w
Topic 3: ax, nt, one, use, like



In [22]:
# Assuming you have an LDA model stored in the variable 'lda_model'
# and the number of topics is stored in 'num_topics'

print("Top Terms for Each Topic:")
for topic_idx in range(lda_model.num_topics):
    print(f"Topic {topic_idx}:")
    
    # Get the top terms and their weights for the current topic
    topic_terms = lda_model.get_topic_terms(topic_idx, topn=5)
    
    # Print the top terms and their weights
    for term, weight in topic_terms:
        word = lda_model.id2word[term]
        print(f"- {word} (weight: {weight:.3f})")
    
    print()

Top Terms for Each Topic:
Topic 0:
- ax (weight: 0.119)
- x (weight: 0.024)
- max (weight: 0.006)
- file (weight: 0.005)
- use (weight: 0.005)

Topic 1:
- ax (weight: 0.015)
- nt (weight: 0.009)
- one (weight: 0.006)
- use (weight: 0.006)
- x (weight: 0.006)

Topic 2:
- ax (weight: 0.215)
- max (weight: 0.018)
- x (weight: 0.007)
- use (weight: 0.005)
- w (weight: 0.004)

Topic 3:
- ax (weight: 0.036)
- nt (weight: 0.006)
- one (weight: 0.006)
- use (weight: 0.006)
- like (weight: 0.003)

