## Topic Modelling for News Articles

In [1]:
#pip install gensim
#pip install pyLDAvis

In [2]:

#Initial Imports
import pandas as pd
import nltk as nltk
import numpy as np

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

nltk.download('stopwords')

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aerku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [3]:
# Getting all the news articles 
articles = pd.read_csv('articles.csv', encoding="utf-8-sig")

In [4]:
articles.head()

Unnamed: 0,date,title,text,description
0,2020-12-31,$1B Intel investor says chipmaker must try to ...,An Intel investor with a billion dollar stake ...,An Intel investor with a billion dollar stake ...
1,2020-12-31,"The Morning After: Intel, AMD and Apple made 2...",A former Songkick employee shared his login in...,"Very, very soon, 2020 will be over. Did it tak..."
2,2020-12-31,Some Mac software has made it all the way from...,Mac developers are currently in the midst of a...,Mac developers are currently in the midst of a...
3,2020-12-31,Some Mac software has made it all the way from...,Mac developers are currently in the midst of a...,Mac developers are currently in the midst of a...
4,2020-12-31,"How to revive and restore M1 Macs, what the di...","Along with the shift to Apple Silicon, perform...","Along with the shift to Apple Silicon, perform..."


In [5]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6639 entries, 0 to 6638
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         6639 non-null   object
 1   title        6639 non-null   object
 2   text         6639 non-null   object
 3   description  6618 non-null   object
dtypes: object(4)
memory usage: 207.6+ KB


In [6]:
# Cleaning the data by droping the null values
articles.dropna(inplace=True)

In [7]:
# Clean the data 

# Instantiate the lemmatizer
lemmatizer = WordNetLemmatizer()

# Create a list of stopwords
stop = stopwords.words('english')

# Expand the default stopwords list if necessary
stop.extend(["going", "still", "last", "think", "see", "way", "n't", "well", "know", "much", "’", "top", "app", "/li", "li", "...", "'s", "_", "--", "one", "said", "td", "gb", "tr"])


#Set tokenization function
def clean_text(text):
    
    words = word_tokenize(text)
    
    words = list(filter(lambda w:w.lower(), words))
 
    words = list(filter(lambda t:t not in punctuation, words))
    
    words = list(filter(lambda t: t.lower() not in stop, words))
    
    token = [lemmatizer.lemmatize(word) for word in words]
    
    return token

In [8]:
# Applying the clean text function to the text
data_words = articles['text'].apply(clean_text)

data_words[0][:20]

['Intel',
 'investor',
 'billion',
 'dollar',
 'stake',
 'chipmaker',
 'say',
 'need',
 'make',
 'drastic',
 'change',
 'address',
 'changed',
 'fortune',
 'including',
 'attempting',
 'win',
 'back',
 'Apple',
 'client']

In [9]:
# Defining Gensim word simple preprocess 

def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)
#data_words = gen_words(dwords)


In [10]:

id2word = corpora.Dictionary(data_words)


corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

print(corpus[0][0:20])

word = id2word[[0][:1][0]]
print(word)


#from gensim.models import TfidfModel
#tfidf = TfidfModel(corpus, id2word=id2word)


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 6), (15, 2), (16, 1), (17, 2), (18, 2), (19, 2)]
10-figure


In [11]:
# LDA Model

lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, 
                                           id2word = id2word, 
                                           num_topics= 7, 
                                           random_state = 100,
                                           update_every=1, 
                                           alpha = 'auto')

In [12]:
#Topics
print(lda_model.print_topics())


[(0, '0.042*"Apple" + 0.025*"Qualcomm" + 0.014*"iPhone" + 0.014*"Intel" + 0.014*"5G" + 0.013*"modem" + 0.013*"chip" + 0.009*"company" + 0.008*"year" + 0.007*"patent"'), (1, '0.031*"Apple" + 0.012*"new" + 0.009*"Mac" + 0.006*"year" + 0.006*"iPhone" + 0.005*"iPad" + 0.005*"Pro" + 0.005*"``" + 0.005*"\'\'" + 0.005*"also"'), (2, '0.025*"Visa" + 0.012*"/td" + 0.009*"Mastercard" + 0.009*"Apple" + 0.007*"Pro" + 0.006*"year" + 0.006*"Mac" + 0.005*"new" + 0.004*"iMac" + 0.004*"Intel"'), (3, '0.036*"Amazon" + 0.024*"price" + 0.023*"list" + 0.018*"deal" + 0.017*"normally" + 0.011*"Walmart" + 0.008*"code" + 0.008*"RAM" + 0.008*"Apple" + 0.007*"use"'), (4, '0.008*"company" + 0.006*"Apple" + 0.006*"Huawei" + 0.006*"like" + 0.005*"also" + 0.005*"device" + 0.005*"Google" + 0.005*"year" + 0.004*"new" + 0.004*"laptop"'), (5, '0.018*"Pro" + 0.015*"MacBook" + 0.014*"Apple" + 0.012*"laptop" + 0.011*"Intel" + 0.009*"model" + 0.008*"new" + 0.008*"Core" + 0.007*"processor" + 0.006*"display"'), (6, '0.012*"com

# Visualization of LDA model

In [21]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds', R=30)
vis