# Topic Modelling with Latent Dirichlet Allocation

### Step 1. Load the data

In [5]:
# Load the headlines dataset from a CSV file
import pandas as pd

# `error_bad_lines` is deprecated (it emits a warning on load) and was removed in
# pandas 2.0; `on_bad_lines='skip'` is the supported way to drop malformed rows.
data = pd.read_csv('abcnews-date-text.csv', on_bad_lines='skip')

# Keep only the first 300,000 headlines. Take an explicit copy so that adding a
# column below modifies an independent frame rather than a slice of `data`.
data_text = data[:300000][['headline_text']].copy()
data_text['index'] = data_text.index

documents = data_text

# Total number of documents
print(f'The total number of documents is: {len(documents)}')



  data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);


The total number of documents is: 300000


In [6]:
# Preview the first rows of the headlines frame (headline text plus the copied index column)
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [10]:
!git push

To https://github.com/CalesSla/NLP_Topic_Modelling_LDA.git
   7fa65f6..bd70809  master -> master


### Step 2. Imports and data preprocessing

In [14]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
nltk.download('wordnet')
np.random.seed(400)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
# Lemmatizer example: reduce an inflected verb to its base form
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('went', pos='v'))

go


In [17]:
# Stemmer example: show a handful of words next to their Snowball stems
stemmer = SnowballStemmer("english")
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]
singles = list(map(stemmer.stem, original_words))

pd.DataFrame(data={'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [22]:
# Stemming and lemmatization helpers used on the entire dataset

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the resulting lemma.

    The stemming step uses the module-level ``stemmer`` object.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

# Tokenize and lemmatize

def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, stemmed tokens as a list.

    Tokens are produced by ``gensim.utils.simple_preprocess``; any token that is
    a gensim stopword or has 3 or fewer characters is dropped before being
    passed to ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [21]:
# Document example after preprocessing

document_num = 4310
# Select the headline by column name instead of by position in `.values`,
# so the lookup does not depend on the order of the DataFrame's columns.
doc_sample = documents.loc[documents['index'] == document_num, 'headline_text'].iloc[0]

print("Original document: ")
# str.split already returns the list of words; no manual accumulation is needed.
words = doc_sample.split(' ')
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['rain', 'helps', 'dampen', 'bushfires']


Tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [24]:
# Run the preprocessing pipeline over every headline and preview the first three results
processed_docs = documents['headline_text'].map(preprocess)
processed_docs.head(3)

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
Name: headline_text, dtype: object

### Step 3. Bag of Words approach on the dataset

In [25]:
# Build a gensim Dictionary from the preprocessed headlines.
# The Dictionary assigns an integer id to each unique token; these ids are what
# appear in the bag-of-words vectors built from it with `doc2bow`.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [27]:
# Remove too rare and too common words from the dictionary.
# Per the gensim documentation for `filter_extremes`:
#   no_below=5    -> drop tokens that appear in fewer than 5 documents
#   no_above=0.5  -> drop tokens that appear in more than 50% of the documents
#   keep_n=100000 -> after the above, keep only the 100000 most frequent tokens
# NOTE: this call modifies `dictionary` in place.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [31]:
# Convert every preprocessed headline into its bag-of-words form,
# i.e. a list of (token id, count) pairs, then show the sample document's vector
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[document_num]

[(76, 1), (113, 1), (482, 1), (4016, 1)]