In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from spacy import load

import pandas as pd
import re

  from collections import Iterable
  from collections import Mapping


In [None]:
# Spacy is loaded for text processing tasks.
# `load` is the function imported from the spacy package in the first cell;
# it is called here with the model name 'en' and the result is kept in `nlp`.
# NOTE(review): 'en' looks like a spaCy shortcut name - newer spaCy releases
# presumably require the full package name (e.g. 'en_core_web_sm'); confirm
# against the installed version before running this cell.
nlp = load('en')

In [3]:
# Load the newsgroup CSV and keep only the 'Text' column, which holds the
# raw documents that the rest of the notebook works on.
corpus = pd.read_csv(
    'C:/Users/admin/Desktop/topic modelling/20newsgroup.csv',
    encoding='utf-8',
)
data_set = corpus['Text']
print("Size of data set is %s." % len(data_set))

Size of data set is 19955.


In [4]:
def _nlp_layer(x_series):
    print("Performing text normalization.")
    contractions_list = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i'am": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it had",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there had",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we had",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'alls": "you alls",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you had",
        "you'd've": "you would have",
        "you'll": "you you will",
        "you'll've": "you you will have",
        "you're": "you are",
        "you've": "you have"
    }

    c_re = re.compile('(%s)' % '|'.join(contractions_list.keys()))

    def expand_contractions(text, c_re_=c_re):
        def replace(match):
            return contractions_list[match.group(0)]

        return c_re_.sub(replace, text)

    x = [re.sub('[^0-9a-z\' ]+', ' ', item.lower()).split() for item in x_series]
    processed_x = []
    for eachQuery in x:
        query = []
        for eachToken in eachQuery:
            expanded_token = expand_contractions(eachToken)
            # Applied to normalize words like word's, mother's, 1980's etc.
            if '\'' in expanded_token:
                expanded_token = expanded_token.strip('\'s')
            # Replace alpha-numeric with special token 'Special_Tok'
            if expanded_token.isalnum() and not expanded_token.isalpha() and not expanded_token.isdigit():
                expanded_token = 'SpecialTok'
            # Replace all numerals with special token - 'NUM'
            if expanded_token.isdigit():
                expanded_token = 'NUM'
            query.extend(expanded_token.split())
        processed_x.append(query)
    return processed_x

In [5]:
# Normalize every document, then join each token list back into a single
# space-separated string for the vectorizer in the next cell.
data_set_processed = _nlp_layer(data_set)
corpus_for_tf = [" ".join(tokens) for tokens in data_set_processed]

Performing text normalization.


In [6]:
# Term-count features: unigrams only, English stop words dropped, and the
# vocabulary limited via max_features=100000.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_features=100000,
)
tf = tf_vectorizer.fit_transform(corpus_for_tf)

In [7]:
# tf.shape is the (n_samples, n_features) pair, which fills the two %d slots.
print("Fitting LDA model with term frequency features, n_samples=%d and n_features=%d." % tf.shape)
# 20 topics, online learning, fixed seed so reruns give the same topics.
lda = LatentDirichletAllocation(
    n_components=20,
    learning_method='online',
    learning_offset=50.,
    max_iter=50,
    random_state=0,
)
# Kept as the last expression so the notebook still shows the fitted estimator.
lda.fit(tf)

Fitting LDA model with term frequency features, n_samples=19955 and n_features=100000.


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [8]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` holds the term weights of one topic.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        Number of terms to list per topic, heaviest first.
    """
    for index, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        terms = ",".join(feature_names[position] for position in ranked)
        print(("Topic #%d: " % index) + terms, "\n")

In [9]:
print("Topics in LDA model and associated terms:")
# Newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); prefer the new method when the installed version
# has it and fall back to the old one otherwise, so the cell runs on both.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
# Show the 20 highest-weighted terms of every topic.
print_top_words(lda, tf_feature_names, 20)

Topics in LDA model and associated terms:
Topic #0: specialtok,ax,num,max,pl,db,cx,bhj,giz,wm,ah,sl,bh,mq,gk,chz,bxn,mv,hz,lk 

Topic #1: armenian,turkish,armenians,men,said,children,people,women,arab,muslims,muslim,armenia,sex,sexual,history,cancer,islam,today,party,years 

Topic #2: virginia,henry,myers,harvard,spencer,acc,hess,ryan,liver,erik,vote,murdoch,motto,waldrop,zoology,dzkriz,uio,galaxy,ndet,bob 

Topic #3: edu,atf,mil,umd,navy,ra,cs,ai,uga,michael,colostate,boeing,wam,maine,acns,racism,rpi,john,div,oakland 

Topic #4: specialtok,com,edu,writes,article,uiuc,netcom,cso,sin,andrew,news,cmu,hell,brian,mary,david,isc,opinions,ca,steve 

Topic #5: nra,tires,motorcycles,circuits,arf,tire,lehigh,seats,lunar,gretzky,nsmca,obo,kilometers,logo,tampa,moncton,transportation,adirondack,champs,kirlian 

Topic #6: num,specialtok,new,israel,christ,edu,university,april,guns,national,american,period,bike,st,war,team,ed,john,total,league 

Topic #7: people,god,government,say,think,writes,belie