In [None]:
import numpy as np
import wordninja
import pandas as pd
import re
import string
import nltk
import wordninja
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import csv

## Preprocessing

In [None]:
# Load the raw articles into a DataFrame.
# `index` and `encoding` are not read_excel parameters (current pandas raises
# TypeError: unexpected keyword argument), so they are dropped here. Any
# 'Unnamed: 0' column therefore stays a regular column, as it did before.
df = pd.read_excel(r'xxxx')

In [None]:
# Sanity check: the (rows, columns) shape should be consistent with the
# expected number of articles for each year.
df.shape

In [None]:
# Normalise the text column: lower-case it, then strip digits and newswire
# boilerplate (live-blog markers, time-zone abbreviations, "summary", "pm").
df['Hlead'] = df['Hlead'].str.lower()

# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, so r'\d+' would silently stop removing digits.
df['Hlead'] = df['Hlead'].str.replace(r'\d+', '', regex=True)

# Plain substrings, removed in the same order as before.
# NOTE(review): these match inside words too (e.g. 'pm' in "development",
# 'bst' in "substantial") - confirm that is acceptable for this corpus.
for boilerplate in ('block-time', 'published-time', 'gmt', 'bst', 'aest', 'summary', 'pm'):
    df['Hlead'] = df['Hlead'].str.replace(boilerplate, '', regex=False)

# Re-segment run-together words. The previous loop called wordninja.split()
# and discarded the result, so it cost time without changing the data.
# The pieces are re-joined with spaces because later steps expect a string.
df['Hlead'] = df['Hlead'].apply(lambda text: ' '.join(wordninja.split(text)))
    
def remove_punctuation(text):
    """Return `text` with every character found in string.punctuation removed."""
    kept = []
    for char in text:
        if char in string.punctuation:
            continue
        kept.append(char)
    return ''.join(kept)
# Strip punctuation from every document in the Hlead column.
df['Hlead'] = df['Hlead'].apply(remove_punctuation)

In [None]:
# Turn each document into a list of tokens, where a token is a run of \w characters.
tokenizer = RegexpTokenizer(r'\w+')
df['Hlead'] = df['Hlead'].apply(tokenizer.tokenize)

In [None]:
# English stop words, loaded lazily on first use and then reused.
_ENGLISH_STOPWORDS = None


def remove_stops(text, stop_words=None):
    """Return the tokens of `text` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        # The previous version called stopwords.words('english') once per
        # token and searched the resulting list linearly; load it once and
        # keep it as a set for O(1) membership tests.
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [w for w in text if w not in stop_words]

# Filter stop words out of every tokenised document, then signal completion.
df['Hlead'] = df['Hlead'].apply(remove_stops)
print('done')

In [None]:
# One shared WordNet lemmatizer instance, reused for every document.
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Return a list holding the lemma of each token in `text`, in order."""
    return list(map(lemmatizer.lemmatize, text))


df['Hlead'] = df['Hlead'].apply(lemmatize_text)

In [None]:
# Persist the preprocessed frame without writing the row index.
# NOTE(review): Hlead holds Python lists of tokens at this point; presumably
# Excel stores them as text - confirm they are parsed back correctly on reload.
df.to_excel(r'xxxx', index=False)

## Topic Modeling

In [None]:
# Reload the preprocessed articles for the topic-modelling stage.
df= pd.read_excel(r'xxxx')

In [None]:
# drop() returns a new frame; the result used to be discarded, so the stray
# index column was never removed. errors='ignore' keeps the cell re-runnable
# when the column is absent.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


def _restore_tokens(cell):
    """Turn one reloaded Hlead cell back into a list of tokens."""
    if isinstance(cell, list):
        return cell  # already tokenised (e.g. the cell was re-run)
    if isinstance(cell, str):
        # Token lists come back from Excel as text such as "['a', 'b']".
        # Splitting on spaces left brackets, quotes and commas glued to the
        # tokens; since every token was produced by a \w+ tokenizer, pulling
        # out the \w+ runs recovers them exactly (and still handles plain
        # space-separated text).
        return re.findall(r'\w+', cell)
    return []  # empty / missing cell


df["Hlead"] = df["Hlead"].map(_restore_tokens)

In [None]:
# Build the vocabulary: a gensim Dictionary mapping each distinct token to an integer id.
# (The dictionary itself is only the id mapping; the document-term counts live in `corpus` below.)
dictionary = corpora.Dictionary(df['Hlead'])
# Convert every document to its bag-of-words vector using that dictionary.
corpus= [dictionary.doc2bow(text) for text in df['Hlead']]

In [None]:
# Peek at the bag-of-words vector of the first document.
print(corpus[0])

In [None]:
#corpus= vectorised corpus
#num_topics= self-explanatory
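#iterations= maximum number of inference iterations per document when estimating its topic distribution (the number of full passes over the corpus is the separate `passes` argument)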
#alpha= document-topic density. The higher the alpha, the more topics within a doc, and lower= vice versa
#id2word= df['hlead'] converted into a gensim dictionary which maps strings to integers. 
#random_state= either a RandomState object or a seed to generate one. 

In [None]:
## Random state is consistent
# Settings and resulting coherence recorded for each period:
#Period 1, K= 22, iter= 1500, coherence= 0.39 
#Period 2, K= 20 , iter= 1500, coherence= 0.46
#Period 3, K= 28??, iter= 1500, coherence= 0.44

k= 20 #num of topics
# Fit LDA on the bag-of-words corpus with k topics and a fixed seed (random_state=100).
lda= LdaModel(corpus, num_topics=k, iterations= 1500, alpha= 'auto', id2word= dictionary, random_state=100) 

In [None]:
# Documents on the rows and topic proportions on the columns
# (minimum_probability = 0 asks for every topic rather than filtering small values).
# NOTE: `theta` is reassigned to a sparse matrix in the
# "Determining what a topic is about" section further down.
theta = lda.get_document_topics(corpus, minimum_probability = 0)

# Topics on the rows and words on the columns
phi = lda.get_topics()

In [None]:
# Compute Coherence Score using c_v
# The tokenised documents are passed as `texts` alongside the dictionary used to build the corpus.
coherence_model_lda = CoherenceModel(model=lda, texts=df['Hlead'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3,
                             random_state=100, **lda_kwargs):
    """Fit one LDA model per candidate topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary : gensim Dictionary used to build `corpus`.
    corpus : bag-of-words corpus.
    texts : tokenised documents, used by the c_v coherence measure.
    limit : exclusive upper bound on the number of topics.
    start : first number of topics to try.
    step : increment between successive topic counts.
    random_state : seed passed to every LdaModel so the sweep is reproducible.
        Defaults to 100, the seed used for the main model in this notebook.
    **lda_kwargs : extra keyword arguments forwarded to LdaModel (for example
        iterations=1500, alpha='auto') so the sweep can use the same settings
        as the final model.

    Returns
    -------
    (model_list, coherence_values) : two lists aligned with
        range(start, limit, step).
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Without a fixed seed each run produced different models, so the
        # coherence curve could not be reproduced.
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                         random_state=random_state, **lda_kwargs)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Sweep the number of topics and plot the c_v coherence of each candidate model.
import matplotlib.pyplot as plt

# start = first topic count tried, limit = exclusive upper bound, step = increment.
# Defined once, before the sweep, so the models and the x-axis cannot drift apart.
start = 15; limit = 75; step = 5
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=df['Hlead'], start=start, limit=limit, step=step)

# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# ("coherence_values") is a plain string, not a tuple: passed as the legend
# labels it is consumed character by character and the line was labelled "c".
# Labelling the line itself and calling legend() without labels fixes that.
plt.legend(loc='best')
plt.show()

# Topic interpretation (analysis)


- Start by analysing the keywords associated with each topic
- Proceed to examine the assignment frequencies (i.e. what share of the corpus' words is assigned to each topic)
- Analyse the semantic coherence (i.e. how much sense the topics make)

## Keywords

We start the process of topic interpretation and validation by examining the topic **keywords**. These "keywords" are probable tokens under the model -- i.e., the tokens most often assigned to a particular topic. We can inspect one topic at a time with the `show_topic()` method of the LDA object, or all topics at once with its `show_topics()` method:

In [None]:
# Print the 20 most probable words of each of the k topics, one topic per line.
keys = lda.show_topics(num_topics=k, num_words =20 )
for key in keys:
    print(key)

## Nice visualisation

In [None]:
from prettytable import PrettyTable

def view_keywords(model, num_topics, num_words = 20, prettyprint = True):
    """Collect the top keywords of each topic and optionally print them as a table.

    Returns a list of [topic_id, keywords] rows, where keywords is the
    space-joined string of the topic's `num_words` most probable words.
    When `prettyprint` is true the same rows are printed with PrettyTable.
    """
    # Unformatted output: one (topic_id, [(word, weight), ...]) entry per topic
    shown = model.show_topics(num_topics = num_topics,
                              num_words = num_words,
                              formatted=False)

    # Keep only the words, joined into one string per topic
    output = [
        [entry[0], ' '.join(pair[0] for pair in entry[1])]
        for entry in shown
    ]

    # Nothing to print: hand the rows straight back
    if not prettyprint:
        return output

    # Left-aligned two-column table, one row per topic
    tbl = PrettyTable()
    tbl.field_names = ["Topic ID", "Keywords"]
    for record in output:
        tbl.add_row(record)
    tbl.align = "l"
    print(tbl)

    return output

In [None]:
# The printed table lists each topic's words ranked in order of prevalence.
# `keywords` keeps the [topic_id, keyword string] rows for later cells.
keywords = view_keywords(lda, lda.num_topics, num_words = 20)

## Determining what a topic is about
This section extracts the most probable documents for each topic

In [None]:
def top_documents(content, topic_id, theta, n = 10):
    """Return the `n` documents with the highest proportion of one topic.

    Parameters
    ----------
    content : sequence of documents, in the same order as the columns of `theta`.
    topic_id : row of `theta` to rank by.
    theta : topics x documents matrix (scipy sparse or dense 2-D array).
    n : number of documents to return; clamped to the number available.

    Returns
    -------
    list
        Documents ordered from highest to lowest topic proportion.
    """
    row = theta[topic_id, :]
    if hasattr(row, 'todense'):
        row = row.todense()
    weights = np.asarray(row, dtype=float).ravel()

    # argpartition raised when n exceeded the number of documents
    n = min(n, weights.size)
    if n <= 0:
        return []

    # Stable sort so ties keep corpus order
    ranked = np.argsort(-weights, kind='stable')[:n]

    # The previous version filtered `content` by membership in the index set,
    # which returned the right documents but in corpus order, discarding the
    # ranking; index by rank instead.
    docs = list(content)
    return [docs[i] for i in ranked]

In [None]:
print('Extracting the full theta matrix. This can be slow!')
# Topics on the rows, documents on the columns.
# num_terms pins the row count to the number of topics. Without it the height
# is inferred from the largest topic id that appears in the output, so the
# matrix can come out shorter than the topic count when the last topics are
# filtered out of every document, and indexing those topics then fails.
theta = gensim.matutils.corpus2csc(lda[corpus], num_terms=lda.num_topics)

In [None]:
# Fetch the documents top_documents() selects for a topic; the second argument is the topic id (0 here).
top_docs = top_documents(df['Hlead'],0, theta)
# NOTE(review): index 2 is the third entry of the returned list, not
# necessarily the highest-ranked document - confirm the intended index.
print(top_docs[2])

## Topic assignment frequency

Next, I like to extract an overall measure of the **importance** of a topic to a corpus. Here, we can define a function to extract the topic assignment frequencies:

In [None]:
# Function to return the topic frequency
def topic_frequency(model, corpus, proportion = True, LOG_EVERY_N = 1000):
    """Estimate how many words of the corpus are assigned to each topic.

    For every document, its topic proportions are multiplied by the document's
    word count and rounded; the per-document counts are then summed per topic.
    Returns the totals as a list, normalised to sum to one when `proportion`
    is true. Progress is printed every `LOG_EVERY_N` documents.
    """
    # Topic distribution of every document, with no topic filtered out
    doc_topics = model.get_document_topics(corpus, minimum_probability = 0)

    # Word count of every document (sum of the bag-of-words counts)
    doc_lengths = [sum(pair[1] for pair in bow) for bow in corpus]

    print('Extracting topic assignments for each token...')

    per_doc_counts = []
    for doc_no, distribution in enumerate(doc_topics):
        length = doc_lengths[doc_no]
        # Expected number of this document's words per topic
        per_doc_counts.append([round(item[1] * length) for item in distribution])

        if doc_no % LOG_EVERY_N == 0:
            print('Finished processing %s documents' % doc_no)

    # Documents on the rows, topics on the columns; add up each column
    totals = np.array(per_doc_counts).sum(axis = 0)

    if not proportion:
        return totals.tolist()
    return (totals / np.sum(totals)).tolist()

We can now get the assignment proportions by calling the function. 

In [None]:
# Extract topic assignments: one proportion per topic (proportion defaults to True).
assignments = topic_frequency(lda, corpus)

The loop below prints the assignment proportions to give a sense of topic prevalence.

In [None]:
# One line per topic: its id and its share of the corpus, rounded to three decimals.
for i, row in enumerate(assignments):
    print('Topic %s = %s' % (i, round(row, 3)))

## Topic "quality"

### Semantic coherence

In [None]:
# Pull out and tokenize the topic keywords
# (each row of `keywords` is [topic_id, space-joined keyword string]).
topics = [row[1].split(' ') for row in keywords]

# Estimate coherence. The 'u_mass' and 'c_v'
# methods are good to try.
# Here the keyword lists are scored directly (topics=...) rather than a model.
co = CoherenceModel(topics=topics,
                    texts=df['Hlead'],
                    dictionary=dictionary,
                    coherence='c_v')

# Extract semantic coherence for each topic
semantic = co.get_coherence_per_topic()

In [None]:
# One line per topic: its id and its coherence score.
for i, row in enumerate(semantic):
    print('Topic %s = %s' % (i, row))

In [None]:
# Overall model coherence, taken as the mean of the per-topic scores.
print('Model coherence = %s' % np.mean(semantic))

## Writing the results to disk

In [None]:
# `ex` (per-topic exclusivity) is not computed anywhere in this notebook, so on
# a fresh kernel this cell used to stop with NameError. Use it when it has been
# defined; otherwise leave the column blank so the CSV keeps the same layout.
try:
    exclusivity = ex
except NameError:
    exclusivity = [''] * len(topics)

# Header row first, then one row per topic
topics_to_write = [['id', 'assignment_prop', 'coherence', 'exclusivity', 'keywords']]
for i, topic in enumerate(topics):
    topics_to_write.append([i, assignments[i], semantic[i], exclusivity[i], ' '.join(topic)])

# Write
# newline='' is what the csv module expects of the file object; without it
# extra blank lines appear between rows on Windows.
with open('xxxx', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows(topics_to_write)