# Task 1: Variational autoencoder based Topic Model 

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from spacy.lang.en import English
nlp = English()
import torch
from torch.utils.data import DataLoader
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## TODO: Import the data
Get ten year data from the downloaded file. And display top 5 rows.

In [3]:
print('reading raw data...')
! pip install -q kaggle
from google.colab import files
files.upload() #upload kaggle api token
! mkdir ~/.kaggle 
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
#! kaggle datasets list
! kaggle datasets download -d rowhitswami/nips-papers-1987-2019-updated

reading raw data...


Saving kaggle.json to kaggle.json
Downloading nips-papers-1987-2019-updated.zip to /content
 98% 104M/106M [00:01<00:00, 103MB/s]  
100% 106M/106M [00:01<00:00, 82.1MB/s]


In [5]:
!unzip /content/nips-papers-1987-2019-updated.zip -d train

Archive:  /content/nips-papers-1987-2019-updated.zip
  inflating: train/authors.csv       
  inflating: train/papers.csv        


In [6]:

papers = pd.read_csv(r'/content/train/papers.csv')

#taking 10 years of data from the csv
df = papers.query('year >= 1989 and year <= 1998')
df.reset_index(drop=True,inplace=True)
data = df['full_text']
data.dropna(inplace=True)
df.shape 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(result)


(1418, 5)

## TODO: Preprocessing
Apply preprocessing step (such as tokenization, remove stopwords, punctuations, words with word length less than three, min\_df/max\_df in [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to remove term with lower/higher document frequency)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import string
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re

tokens = []

for sent in data:

  # lower case
  sent = sent.lower() 

  #remove whitespaces
  sent_w = re.sub(r'\b\w{1,3}\b', '', sent)

  #remove punctuations
  sent_p = "".join([char for char in sent_w if char not in string.punctuation])

  #remove numbers
  sent_n =  re.sub(r'\d+', '', sent_p)

  #tokenisation
  words = word_tokenize(sent_n)

  #remove short words
  sent_b= ' '.join(word for word in words if len(word)>3)
  tokens.append(sent_b)





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
#remove stopwords and remove terms with lower/higher frequency
custom_vec = CountVectorizer( stop_words = 'english',min_df = 2,max_df = 25)
cwm = custom_vec.fit_transform(tokens)
print("Shape of Tokenizer")
print(cwm.shape)


Shape of Tokenizer
(1418, 20092)


## TODO: Split data
Separate your input data into training, validation, and testing subsets

In [9]:
vocab_size = len(custom_vec.vocabulary_)
vocab= cwm.toarray() 



In [10]:
from sklearn.model_selection import train_test_split
train_dl, test_dl = train_test_split(vocab, test_size=0.2)

train_dl,val_dl  = train_test_split(train_dl, test_size=0.25)


In [11]:
train_dl[0].shape #first batch shape is 1d but required 2d or 3d so need to reshape the array
  

(20092,)

In [12]:
#code to reshape
from pandas.core.internals.managers import new_block
from functools import reduce

def func(x,n):
    for i in range(2,n):
      if i*x == n:
        return i
        
def update(data):
  n =  len(data)

  factor = set(reduce(list.__add__,([i, n//i] for i in range(1, int(n**0.5) + 1) if n % i == 0)))

  factor = list(factor)
  factor.sort()

  len_f = int(len(factor)/2)

  new = func(factor[len_f],n)
  return new, factor[len_f]

new, new1 = update(test_dl)
test_new = test_dl.reshape(new, new1, len(test_dl[0]))

new, new1 = update(train_dl)
train_new = train_dl.reshape(new, new1, len(train_dl[0]))

new, new1 = update(val_dl)
val_new = val_dl.reshape(new, new1, len(val_dl[0]))

print(test_new.shape)
print(train_new.shape)
print(val_new.shape)

(4, 71, 20092)
(25, 34, 20092)
(4, 71, 20092)


## Gaussian VAE model

In [13]:
import torch
import torch.nn as nn
from torch.distributions import LogNormal, Dirichlet, Gamma, Laplace
from torch.distributions import kl_divergence

class EncoderModule(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.linear_layer_one = nn.Linear(vocab_size, hidden_size[0])
        self.linear_layer_two = nn.Linear(hidden_size[0], hidden_size[1])
        self.linear_layer_three = nn.Linear(hidden_size[1], hidden_size[2])
        
    def forward(self, inputs):
        activation = nn.LeakyReLU()
    
        hidden_layer_one = activation(self.linear_layer_one(inputs))
        hidden_layer_two = self.dropout(activation(self.linear_layer_two(hidden_layer_one)))
        hidden_layer_three = self.dropout(activation(self.linear_layer_three(hidden_layer_two)))
        return hidden_layer_three


class DecoderModule(nn.Module):
    def __init__(self, vocab_size, num_topics, dropout):
        super().__init__()
        self.topics_to_doc = nn.Linear(num_topics, vocab_size)
        self.batch_normalization = nn.BatchNorm1d(vocab_size, affine=False)
        
    def forward(self, inputs):
        log_softmax = nn.LogSoftmax(dim = 1)
        return log_softmax(self.batch_normalization(self.topics_to_doc(inputs)))


class EncoderToLogNormal(nn.Module):
    def __init__(self, hidden_size, num_topics):
        super().__init__()
        self.linear_mean = nn.Linear(hidden_size[2], num_topics)
        self.linear_var = nn.Linear(hidden_size[2], num_topics)
        self.batch_norm_mean = nn.BatchNorm1d(num_topics, affine=False)
        self.batch_norm_var = nn.BatchNorm1d(num_topics, affine=False)
    
    def forward(self, hidden):
        mean = self.batch_norm_mean(self.linear_mean(hidden))
        var = 0.5 * self.batch_norm_var(self.linear_var(hidden))
        dist = LogNormal(mean, var.exp())
        return dist
        

class VAE(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_topics, dropout, model_type, beta):
        super().__init__()
        self.encoder = EncoderModule(vocab_size, hidden_size, dropout)
        if model_type == 1:
            self.encoder_to_dist = EncoderToLogNormal(hidden_size, num_topics)
        self.decoder = DecoderModule(vocab_size, num_topics, dropout)
        self.beta = beta
        
    def forward(self, inputs):
        
        encoder_output = self.encoder(inputs)
        dist = self.encoder_to_dist(encoder_output)
        if self.training:
            dist_to_decoder = dist.rsample().to(inputs.device)
        else:
            dist_to_decoder = dist.mean.to(inputs.device)
        softmax = nn.Softmax(dim = 1)
        dist_to_decoder = softmax(dist_to_decoder)
        reconstructed_documents = self.decoder(dist_to_decoder)
        return reconstructed_documents, dist
    
    def loss(self, reconstructed, original, posterior): 
        if isinstance(posterior, LogNormal):
            loc = torch.zeros_like(posterior.loc)
            scale = torch.ones_like(posterior.scale)        
            prior = LogNormal(loc, scale)

        NLL = - torch.sum(reconstructed*original)
        KLD = torch.sum(kl_divergence(posterior, prior).to(reconstructed.device))
        loss_for_training = NLL + self.beta * KLD
        return NLL, KLD, loss_for_training
        

## Training Gaussian VAE 

In [14]:

def train_one_epoch(train_dl, model, optim, device):
    model.train()
    epoch_total_loss, epoch_nll_loss, epoch_kld_loss = [], [], []
    for batch in train_dl:
        batch_total_loss, batch_nll_loss, batch_kld_loss = train_one_batch(batch, model, optim, device)
        epoch_total_loss.append(batch_total_loss), epoch_nll_loss.append(batch_nll_loss), epoch_kld_loss.append(batch_kld_loss)
    loss ={'total_loss': torch.mean(torch.Tensor(epoch_total_loss)), 'nll_loss': torch.mean(torch.Tensor(epoch_nll_loss)), 'kld_loss': torch.mean(torch.Tensor(epoch_kld_loss))}
    return loss

def train_one_batch(batch, model, optim, device):
    docs = torch.from_numpy(batch.astype(np.float32)).to(torch.device(device))
    optim.zero_grad()
    out, posterior = model(docs)
    nll, kld, loss_for_training = model.loss(out, docs, posterior)
    loss = nll + kld
    loss_for_training.backward()
    optim.step()
    return loss.item(), nll.item(), kld.item()

def validate_one_epoch(val_dl, model, device):
    model.eval()
    epoch_total_loss, epoch_nll_loss, epoch_kld_loss = [], [], []
    for batch in val_dl:
        batch_total_loss, batch_nll_loss, batch_kld_loss = validate_one_batch(batch, model, device)
        epoch_total_loss.append(batch_total_loss), epoch_nll_loss.append(batch_nll_loss), epoch_kld_loss.append(batch_kld_loss)
    loss ={'total_loss': torch.mean(torch.Tensor(epoch_total_loss)), 'nll_loss': torch.mean(torch.Tensor(epoch_nll_loss)), 'kld_loss': torch.mean(torch.Tensor(epoch_kld_loss))}
    return loss

def validate_one_batch(batch, model, device):
    docs = torch.from_numpy(batch.astype(np.float32)).to(torch.device(device))
    out, posterior = model(docs)
    nll, kld, _ = model.loss(out, docs, posterior)
    loss = nll + kld
    return loss.item(), nll.item(), kld.item()

def fit(epochs, train_dl, val_dl, model, optim, device, path, writer):
    history = []
    for epoch in range(epochs):
        epoch_train_loss = train_one_epoch(train_dl, model, optim, device)
        epoch_validation_loss = validate_one_epoch(val_dl, model, device)
        writer.add_scalar("Loss/train", epoch_train_loss['total_loss'], epoch)
        writer.add_scalar("Loss/eval", epoch_validation_loss['total_loss'], epoch)
        log = {
                'epoch': epoch + 1,
                'train_loss': epoch_train_loss['total_loss'],
                'train_loss_nll': epoch_train_loss['nll_loss'],
                'train_loss_kld': epoch_train_loss['kld_loss'],
            }
        history.append(log)
        print(log)
    beta = model.decoder.topics_to_doc.weight.cpu().detach().T
    return beta, history

torch.manual_seed(3)
hidden_size = [512, 256, 128]
num_topics = 25
dropout = 0.2
epochs = 200

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Declaring model and optimizer
model = VAE(vocab_size, hidden_size, num_topics, dropout, 1, 3)
model = model.to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)

from torch.utils.tensorboard import SummaryWriter
path = '/content'
writer = SummaryWriter()

# Run, trainings
beta, history = fit(epochs, train_new, val_new, model, optim,device,path,writer)

{'epoch': 1, 'train_loss': tensor(53262.6836), 'train_loss_nll': tensor(52583.9648), 'train_loss_kld': tensor(678.7206)}
{'epoch': 2, 'train_loss': tensor(53106.), 'train_loss_nll': tensor(52474.7461), 'train_loss_kld': tensor(631.2589)}
{'epoch': 3, 'train_loss': tensor(52963.7969), 'train_loss_nll': tensor(52340.3867), 'train_loss_kld': tensor(623.4064)}
{'epoch': 4, 'train_loss': tensor(52784.6250), 'train_loss_nll': tensor(52173.8750), 'train_loss_kld': tensor(610.7527)}
{'epoch': 5, 'train_loss': tensor(52721.7734), 'train_loss_nll': tensor(52117.7266), 'train_loss_kld': tensor(604.0503)}
{'epoch': 6, 'train_loss': tensor(52680.6562), 'train_loss_nll': tensor(52082.1641), 'train_loss_kld': tensor(598.4908)}
{'epoch': 7, 'train_loss': tensor(52464.7617), 'train_loss_nll': tensor(51869.8633), 'train_loss_kld': tensor(594.8963)}
{'epoch': 8, 'train_loss': tensor(52383.6367), 'train_loss_nll': tensor(51784.2109), 'train_loss_kld': tensor(599.4254)}
{'epoch': 9, 'train_loss': tensor(52

## TODO: Implement Dirichlet VAE
Use Gaussian VAE model as reference which is given above and implement Dirichlet VAE by using Dirichlet distribution as a prior on the latent variables.

In [15]:
import torch
import torch.nn as nn
from torch.distributions import LogNormal, Dirichlet, Gamma, Laplace
from torch.distributions import kl_divergence
import tensorflow as tf

def prior(K, alpha):
    """
    Prior for the model.
    :K: number of categories
    :alpha: Hyper param of Dir
    :return: mean and variance tensors
    """
    # Approximate to normal distribution using Laplace approximation
    a = torch.Tensor(1, K).float().fill_(alpha)
    mean = a.log().t() - a.log().mean(1)
    var = ((1 - 2.0 / K) * a.reciprocal()).t() + (1.0 / K ** 2) * a.reciprocal().sum(1)
    return mean.t(), var.t() # Parameters of prior distribution after approximation


class EncoderModule(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.linear_layer_one = nn.Linear(vocab_size, hidden_size[0])
        self.linear_layer_two = nn.Linear(hidden_size[0], hidden_size[1])
        self.linear_layer_three = nn.Linear(hidden_size[1], hidden_size[2])
        
    def forward(self, inputs):
        activation = nn.LeakyReLU()
    
        hidden_layer_one = activation(self.linear_layer_one(inputs))
        hidden_layer_two = self.dropout(activation(self.linear_layer_two(hidden_layer_one)))
        hidden_layer_three = self.dropout(activation(self.linear_layer_three(hidden_layer_two)))
        return hidden_layer_three


class DecoderModule(nn.Module):
    def __init__(self, vocab_size, num_topics, dropout):
        super().__init__()
        self.topics_to_doc = nn.Linear(num_topics, vocab_size)
        self.batch_normalization = nn.BatchNorm1d(vocab_size, affine=False)
        
    def forward(self, inputs):
        log_softmax = nn.LogSoftmax(dim = 1)
        return log_softmax(self.batch_normalization(self.topics_to_doc(inputs)))


class EncoderToLogNormal(nn.Module):
    def __init__(self, hidden_size, num_topics,alpha):
        super().__init__()
        self.linear_mean = nn.Linear(hidden_size[2], num_topics)
        self.linear_var = nn.Linear(hidden_size[2], num_topics)
        self.batch_norm_mean = nn.BatchNorm1d(num_topics, affine=False)
        self.batch_norm_var = nn.BatchNorm1d(num_topics, affine=False)
    
    def forward(self, hidden):
        self.prior_mean, self.prior_var = map(nn.Parameter, prior(num_topics, alpha))
        #print('new aplpha' +alpha+) # 0.3 is a hyper param of Dirichlet distribution
        self.prior_logvar = nn.Parameter(self.prior_var.log())
        self.prior_mean.requires_grad = False
        self.prior_var.requires_grad = False
        self.prior_logvar.requires_grad = False
        dist = LogNormal(self.prior_mean, self.prior_var.exp())
        return dist

class VAE1(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_topics, dropout, model_type, beta,alpha):
        super().__init__()
        self.encoder = EncoderModule(vocab_size, hidden_size, dropout)
        if model_type == 1:
            self.encoder_to_dist = EncoderToLogNormal(hidden_size, num_topics,alpha)
        self.decoder = DecoderModule(vocab_size, num_topics, dropout)
        self.beta = beta
        
    def forward(self, inputs):
        
        encoder_output = self.encoder(inputs)
        dist = self.encoder_to_dist(encoder_output)
        if self.training:
            dist_to_decoder = dist.rsample().to(inputs.device)
        else:
            dist_to_decoder = dist.mean.to(inputs.device)
        softmax = nn.Softmax(dim = 1)
        dist_to_decoder = softmax(dist_to_decoder)
        reconstructed_documents = self.decoder(dist_to_decoder)
        return reconstructed_documents, dist
    
    def loss(self, reconstructed, original, posterior): 
        if isinstance(posterior, LogNormal):
            loc = torch.zeros_like(posterior.loc)
            scale = torch.ones_like(posterior.scale)        
            prior = LogNormal(loc, scale)

        NLL = - torch.sum(reconstructed*original)
        KLD = torch.sum(kl_divergence(posterior, prior).to(reconstructed.device))
        loss_for_training = NLL + self.beta * KLD
        return NLL, KLD, loss_for_training
        

## Train the Dirichlet VAE Model
Use Training Gaussian VAE as a reference code and train Dirichlet VAE Model.

In [16]:

def train_one_epoch(train_dl, model, optim, device):
    model.train()
    epoch_total_loss, epoch_nll_loss, epoch_kld_loss = [], [], []
    for batch in train_dl:
        batch_total_loss, batch_nll_loss, batch_kld_loss = train_one_batch(batch, model, optim, device)
        epoch_total_loss.append(batch_total_loss), epoch_nll_loss.append(batch_nll_loss), epoch_kld_loss.append(batch_kld_loss)
    loss ={'total_loss': torch.mean(torch.Tensor(epoch_total_loss)), 'nll_loss': torch.mean(torch.Tensor(epoch_nll_loss)), 'kld_loss': torch.mean(torch.Tensor(epoch_kld_loss))}
    return loss

def train_one_batch(batch, model, optim, device):
    docs = torch.from_numpy(batch.astype(np.float32)).to(torch.device(device))
    optim.zero_grad()
    out, posterior = model(docs)
    nll, kld, loss_for_training = model.loss(out, docs, posterior)
    loss = nll + kld
    loss_for_training.backward()
    optim.step()
    return loss.item(), nll.item(), kld.item()

def validate_one_epoch(val_dl, model, device):
    model.eval()
    epoch_total_loss, epoch_nll_loss, epoch_kld_loss = [], [], []
    for batch in val_dl:
        batch_total_loss, batch_nll_loss, batch_kld_loss = validate_one_batch(batch, model, device)
        epoch_total_loss.append(batch_total_loss), epoch_nll_loss.append(batch_nll_loss), epoch_kld_loss.append(batch_kld_loss)
    loss ={'total_loss': torch.mean(torch.Tensor(epoch_total_loss)), 'nll_loss': torch.mean(torch.Tensor(epoch_nll_loss)), 'kld_loss': torch.mean(torch.Tensor(epoch_kld_loss))}
    return loss

def validate_one_batch(batch, model, device):
    docs = torch.from_numpy(batch.astype(np.float32)).to(torch.device(device))
    out, posterior = model(docs)
    nll, kld, _ = model.loss(out, docs, posterior)
    loss = nll + kld
    return loss.item(), nll.item(), kld.item()

def fit(epochs, train_dl, val_dl, model, optim, device, path, writer):
    history = []
    for epoch in range(epochs):
        epoch_train_loss = train_one_epoch(train_dl, model, optim, device)
        epoch_validation_loss = validate_one_epoch(val_dl, model, device)
        writer.add_scalar("Loss/train", epoch_train_loss['total_loss'], epoch)
        writer.add_scalar("Loss/eval", epoch_validation_loss['total_loss'], epoch)
        log = {
                'epoch': epoch + 1,
                'train_loss': epoch_train_loss['total_loss'],
                'train_loss_nll': epoch_train_loss['nll_loss'],
                'train_loss_kld': epoch_train_loss['kld_loss'],
            }
        history.append(log)
        print(log)
    beta = model.decoder.topics_to_doc.weight.cpu().detach().T
    return beta, history

torch.manual_seed(3)
hidden_size = [512, 256, 128]
num_topics = 25
dropout = 0.2
epochs = 200

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def new_alphas(alpha):
  # Declaring model and optimizer
  model1 = VAE1(vocab_size, hidden_size, num_topics, dropout, 1, 3,alpha)
  model1 = model.to(device)
  optim = torch.optim.Adam(model.parameters(), lr=1e-3)

  from torch.utils.tensorboard import SummaryWriter
  path = '/content'
  writer = SummaryWriter()

  # Run, trainings
  beta, history = fit(epochs, train_new, val_new, model1, optim,device,path,writer)
  return beta

## TODO: Get 25 topics using Dirichlet VAE model with five different values for $\alpha\in\{0.1,0.5,1.0,2.0,10.0\}$.

In [17]:
v = custom_vec.vocabulary_

alpha = [0.1,0.5,1.0,2.0,10.0]

topic_list = [] 
for i in alpha:
  wfile = open("/content/sample_data/new_topics.txt", "w")

  wfile.write("-----------Top 10 words for New Alpha----------\n")

  new_beta = new_alphas(i)

  def get_Topics_DVAE(beta, vocab, path,num_topics,wfile):
    for topic in new_beta:
      wfile.write("-----------Topic-----------\n")
      indices = []
      lis = []
      s = topic.sort()
      indices = s.indices[-10:]
      indices = indices.tolist()

      for i in indices:  
        lis.append( [k for k, val in v.items() if val == i])
  
      wfile.write(str(lis)+"\n\n\n")
      topic_list.append(lis)

  get_Topics_DVAE(new_beta, vocab,path, num_topics,wfile)
  wfile.close()

{'epoch': 1, 'train_loss': tensor(44990.3203), 'train_loss_nll': tensor(44423.8594), 'train_loss_kld': tensor(566.4604)}
{'epoch': 2, 'train_loss': tensor(44904.6836), 'train_loss_nll': tensor(44344.4492), 'train_loss_kld': tensor(560.2332)}
{'epoch': 3, 'train_loss': tensor(45088.6289), 'train_loss_nll': tensor(44527.6641), 'train_loss_kld': tensor(560.9651)}
{'epoch': 4, 'train_loss': tensor(44771.1641), 'train_loss_nll': tensor(44209.0781), 'train_loss_kld': tensor(562.0801)}
{'epoch': 5, 'train_loss': tensor(44819.5195), 'train_loss_nll': tensor(44261.0547), 'train_loss_kld': tensor(558.4646)}
{'epoch': 6, 'train_loss': tensor(44934.2344), 'train_loss_nll': tensor(44379.0195), 'train_loss_kld': tensor(555.2210)}
{'epoch': 7, 'train_loss': tensor(44775.6484), 'train_loss_nll': tensor(44223.8633), 'train_loss_kld': tensor(551.7836)}
{'epoch': 8, 'train_loss': tensor(44684.0938), 'train_loss_nll': tensor(44131.1445), 'train_loss_kld': tensor(552.9487)}
{'epoch': 9, 'train_loss': tenso

# Task 2: Evaluation Measure

## TODO: Get Reference Corpus
The preprocessed data will be used as Reference Corpus to calculate topic coherence, diversity.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import string
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
# TODO: Import the data and get ten year data from the file.

print('reading raw data...')

data = pd.read_csv(r'/content/train/papers.csv')

#taking 10 years of data from the csv
df = data.query('year >= 1989 and year <= 1998')
df.reset_index(drop=True,inplace=True)
data = df['full_text']
data.dropna(inplace=True)
df.shape 
# TODO: Use the same preprocessing step that used for training the model 
#       (such as tokenization, remove stopwords, punctuations, words with word length less than three,
#       min_df/max_df in CountVectorizer to remove term with lower/higher document frequency)

cvectorizer = CountVectorizer(stop_words = 'english',min_df = 2,max_df = 25) 


tokens = []

for sent in data:

  # lower case
  sent = sent.lower() 

  #remove whitespaces
  sent_w = re.sub(r'\b\w{1,3}\b', '', sent)

  #remove punctuations
  sent_p = "".join([char for char in sent_w if char not in string.punctuation])

  #remove numbers
  sent_n =  re.sub(r'\d+', '', sent_p)

  #tokenisation
  words = word_tokenize(sent_n)

  #remove short words
  sent_b= ' '.join(word for word in words if len(word)>3)
  tokens.append(sent_b)

# TODO: Use fit_transform from CountVectorizer to get document-term matrix.
cvz = cvectorizer.fit_transform(tokens)
print("Shape of Tokenizer")
print(cwm.shape)

# Save it to csv file. This csv file will be used as refernece corpus for evaluation.


## TODO: Get topic coherence 
Definition for pointwise mutual information is:
    $$ p m i \equiv \log \left[\frac{p(x, y)}{p(x) p(y)}\right] $$
Whereas for normalized pointwise mutual information is:
    $$ n p m i \equiv \frac{p m i}{-\log p(x, y)} $$
NPMI must be found within a certain window size. Using a window size of 10, the window would move over the documents 10 word tokens at a time. The window size can be used here as a document. In other words, you need to find the word count in each document.

In [None]:
from collections import Counter
from math import log
# Calculate the coherence using the NPMI equation

def topic_coherence(ref_data, beta, n_top_words=10):
    """Returns topic coherence.
    """
    vocab = construct_vocab(data)

    for (w1, w2, pmi) in calculate_pmi(vocab):
        print("{}_{}: {:.3f}".format(w1, w2, pmi))
    
    
    return (pmi - log(joint_prob)) / -log(joint_prob)
    

def gen_bigrams(data, window_size=5):
    for idx in range(len(data)):
        window = data[idx: idx + window_size]
       
        if len(window) < 2:
            break
            
        w = window[0]
        for next_word in window[1:]:
            yield (w, next_word)
            

def construct_vocab(data):
    vocab = Counter()
    
    for (w1, w2) in gen_bigrams(data, window_size=10): # count 1gram & 2gram
        vocab.update([w1, w2, (w1, w2)])
    return vocab
        

def calculate_pmi(vocab):
    det = sum(vocab.values())
    
    for (w1, w2) in filter(lambda el: isinstance(el, tuple), vocab):
        p_a, p_b = float(vocab[w1]), float(vocab[w2])
        p_ab = float(vocab[(w1, w2)])
        
        yield (w1, w2, log((det * p_ab) / (p_a * p_b), 2))
    


## TODO: Get topic diversity
Diversity is how many unique words are in the top 10 words among all topics 

In [None]:
def _diversity(beta, num_topics, num_words):
    """Returns topic diversity.
    """
    n_unique = 
    diversity = 
    
    return diversity

## TODO: Find the topic quality for all α-values.

In [None]:
def get_topic_quality(beta, ref_data):
    """Returns topic quality.
    """
    data = pd.DataFrame()
    num_topics = beta.shape[0]
    num_words = beta.shape[1]
    diversity = _diversity(beta[:,:], num_topics, num_words)
    coherence = topic_coherence(ref_data,beta[:, :])
    
    # TODO: calculate topic quality
    quality = 
    print('Topic Quality is: {}'.format(quality))
    data['tq'] = quality
    return data

In [None]:
def evaluation(filename, dataToEval):
    lines = []
    with open(filename) as f:
        for line in f:
            lines.append(line[0:-1].split(sep=' '))
    beta = np.array(lines)
    ref_data = pd.read_csv(dataToEval)
    results = get_topic_quality(beta, ref_data)


if __name__ == "__main__":
    refCorpus = 
    topicsPath = 
    
    evaluation(topicsPath, refCorpus)

## TODO:  What relation between α and the topic quality do you find?