In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install cvae

In [2]:
# %pip installs into the running kernel's environment; pinned to the version the recorded output shows.
%pip install --user nltk==3.6.3

Collecting nltk
  Downloading nltk-3.6.3-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.4 MB/s 
Installing collected packages: nltk
Successfully installed nltk-3.6.3


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install sentence-transformers

In [11]:

import torch.nn as nn
import torch.nn.functional as F
import torch.utils
import torch.distributions
import torchvision
import numpy as np
import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200
from torch.utils.data import Dataset, TensorDataset,DataLoader
from sklearn.cluster import DBSCAN
# import hdbscan
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from cvae import cvae
from sklearn.cluster import KMeans
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.coherencemodel import CoherenceModel
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from pprint import pprint
# Download the 'punkt' package; presumably required by the nltk.sent_tokenize call
# in prepare_medium_articles — TODO confirm.
nltk.download('punkt')


# Use the GPU when torch reports one, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other visible cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# from sklearn.datasets import fetch_20newsgroups
# Keep only the 'data' entry of the 20 Newsgroups result requested with subset='all'.
# NOTE(review): `news` is not used by any later visible cell — confirm it is still needed.
news = fetch_20newsgroups(subset='all')['data']

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Sentence encoder passed to TopicModelling further down, which only calls its .encode() method.
tm = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [6]:
class TopicModelling:
  """Topic model built from sentence embeddings.

  Pipeline: encode the documents, compress the embeddings with a
  CompressionVAE, cluster the compressed vectors with KMeans, then describe
  every cluster by its highest class-based TF-IDF words.

  Typical use: call ``train()`` to assign a topic label to each document,
  then ``get_topics()`` for the top words and the size of each topic.
  """

  def __init__(self,data,sentence_transformer):
    """Store the documents and the encoder.

    Args:
      data: list of strings; validated by ``valid_input`` when ``train`` runs.
      sentence_transformer: object whose
        ``encode(data, show_progress_bar=True)`` returns the embeddings.
    """
    self.data = data
    self.model = sentence_transformer

  def valid_input(self):
    """Check that ``self.data`` is a non-empty list of strings.

    Raises:
      ValueError: if the data is not a list, is empty, or holds a non-string.
    """
    if not isinstance(self.data,list):
      raise ValueError("The data passed to the model should be an array of strings")
    if len(self.data) < 1:
      raise ValueError("The data list is empty")
    # Check every element, not only the first: a stray non-string further down
    # would otherwise surface much later (e.g. in the ' '.join of get_topics).
    if not all(isinstance(doc,str) for doc in self.data):
      raise ValueError("The data passed to the model should be an array of strings")

  def reduce_dimensions(self,size,embeddings):
    """Train a ``cvae.CompressionVAE`` on ``embeddings`` and embed them.

    Args:
      size: latent dimension (passed as ``dim_latent``).
      embeddings: the vectors to compress.

    Returns:
      The result of ``embedder.embed(embeddings)``.
    """
    embedder = cvae.CompressionVAE(embeddings,dim_latent=size)
    embedder.train()
    z = embedder.embed(embeddings)

    return z

  def clusterKmeans(self,n_clusters,data):
    """Fit KMeans (``random_state=0``) on ``data`` and return the fitted estimator."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
    return kmeans

  def class_based_tf_idf(self,documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores.

    Args:
      documents: one string per class (``get_topics`` passes all documents of
        a topic joined together).
      m: number used as the numerator of the IDF term (``get_topics`` passes
        the total number of original documents).
      ngram_range: forwarded to ``CountVectorizer``.

    Returns:
      ``(tf_idf, count)``: ``tf_idf`` has one row per term and one column per
      class; ``count`` is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    # A class with no counted words has w == 0; keep its term frequencies at 0
    # instead of letting 0/0 produce NaN, which would poison the argsort ranking.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

  def extract_top_n_words_per_topic(self,tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring words of every topic.

    Args:
      tf_idf: score matrix with one row per term and one column per topic.
      count: fitted vectorizer providing the vocabulary.
      docs_per_topic: DataFrame with a ``Topic`` column, one row per topic, in
        the same order as the columns of ``tf_idf``.
      n: number of words to keep per topic.

    Returns:
      dict mapping topic label to a list of ``(word, score)`` tuples, best first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    # and fall back for older releases that do not have it yet.
    if hasattr(count, "get_feature_names_out"):
      words = count.get_feature_names_out()
    else:
      words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    # argsort is ascending, so the last n indices are the best; reversed below.
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

  def extract_topic_sizes(self,df):
    """Count the documents per topic.

    Args:
      df: DataFrame with ``Topic`` and ``Doc`` columns.

    Returns:
      DataFrame with columns ``Topic`` and ``Size``, largest topic first.
    """
    topic_sizes = (df.groupby(['Topic'])
                     .Doc
                     .count()
                     .reset_index()
                     .rename({"Doc": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes

  def visualize_2d_kmeans(self,embedings,n_clusters):
    """Compress ``embedings`` to two dimensions, cluster them and scatter-plot the result.

    Points are coloured by their KMeans label. Nothing is returned.
    """
    z = self.reduce_dimensions(2,embedings)
    cluster = self.clusterKmeans(n_clusters,z)
    kmeans_result = pd.DataFrame(z, columns=['x', 'y'])
    kmeans_result['labels'] = cluster.labels_
    # Draw on the axes created here rather than on pyplot's implicit current axes.
    fig, ax = plt.subplots(figsize=(20, 10))
    points = ax.scatter(kmeans_result.x, kmeans_result.y, c=kmeans_result.labels, s=0.05, cmap='hsv_r')
    fig.colorbar(points, ax=ax)

  def train(self,reduced_dim_size,n_clusters):
    """Embed, compress and cluster the documents.

    Sets ``self.n_clusters``, ``self.embedings`` and ``self.labels``.

    Args:
      reduced_dim_size: latent dimension used by ``reduce_dimensions``.
      n_clusters: number of KMeans clusters, i.e. number of topics.

    Returns:
      The KMeans labels (``cluster.labels_``).

    Raises:
      ValueError: if ``self.data`` fails ``valid_input``.
    """
    self.n_clusters = n_clusters
    self.valid_input()
    self.embedings = self.model.encode(self.data, show_progress_bar=True)
    z = self.reduce_dimensions(reduced_dim_size,self.embedings)
    cluster = self.clusterKmeans(n_clusters,z)
    self.labels = cluster.labels_
    return cluster.labels_


  def get_topics(self, n_words=20):
    """Describe the topics found by ``train``.

    Args:
      n_words: number of top words to keep per topic.

    Returns:
      ``(top_n_words, topic_sizes)``: the dict produced by
      ``extract_top_n_words_per_topic`` and the DataFrame produced by
      ``extract_topic_sizes``.

    Raises:
      AttributeError: if ``train`` has not been called yet.
    """
    if not hasattr(self, "labels"):
      raise AttributeError("No topic labels found: call train() before get_topics()")
    docs_df = pd.DataFrame(self.data, columns=["Doc"])
    docs_df['Topic'] = self.labels
    # One long pseudo-document per topic: the "class" of class-based TF-IDF.
    docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})
    tf_idf, count = self.class_based_tf_idf(docs_per_topic.Doc.values, m=len(self.data))
    top_n_words = self.extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=n_words)
    topic_sizes = self.extract_topic_sizes(docs_df)
    return top_n_words,topic_sizes

  def get_keywords_by_topic(self,top_words,word_dict,n_keywords=10):
    """Return, per topic, the top words that the dictionary knows.

    Args:
      top_words: dict as returned by ``get_topics``; must have an entry for
        every topic id in ``range(self.n_clusters)``.
      word_dict: object with a ``token2id`` mapping; words that are missing
        from it are dropped.
      n_keywords: how many of the top words of each topic are considered
        before filtering.

    Returns:
      List of keyword lists, indexed by topic id.
    """
    topic_keywords =[]
    for i in range(self.n_clusters):
      keywords = [x[0] for x in top_words[i][:n_keywords] if x[0] in word_dict.token2id]
      topic_keywords.append(keywords)
    return topic_keywords

  # Original (misspelled) method name, kept so existing callers keep working.
  get_keyords_by_topic = get_keywords_by_topic
  

  



## Medium Articles

In [7]:
# Raw articles table, loaded here for inspection.
# NOTE(review): prepare_medium_articles() reads "articles.csv" again on its own,
# so this frame is not the one that gets preprocessed.
medium_articles = pd.read_csv("articles.csv")

In [8]:
# (rows, columns) of the raw articles table.
medium_articles.shape

(337, 6)

In [12]:
def prepare_medium_articles(path="articles.csv"):
  """Load the articles CSV and prepare it for topic modelling and coherence scoring.

  Steps applied to the ``text`` column: expand English contractions, drop every
  token that contains a digit, then build (a) a gensim dictionary and
  bag-of-words corpus from whitespace tokens of each article and (b) a flat
  list of sentences from all articles.

  Args:
    path: CSV file with a ``text`` column. Defaults to ``"articles.csv"``.

  Returns:
    ``(data, id2word, corpus)``: ``data`` is the list of sentences (via
    ``nltk.sent_tokenize``), ``id2word`` the ``corpora.Dictionary`` over the
    tokenized articles, ``corpus`` the ``doc2bow`` result for each article.
  """
  contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not",
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

  # Regex alternation takes the first alternative that matches, so longer keys
  # must come first; otherwise "he'd" wins over "he'd've" and leaves "'ve" behind.
  longest_first = sorted(contractions_dict, key=len, reverse=True)
  contractions_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in longest_first))
  # Any run of word characters containing at least one digit.
  digit_token_re = re.compile(r'\w*\d\w*')

  def expand_contractions(text):
    return contractions_re.sub(lambda match: contractions_dict[match.group(0)], text)

  medium_articles = pd.read_csv(path)
  # Missing texts become empty strings so the string operations below do not fail on NaN.
  articles = medium_articles["text"].fillna("").astype(str)
  articles = articles.apply(expand_contractions)
  # removing digits
  articles = articles.apply(lambda x: digit_token_re.sub('', x))

  tokenized_articles = articles.apply(lambda x: x.split())
  id2word = corpora.Dictionary(tokenized_articles)
  # Term document frequency, one bag-of-words per article
  corpus = [id2word.doc2bow(tokens) for tokens in tokenized_articles]

  # Flatten every article into its sentences, keeping article order.
  data = [sent for article in articles for sent in nltk.sent_tokenize(article)]

  return data, id2word, corpus



In [13]:
# d: sentences from all articles; worddict: gensim Dictionary over whitespace tokens;
# corpus: one bag-of-words per article.
d,worddict,corpus = prepare_medium_articles()

In [None]:
# Number of sentences extracted across all articles.
len(d)

32605

In [14]:
# Every sentence in d is treated as one document by the topic model.
model = TopicModelling(d,tm)

In [None]:
# train(reduced_dim_size, n_clusters):
#   reduced_dim_size=5 -> latent dimension the sentence embeddings are compressed to
#   n_clusters=5       -> number of KMeans clusters, i.e. the number of topics
model.train(5,5)

In [16]:
# top_words: {topic label: [(word, score), ...]}, best word first;
# topic_sizes: number of sentences per topic, largest topic first.
top_words,topic_sizes = model.get_topics()

Using the extracted topics to measure topic coherence

In [19]:
# u_mass coherence of each topic's keywords (top words that also appear in worddict),
# scored against the article-level bag-of-words corpus.
cm = CoherenceModel(topics=model.get_keyords_by_topic(top_words,worddict), corpus=corpus, dictionary=worddict, coherence='u_mass')
coherence = cm.get_coherence() 

In [20]:
# Best coherence so far: out of the box, this model's u_mass score (output below) is better
# than that of an unoptimised LDA model, which scored -1.69465075371047.
# NOTE(review): "better" is read here as higher, i.e. closer to 0 — confirm this is the
# intended interpretation of u_mass.
coherence

-0.427210028288895