<a href="https://colab.research.google.com/github/Djensonsan/Information_Retrieval_Assignment_2/blob/main/LDA_Custom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/Djensonsan/Information_Retrieval_Assignment_2/blob/main/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Information Retrieval Assignment 2: LDA-Custom


## Runtime specs

In [None]:
!cat /proc/cpuinfo

In [None]:
!cat /proc/meminfo

## Imports

In [1]:
# Install your required packages here
!pip install pandas numpy matplotlib fsspec gcsfs dask
!pip install -q tqdm



In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import dask.dataframe as dd
from collections import OrderedDict
import random
from copy import deepcopy

from ast import literal_eval
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Mount google drive in colab:
from google.cloud import storage
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preprocessing

In [None]:
# Pycharm:
# data = pd.read_csv('data/news_dataset.csv')

In [79]:
# Colab:
data = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/news_dataset.csv')

### Exploration

In [None]:
data.info()

In [None]:
data.head(n=43)

In [None]:
# Use document 42 as running example
data.loc[42, 'content']

### Keep document content

In [None]:
data_content = data['content']

In [None]:
type(data_content)

In [None]:
data_content.head(n=3)

### Tokenization, Stemming and Lemmatization

In [None]:
tqdm.pandas()
# Note Jens: Might want to use Dask to speed things up. 
# When using Dask can't use tqdm as far as I know.

In [None]:
# There's NaN values in the dataset
data_content.dropna(inplace=True)

In [None]:
data_content.isna().any()

In [None]:
# Tokenization
data_content_tokenized = data_content.progress_apply(lambda x: nltk.word_tokenize(x))

In [None]:
# Remove words smaller than 3 characters
data_content_tokenized = data_content_tokenized.progress_apply(lambda x: [y for y in x if len(y)>2])

In [None]:
# Stemming and Lemmatization 
stemmer = SnowballStemmer("english")
data_content_stemmed = data_content_tokenized.progress_apply(lambda x: [stemmer.stem(WordNetLemmatizer().lemmatize(y)) for y in x])

In [None]:
# Remove Stopswords
stop_words = set(stopwords.words('english')) 
data_content_clean = data_content_stemmed.progress_apply(lambda x: [y for y in x if not y in stop_words])

In [None]:
data_content_clean.head()

In [None]:
# data_content_clean contain the cleaned 'content' column of the news dataset:
data_content_clean.to_csv('/content/drive/MyDrive/IR-Assignment-2/data/new_dataset_clean.csv')

# LDA

The following part contains our custom LDA implementation.

In [4]:
data_content_clean = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/new_dataset_clean.csv')

In [5]:
data_content_clean = data_content_clean['content']

In [6]:
data_content_clean.head()

0    ['washington', 'congression', 'republican', 'n...
1    ['bullet', 'shell', 'get', 'count', 'blood', '...
2    ['walt', 'disney', 'bambi', 'open', '1942', 'c...
3    ['death', 'may', 'great', 'equal', 'necessaril...
4    ['seoul', 'south', 'korea', 'north', 'korea', ...
Name: content, dtype: object

## Utility Functions

### Token Pre-processing Function

In [57]:
def get_freq_tokens(data, num_above=0, num_under=0, most_freq=0):
  '''Will return in how many documents each token appears.
  Args:
    data (series): series object holding lists of tokens.
    num_above (int): drop tokens who appear in less than num_above documents.
    num_under (float): drop tokens who appear in more than num_under * amount of documents.
    most_freq (int): return most_freq tokens.

  Returns:
    tokens_doc_freq (dict): dictionary with key = token and value = # documents token appears in.
  '''
  tokens_doc_freq = dict()
  for row in tqdm(data, "Creating Freq. Dict: "):
    doc_words = literal_eval(row)
    doc_words = set(doc_words)
    for word in doc_words:
      if word in tokens_doc_freq:
        tokens_doc_freq[word] += 1
      else:
        tokens_doc_freq[word] = 1
  
  if num_above and num_under:
    number_of_documents = len(data)
    tokens_doc_freq = {k: v for k,v in tokens_doc_freq.items() if v > num_above and v < number_of_documents*num_under}
  elif num_above:
    tokens_doc_freq = {k: v for k,v in tokens_doc_freq.items() if v > num_above}
  elif num_under:
    number_of_documents = len(data)
    tokens_doc_freq = {k: v for k,v in tokens_doc_freq.items() if v < number_of_documents*num_under}
  if sorted:
    tokens_doc_freq = OrderedDict(sorted(tokens_doc_freq.items(), key=lambda x: x[1], reverse=True)[:most_freq])
  return tokens_doc_freq

In [8]:
tokens_doc_freq = get_freq_tokens(data_content_clean, num_above=15, num_under=0.5, most_freq=10000)

HBox(children=(FloatProgress(value=0.0, max=141543.0), HTML(value='')))




In [82]:
tokens_doc_freq

OrderedDict([('make', 69513),
             ('state', 68353),
             ('report', 65768),
             ('first', 65478),
             ('could', 64995),
             ('last', 64716),
             ('two', 64661),
             ('even', 63514),
             ('get', 62448),
             ('becaus', 62138),
             ('call', 60523),
             ('presid', 60156),
             ('onli', 59755),
             ('take', 59489),
             ('work', 58110),
             ('day', 57906),
             ('mani', 57322),
             ('use', 56744),
             ('befor', 56671),
             ('includ', 56643),
             ('come', 56539),
             ('want', 54971),
             ('way', 54460),
             ('ani', 54266),
             ('told', 53924),
             ('trump', 53824),
             ('back', 53543),
             ('week', 52517),
             ('nation', 52486),
             ('go', 50920),
             ('made', 50625),
             ('show', 48960),
             ('may', 48805),
    

#### Sanity Check

In [9]:
len(tokens_doc_freq)

10000

In [10]:
tokens_doc_freq['trump']

53824

### BOW Function

In [61]:
def create_bow(data, tokens):
  ''' Create a bag of words for usage in LDA.
  Args:
    data (series): series object holding lists of tokens.
    tokens (list): list of tokens to use in bag of words.

  Returns:
    documents (list): bag of words, a list of dicts.
  '''
  documents = []
  for row in tqdm(data, "Creating BOW: "):
    doc_words = literal_eval(row)
    doc_bag = dict()
    for word in doc_words:
      # check if word in tokens 
      if word in doc_bag and word in tokens:
        doc_bag[word] += 1
      elif word in tokens:
        doc_bag[word] = 1
    documents.append(doc_bag)
  return documents

In [62]:
documents = create_bow(data_content_clean, tokens_doc_freq.keys())

HBox(children=(FloatProgress(value=0.0, description='Creating BOW: ', max=141543.0, style=ProgressStyle(descri…




#### Sanity Check

In [13]:
len(documents)

141543

In [14]:
# First document has 8 occurences of Trump (same as with library)
documents[0]

{'2010': 1,
 '2015': 1,
 '2017': 1,
 'access': 1,
 'acknowledg': 1,
 'act': 1,
 'administr': 13,
 'advoc': 1,
 'afford': 1,
 'alli': 1,
 'american': 1,
 'anger': 1,
 'annual': 1,
 'anoth': 1,
 'anticip': 2,
 'appeal': 4,
 'appropri': 3,
 'approv': 1,
 'aspect': 1,
 'assert': 1,
 'author': 2,
 'avoid': 1,
 'awkward': 1,
 'backlash': 1,
 'befor': 1,
 'behalf': 1,
 'big': 1,
 'billion': 2,
 'blando': 1,
 'boehner': 1,
 'branch': 5,
 'broad': 1,
 'capitol': 1,
 'care': 8,
 'cascad': 1,
 'case': 4,
 'caus': 2,
 'central': 1,
 'challeng': 1,
 'champion': 1,
 'chao': 1,
 'choos': 2,
 'circuit': 1,
 'collyer': 1,
 'columbia': 1,
 'come': 3,
 'comment': 1,
 'committe': 1,
 'complic': 1,
 'conceiv': 1,
 'concept': 1,
 'confid': 2,
 'congress': 5,
 'congression': 3,
 'consequ': 1,
 'conserv': 1,
 'consid': 1,
 'constitut': 3,
 'consum': 1,
 'contend': 1,
 'continu': 1,
 'control': 1,
 'cost': 2,
 'could': 8,
 'court': 2,
 'coverag': 1,
 'current': 1,
 'deal': 1,
 'decis': 2,
 'deduct': 1,
 'defen

### Word Encoder

In [86]:
class WordEncoder():
  def __init__(self):
    self.id_word_encoding = {}
    self.word_id_encoding = {}

  def encode(self, documents):
    ''' Encode the words as integers.
        Args:
          documents (list): bag of words, a list of dicts

        Returns:
          dummy (list): encoded bag of words, a list of dicts
    '''
    dummy = deepcopy(documents)
    word_id = 0
    tokens = []
    for doc in documents:
      tokens.extend(doc.keys())
    tokens_uq = set(tokens)
    del tokens
    for token in tokens_uq:
      self.id_word_encoding[word_id] = token
      self.word_id_encoding[token] = word_id
      word_id += 1
    del tokens_uq
    for index, doc in enumerate(tqdm(documents, "Encoding: ")):
        for word in doc.keys():
          word_id = self.word_id_encoding[word]
          word_freq = dummy[index].pop(word)
          dummy[index][word_id] = word_freq
    return dummy
  
  def decode(self, documents):
    ''' Decode the integers to words.
        Args:
          documents (list): encoded bag of words, a list of dicts

        Returns:
          dummy (list): decoded bag of words, a list of dicts
    '''
    dummy = deepcopy(documents)
    word_id = 0
    for index, doc in enumerate(tqdm(documents, "Decoding: ")):
      for word_id in doc.keys():
        word = self.id_word_encoding[word_id]
        word_freq = dummy[index].pop(word_id)
        dummy[index][word] = word_freq
    return dummy

#### Sanity Check

In [66]:
encoder = WordEncoder()
encoded_documents = encoder.encode(documents)
decoded_documents = encoder.decode(encoded_documents)

10000


HBox(children=(FloatProgress(value=0.0, description='Encoding: ', max=141543.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Decoding: ', max=141543.0, style=ProgressStyle(descriptio…




In [37]:
decoded_documents == documents

True

## Implementation

In [83]:
def LDA(documents, vocabulary, topics=20):
  documents_len = len(documents)
  # We'll only us the x most frequent tokens
  vocabulary_len = len(tokens_doc_freq.keys())
  beta = 1/topics
  alfa = 1/topics
  # number of assignments to topic k in document i
  n_i_k = np.zeros((documents_len, topics))
  # number of assignments, corpus wide, of word w to topic k
  m_w_k = np.zeros((topics, vocabulary_len))
  # will hold number of words in each document
  n_d = np.zeros((documents_len))
  # number of assignments to topic
  n_z = np.zeros((topics))

  # z will hold the topic matrix
  z = [[0 for _ in range(len(doc))] for doc in documents]

  for doc_id, doc in enumerate(tqdm(documents, "Initializing: ")):
    for word_id, word in enumerate(doc.keys()):
      z[doc_id][word_id] = random.randrange(0, topics, 1)
      word_topic = z[doc_id][word_id]
      # number of assignments of topic: word_topic in document: doc_id
      n_i_k[doc_id][word_topic] += 1
      # number of global assigments of word: word to topic: word_topic
      m_w_k[word_topic, word] += 1
      # total number of word assignments to topic
      n_z[word_topic] += 1
      # total number of words in document
      n_d[doc_id] += 1

  for iteration in tqdm(range(10), "Optimizing: "):
    for doc_id, doc in enumerate(documents):
      for word_id, word in enumerate(doc.keys()):
              # get the topic for word n in document m
              word_topic = z[doc_id][word_id]
              
              # decrement counts for word w with associated topic z
              n_i_k[doc_id][word_topic] -= 1 
              m_w_k[word_topic][word] -= 1
              n_z[word_topic] -= 1

              # sample new topic from a multinomial according to our formula
              p_d_t = (n_i_k[doc_id] + alfa) / (n_d[doc_id] - 1 + topics * alfa) 
              p_t_w = (m_w_k[:, word] + beta) / (n_z + vocabulary_len * beta)
              p_z = p_d_t * p_t_w
              p_z /= np.sum(p_z)
              new_z = np.random.multinomial(1, p_z).argmax()

              # set z as the new topic and increment counts
              z[doc_id][word_id] = new_z
              n_i_k[doc_id][new_z] += 1
              m_w_k[new_z][word] += 1
              n_z[new_z] += 1
  return m_w_k

In [87]:
tokens_doc_freq = get_freq_tokens(data_content_clean, num_above=15, num_under=0.5, most_freq=10000)
documents = create_bow(data_content_clean, tokens_doc_freq.keys())
encoder = WordEncoder()
encoded_documents = encoder.encode(documents)

HBox(children=(FloatProgress(value=0.0, description='Creating Freq. Dict: ', max=141543.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='Creating BOW: ', max=141543.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Encoding: ', max=141543.0, style=ProgressStyle(descriptio…




In [None]:
m_w_k = LDA(encoded_documents, tokens_doc_freq, topics=20)

HBox(children=(FloatProgress(value=0.0, description='Initializing: ', max=141543.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Optimizing: ', max=10.0, style=ProgressStyle(description_…

## Evaluation