<a href="https://colab.research.google.com/github/Djensonsan/Information_Retrieval_Assignment_2/blob/main/LDA_Custom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/Djensonsan/Information_Retrieval_Assignment_2/blob/main/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Information Retrieval Assignment 2: LDA-Custom


## Runtime specs

In [None]:
!cat /proc/cpuinfo

In [None]:
!cat /proc/meminfo

## Imports

In [None]:
# Install your required packages here
!pip install pandas numpy matplotlib fsspec gcsfs dask
!pip install -q tqdm

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import dask.dataframe as dd
from collections import OrderedDict
import random
from copy import deepcopy

from ast import literal_eval
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Mount google drive in colab:
from google.cloud import storage
from google.colab import drive
drive.mount('/content/drive')

## Preprocessing

In [None]:
# Pycharm:
# data = pd.read_csv('data/news_dataset.csv')

In [None]:
# Colab:
data = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/news_dataset.csv')

### Exploration

In [None]:
data.info()

In [None]:
data.head(n=43)

In [None]:
# Use document 42 as running example
data.loc[42, 'content']

### Keep document content

In [None]:
# The article body lives in the 'content' column (the same column inspected
# above and read back from the cleaned CSV in the LDA section).
data_content = data['content']

In [None]:
type(data_content)

In [None]:
data_content.head(n=3)

### Tokenization, Stemming and Lemmatization

In [None]:
tqdm.pandas()
# Note Jens: Might want to use Dask to speed things up. 
# When using Dask can't use tqdm as far as I know.

In [None]:
# There's NaN values in the dataset.
# Rebind instead of dropping in place: data_content is a column of `data`,
# and an in-place drop on it is not idempotent and can trigger pandas'
# SettingWithCopy warning.
data_content = data_content.dropna()

In [None]:
data_content.isna().any()

In [None]:
# Tokenization
data_content_tokenized = data_content.progress_apply(lambda x: nltk.word_tokenize(x))

In [None]:
# Remove words smaller than 3 characters
data_content_tokenized = data_content_tokenized.progress_apply(lambda x: [y for y in x if len(y)>2])

In [None]:
# Stemming and Lemmatization
# Build the stemmer and the lemmatizer once; creating a new WordNetLemmatizer
# for every single token is pure overhead.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
data_content_stemmed = data_content_tokenized.progress_apply(lambda x: [stemmer.stem(lemmatizer.lemmatize(y)) for y in x])

In [None]:
# Remove stopwords
# The tokens were already lemmatized and stemmed, so the raw stop-word list no
# longer matches every stop word (e.g. "because" is stemmed to "becaus").
# Add the stop words in their normalized form as well, using the same
# lemmatize -> stem pipeline as the previous cell.
stop_word_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words |= {stemmer.stem(stop_word_lemmatizer.lemmatize(word)) for word in stop_words}
data_content_clean = data_content_stemmed.progress_apply(lambda x: [y for y in x if y not in stop_words])

In [None]:
data_content_clean.head()

In [None]:
# data_content_clean contains the cleaned 'content' column of the news dataset:
data_content_clean.to_csv('/content/drive/MyDrive/IR-Assignment-2/data/new_dataset_clean.csv')

# LDA

In [None]:
iterations = 10
topicAmount = 20

The following part contains our custom LDA implementation.

In [None]:
data_content_clean = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/new_dataset_clean.csv')

In [None]:
data_content_clean = data_content_clean['content']

In [None]:
data_content_clean.head()

## Utility Functions

### Token Pre-processing Function

In [None]:
def get_freq_tokens(data, num_above=0, num_under=0, most_freq=0):
  '''Count in how many documents each token appears (document frequency).
  Args:
    data (iterable): one entry per document; each entry is either a list of
      tokens or the string representation of such a list (as read back from CSV).
    num_above (int): keep only tokens that appear in MORE than num_above
      documents. 0 disables this filter.
    num_under (float): keep only tokens that appear in FEWER than
      num_under * (number of documents) documents. 0 disables this filter.
    most_freq (int): keep only the most_freq most frequent tokens.
      0 keeps every token that passed the filters.

  Returns:
    tokens_doc_freq (OrderedDict): key = token, value = # documents the token
      appears in, ordered by descending document frequency (ties are ordered
      alphabetically so the result is reproducible between runs).
  '''
  tokens_doc_freq = dict()
  number_of_documents = 0
  for row in tqdm(data, "Creating Freq. Dict: "):
    number_of_documents += 1
    # Rows coming from CSV are strings such as "['a', 'b']"; parsed lists are used as-is.
    doc_words = literal_eval(row) if isinstance(row, str) else row
    # A set makes every token count at most once per document.
    for word in set(doc_words):
      tokens_doc_freq[word] = tokens_doc_freq.get(word, 0) + 1

  if num_above:
    tokens_doc_freq = {k: v for k, v in tokens_doc_freq.items() if v > num_above}
  if num_under:
    max_documents = number_of_documents * num_under
    tokens_doc_freq = {k: v for k, v in tokens_doc_freq.items() if v < max_documents}

  # Sorting on (-count, token) gives a deterministic order; sorting on the count
  # alone would leave ties in set-iteration order, which changes between runs
  # and therefore changes which tokens survive the most_freq cut-off.
  ranked = sorted(tokens_doc_freq.items(), key=lambda x: (-x[1], x[0]))
  if most_freq:
    ranked = ranked[:most_freq]
  return OrderedDict(ranked)

In [None]:
tokens_doc_freq = get_freq_tokens(data_content_clean, num_above=15, num_under=0.5, most_freq=10000)

In [None]:
tokens_doc_freq

#### Sanity Check

In [None]:
len(tokens_doc_freq)

In [None]:
tokens_doc_freq['trump']

### BOW Function

In [None]:
def create_bow(data, tokens):
  ''' Create a bag of words for usage in LDA.
  Args:
    data (iterable): one entry per document; each entry is either a list of
      tokens or the string representation of such a list (as read back from CSV).
    tokens (iterable): tokens to keep in the bag of words; every other
      token is ignored.

  Returns:
    documents (list): bag of words, a list of dicts (one per document, in
      input order) mapping token -> number of occurrences in that document.
  '''
  # Convert once so the membership test stays O(1) even when a list is passed.
  allowed_tokens = frozenset(tokens)
  documents = []
  for row in tqdm(data, "Creating BOW: "):
    doc_words = literal_eval(row) if isinstance(row, str) else row
    doc_bag = dict()
    for word in doc_words:
      if word in allowed_tokens:
        doc_bag[word] = doc_bag.get(word, 0) + 1
    documents.append(doc_bag)
  return documents

In [None]:
documents = create_bow(data_content_clean, tokens_doc_freq.keys())

#### Sanity Check

In [None]:
len(documents)

In [None]:
# First document has 8 occurrences of Trump (same as with library)
documents[0]

### Word Encoder

In [None]:
class WordEncoder():
  ''' Two-way mapping between tokens and integer ids for bag-of-words documents.

  The mapping is (re)built by encode(); decode() and decodeWord() use the
  mapping of the most recent encode() call.
  '''
  def __init__(self):
    # id -> token
    self.id_word_encoding = {}
    # token -> id
    self.word_id_encoding = {}

  def encode(self, documents):
    ''' Encode the words as integers.
        Ids are handed out in order of first appearance, so the same corpus
        always yields the same encoding. Any mapping from a previous call is
        discarded. The input documents are not modified.
        Args:
          documents (list): bag of words, a list of dicts

        Returns:
          dummy (list): encoded bag of words, a list of dicts
    '''
    # Start from a clean mapping; otherwise a second call would leave stale
    # tokens behind whose ids collide with the new ones.
    self.id_word_encoding = {}
    self.word_id_encoding = {}
    for doc in documents:
      for token in doc:
        if token not in self.word_id_encoding:
          word_id = len(self.word_id_encoding)
          self.word_id_encoding[token] = word_id
          self.id_word_encoding[word_id] = token

    dummy = []
    for doc in tqdm(documents, "Encoding: "):
      dummy.append({self.word_id_encoding[word]: word_freq for word, word_freq in doc.items()})
    return dummy

  def decode(self, documents):
    ''' Decode the integers to words.
        The input documents are not modified.
        Args:
          documents (list): encoded bag of words, a list of dicts

        Returns:
          dummy (list): decoded bag of words, a list of dicts

        Raises:
          KeyError: if a document contains an id that was never handed out.
    '''
    dummy = []
    for doc in tqdm(documents, "Decoding: "):
      dummy.append({self.id_word_encoding[word_id]: word_freq for word_id, word_freq in doc.items()})
    return dummy

  def decodeWord(self, word_id):
    ''' Return the token that belongs to a single integer id. '''
    return self.id_word_encoding[word_id]

#### Sanity Check

In [None]:
encoder = WordEncoder()
encoded_documents = encoder.encode(documents)
decoded_documents = encoder.decode(encoded_documents)

In [None]:
decoded_documents == documents

## Implementation

In [None]:
class LDAmodel:
  ''' Latent Dirichlet Allocation trained with collapsed Gibbs sampling.

  Every token occurrence in the corpus carries one topic assignment. The
  sampler repeatedly removes a token from the count tables, draws a new topic
  for it proportional to

      (n_i_k[doc, k] + alfa) * (m_w_k[k, word] + beta) / (n_z[k] + V * beta)

  and adds the token back under the new topic.
  '''
  def __init__(self, documents, vocabulary, topics=topicAmount, seed=None):
    ''' Set up the count tables and assign a random topic to every token.
        Args:
          documents (list): encoded bag of words, a list of dicts mapping
            integer word id -> number of occurrences in the document.
          vocabulary (sized): the vocabulary; only its length is used, to size
            the topic-word table.
          topics (int): number of topics.
          seed (int or None): seed for the random number generator; pass a
            value to make initialisation and sampling reproducible.

        Raises:
          ValueError: if topics is smaller than 1.
    '''
    if topics < 1:
      raise ValueError("topics must be a positive integer")
    documents_len = len(documents)
    # Size the table from the vocabulary that was passed in (not from a notebook
    # global), but never smaller than the largest word id that actually occurs.
    max_word_id = max((word for doc in documents for word in doc), default=-1)
    self.vocabulary_len = max(len(vocabulary), max_word_id + 1)
    self.beta = 1 / topics
    self.alfa = 1 / topics
    # number of assignments to topic k in document i
    self.n_i_k = np.zeros((documents_len, topics))
    # number of assignments, corpus wide, of word w to topic k
    self.m_w_k = np.zeros((topics, self.vocabulary_len))
    # will hold number of words (token occurrences) in each document
    self.n_d = np.zeros((documents_len))
    # number of assignments to topic
    self.n_z = np.zeros((topics))

    self.topics = topics
    self.documents = documents
    # One generator for initialisation and sampling, so a single seed
    # reproduces the whole run.
    self.rng = np.random.default_rng(seed)

    # Expand every bag of words into one word id per token occurrence, so a
    # word that occurs 8 times in a document contributes 8 topic assignments
    # instead of 1.
    self.words = [[word for word, count in doc.items() for _ in range(int(count))]
                  for doc in documents]
    # z holds the topic of every token, parallel to self.words
    self.z = [[0] * len(doc_words) for doc_words in self.words]

    for doc_id, doc_words in enumerate(tqdm(self.words, "Initializing: ")):
      for position, word in enumerate(doc_words):
        word_topic = int(self.rng.integers(self.topics))
        self.z[doc_id][position] = word_topic
        # number of assignments of topic: word_topic in document: doc_id
        self.n_i_k[doc_id, word_topic] += 1
        # number of global assignments of word: word to topic: word_topic
        self.m_w_k[word_topic, word] += 1
        # total number of word assignments to topic
        self.n_z[word_topic] += 1
        # total number of words in document
        self.n_d[doc_id] += 1

  def runLDA(self, iterations=10):
    ''' Run Gibbs sampling sweeps over every token of the corpus.
        Args:
          iterations (int): number of full sweeps over the corpus.
    '''
    beta_norm = self.vocabulary_len * self.beta
    for iteration in tqdm(range(iterations), "Optimizing: "):
      for doc_id, doc_words in enumerate(self.words):
        doc_topic_counts = self.n_i_k[doc_id]  # view: updates write through to n_i_k
        doc_assignments = self.z[doc_id]
        for position, word in enumerate(doc_words):
          # get the topic for this token
          word_topic = doc_assignments[position]

          # decrement counts for word w with associated topic z
          doc_topic_counts[word_topic] -= 1
          self.m_w_k[word_topic, word] -= 1
          self.n_z[word_topic] -= 1

          # sample new topic from a multinomial according to our formula.
          # The document-length denominator (n_d - 1 + K * alfa) is the same
          # for every topic and cancels in the normalisation, so it is omitted.
          p_z = (doc_topic_counts + self.alfa) * (self.m_w_k[:, word] + self.beta) / (self.n_z + beta_norm)
          p_z /= np.sum(p_z)
          new_z = int(self.rng.multinomial(1, p_z).argmax())

          # set z as the new topic and increment counts
          doc_assignments[position] = new_z
          doc_topic_counts[new_z] += 1
          self.m_w_k[new_z, word] += 1
          self.n_z[new_z] += 1

  def getTopicsPerDocument(self):
    ''' Return the (documents x topics) table of topic assignment counts. '''
    return self.n_i_k

  def getWordsPerTopic(self):
    ''' Return the (topics x vocabulary) table of word assignment counts. '''
    return self.m_w_k

In [None]:
tokens_doc_freq = get_freq_tokens(data_content_clean, num_above=15, num_under=0.5, most_freq=10000)
documents = create_bow(data_content_clean, tokens_doc_freq.keys())
encoder = WordEncoder()
encoded_documents = encoder.encode(documents)

In [None]:
# Train with the configured number of topics and iterations; the output files
# written below are labelled with `iterations`, so the run has to use it too.
LDA = LDAmodel(encoded_documents, tokens_doc_freq, topics=topicAmount)
LDA.runLDA(iterations=iterations)

# topic x word and document x topic assignment counts
m_w_k = LDA.getWordsPerTopic()
n_i_k = LDA.getTopicsPerDocument()

## Evaluation

In [None]:
# List the highest-weighted words of every topic and save them to CSV.
top_word_count = 10
# Derive the number of topics from the model output instead of hard-coding 20.
topic_count = m_w_k.shape[0]
topicList = ["Topic "+str(i) for i in range(topic_count)]

top_words_by_topic = {}
for i in range(topic_count):
  topic_total = np.sum(m_w_k[i])
  # Stable sort on the negated counts: descending weight, ties by word id.
  # Slicing also copes with a vocabulary of fewer than top_word_count words.
  indices = np.argsort(-m_w_k[i], kind="stable")[:top_word_count]
  topic = ""
  topTopicWords = []
  for ind in indices:
    word = encoder.decodeWord(int(ind))
    # A topic without any assignment has weight 0 for every word (avoids 0/0).
    weight = m_w_k[i][ind] / topic_total if topic_total else 0.0
    topic = topic + str(weight) + "*\"" + word + "\"  "
    topTopicWords.append(word)
  top_words_by_topic["Topic "+ str(i)] = topTopicWords
  print('Topic: {} \nWords: {}\n\n'.format(i, topic))

df = pd.DataFrame(top_words_by_topic, columns=topicList)

df.to_csv('/content/drive/MyDrive/IR-Assignment-2/data/analysis_topics'+ str(iterations) +'.csv')

In [None]:
# Decide the most representative documents for each topic and write it to file
top_document_count = 100
# Derive the number of topics from the model output instead of hard-coding 20.
topic_count = n_i_k.shape[1]

repList = [[] for _ in range(topic_count)]

for doc in tqdm(range(len(n_i_k)), "Scoring: "):
  doc_total = n_i_k[doc].sum()
  # A document without any assignment has no topic distribution; skip it
  # instead of scoring it with the previous document's values.
  if doc_total == 0:
    continue
  scores = n_i_k[doc] / doc_total
  for topic in range(topic_count):
    if scores[topic] > 0:
      repList[topic].append((doc, scores[topic]))

# Highest share of the topic first
for topic in repList:
  topic.sort(key=lambda x: x[1], reverse=True)
# Only a preview: the full list holds almost every document of the corpus.
print(repList[0][:10])

topicList = ["Topic "+str(i) for i in range(topic_count)]

# Slicing already handles topics with fewer than top_document_count documents.
# Building the frame from Series pads shorter columns with <NA>, and the
# nullable Int64 dtype keeps the document ids integers despite the padding.
df = pd.DataFrame(
  {
    name: pd.Series([x[0] for x in ranked[:top_document_count]], dtype="Int64")
    for name, ranked in zip(topicList, repList)
  },
  columns=topicList,
)

print(df)

df.to_csv('/content/drive/MyDrive/IR-Assignment-2/data/topic_document_rank_custom'+ str(iterations) +'.csv')



In [None]:
### see how divided a document is between topics and write information to file:
sample_document_count = 10

# The context manager closes the file even if writing fails half-way.
with open("/content/drive/MyDrive/IR-Assignment-2/data/analysis_topic_per_doc"+ str(iterations) +".txt","w") as file:
  # Never index past the end when the corpus has fewer documents than the sample size.
  for i in range(min(sample_document_count, len(n_i_k))):
    doc_total = n_i_k[i].sum()
    # A document without any assignment gets all-zero scores instead of 0/0.
    scores = n_i_k[i] / doc_total if doc_total else np.zeros_like(n_i_k[i])
    file.write("doc " + str(i) + " topics: \n")
    for score in sorted(scores, reverse=True):
      file.write(str(score) + "   ")
    file.write("\n\n")