<a href="https://colab.research.google.com/github/Djensonsan/Information_Retrieval_Assignment_2/blob/main/LDA-Custom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/Djensonsan/Information_Retrieval_Assignment_2/blob/main/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Information Retrieval Assignment 2: LDA


## Runtime specs

In [None]:
# Print the CPU details of the runtime (reads the Linux pseudo-file /proc/cpuinfo).
!cat /proc/cpuinfo

In [None]:
# Print the memory details of the runtime (reads the Linux pseudo-file /proc/meminfo).
!cat /proc/meminfo

## Imports

In [80]:
# Install your required packages here
!pip install pandas numpy matplotlib fsspec gcsfs dask
!pip install -q tqdm



In [88]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import dask.dataframe as dd
from collections import OrderedDict

from ast import literal_eval
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
import nltk
# NLTK resources needed by the preprocessing cells below:
nltk.download('wordnet')    # data for WordNetLemmatizer
nltk.download('punkt')      # tokenizer models for nltk.word_tokenize
nltk.download('stopwords')  # word list for stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [82]:
# Mount google drive in colab:
# The dataset paths used below all live under the mount point /content/drive.
# NOTE(review): `storage` is not used in the visible part of this notebook — confirm it is still needed.
from google.cloud import storage
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preprocessing

In [None]:
# Pycharm:
# data = pd.read_csv('data/news_dataset.csv')

In [None]:
# Colab:
# Load the raw news dataset from the mounted Google Drive (needs the drive.mount cell above).
data = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/news_dataset.csv')

### Exploration

In [None]:
# Overview of the raw dataset: columns, dtypes and non-null counts.
data.info()

In [None]:
# Show the first 43 rows, so that the running example (document 42) is included.
data.head(n=43)

In [None]:
# Use document 42 as running example: display its full 'content' text.
data.loc[42, 'content']

### Keep document content

In [None]:
# Only the article text is processed in the cells below.
data_content = data['content']

In [None]:
# Sanity check: selecting a single column yields a pandas Series.
type(data_content)

In [None]:
# Peek at the first three raw documents.
data_content.head(n=3)

### Tokenization, Stemming and Lemmatization

In [None]:
# Register tqdm with pandas so that progress_apply (used in the cells below) shows a progress bar.
tqdm.pandas()
# Note Jens: we might want to use Dask to speed things up,
# but as far as I know tqdm cannot be used together with Dask.

In [None]:
# There are NaN values in the dataset: drop them.
# Rebinding the result (instead of inplace=True) keeps this cell idempotent and
# avoids mutating a column that was selected from `data`.
data_content = data_content.dropna()

In [None]:
# Should display False now that the missing documents were dropped.
data_content.isna().any()

In [None]:
# Tokenization: split every document into a list of NLTK word tokens
data_content_tokenized = data_content.progress_apply(nltk.word_tokenize)

In [None]:
# Drop tokens shorter than 3 characters
data_content_tokenized = data_content_tokenized.progress_apply(
    lambda tokens: [token for token in tokens if len(token) >= 3]
)

In [None]:
# Lemmatization followed by stemming: every token is first lemmatized with
# WordNet and the lemma is then stemmed with the English Snowball stemmer.
# Both objects are created once, not once per token.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
data_content_stemmed = data_content_tokenized.progress_apply(
    lambda tokens: [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]
)

In [None]:
# Remove stop words.
# The tokens were already lemmatized and stemmed by the previous cell, so a raw
# NLTK stop word such as "because" no longer matches its processed form
# ("becaus"). Normalise the stop words the same way as the tokens and filter
# against both the raw and the normalised forms.
raw_stop_words = set(stopwords.words('english'))
stop_word_lemmatizer = WordNetLemmatizer()
stop_words = raw_stop_words | {
    stemmer.stem(stop_word_lemmatizer.lemmatize(word)) for word in raw_stop_words
}
data_content_clean = data_content_stemmed.progress_apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Peek at the first cleaned documents (lists of processed tokens).
data_content_clean.head()

In [None]:
# data_content_clean contains the cleaned 'content' column of the news dataset.
# Each token list is written to the CSV as its string representation, which is
# why the LDA part below parses the rows with literal_eval.
data_content_clean.to_csv('/content/drive/MyDrive/IR-Assignment-2/data/new_dataset_clean.csv')

# LDA

The following part contains our custom LDA implementation.

In [83]:
# Reload the cleaned dataset written by the preprocessing part.
# After the round trip through CSV every document is a string such as "['washington', ...]".
data_content_clean = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/new_dataset_clean.csv')

In [84]:
# Keep only the 'content' column; the name is rebound from the loaded DataFrame to that Series.
data_content_clean = data_content_clean['content']

In [85]:
# Peek at the reloaded documents (token lists stored as strings).
data_content_clean.head()

0    ['washington', 'congression', 'republican', 'n...
1    ['bullet', 'shell', 'get', 'count', 'blood', '...
2    ['walt', 'disney', 'bambi', 'open', '1942', 'c...
3    ['death', 'may', 'great', 'equal', 'necessaril...
4    ['seoul', 'south', 'korea', 'north', 'korea', ...
Name: content, dtype: object

## Token Pre-processing

In [86]:
def get_freq_tokens(data, num_above=0, num_under=0, most_freq=0):
  '''Count in how many documents each token appears, then filter and rank.

  Args:
    data (series): sized iterable of documents. Each document is either a list
      of tokens or the string representation of such a list (as read back
      from the CSV file).
    num_above (int): keep only tokens that appear in MORE than num_above
      documents. 0 disables this filter.
    num_under (float): keep only tokens that appear in FEWER than
      num_under * len(data) documents. 0 disables this filter.
    most_freq (int): keep only the most_freq most frequent tokens.
      0 keeps every token that passed the filters.

  Returns:
    tokens_doc_freq (OrderedDict): key = token and value = # documents the
      token appears in, ordered by decreasing document frequency. Ties are
      broken alphabetically so that the result is reproducible between runs.
  '''
  tokens_doc_freq = dict()
  for row in tqdm(data):
    # Rows loaded from CSV are strings such as "['a', 'b']": parse them back.
    doc_words = literal_eval(row) if isinstance(row, str) else row
    # Iterating over a set counts every token at most once per document.
    for word in set(doc_words):
      tokens_doc_freq[word] = tokens_doc_freq.get(word, 0) + 1

  if num_above:
    tokens_doc_freq = {k: v for k, v in tokens_doc_freq.items() if v > num_above}
  if num_under:
    max_documents = len(data) * num_under
    tokens_doc_freq = {k: v for k, v in tokens_doc_freq.items() if v < max_documents}

  # Highest document frequency first; the token itself is the tie-breaker.
  ranked = sorted(tokens_doc_freq.items(), key=lambda item: (-item[1], item[0]))
  # Only truncate when a limit was requested: the previous `if sorted:` test
  # was always true, so the default most_freq=0 returned an empty result.
  if most_freq:
    ranked = ranked[:most_freq]
  return OrderedDict(ranked)

In [90]:
# Vocabulary selection: keep tokens that appear in more than 15 documents and in
# fewer than 50% of all documents, then keep the 10000 most frequent of those.
tokens_doc_freq = get_freq_tokens(data_content_clean, num_above=15, num_under=0.5, most_freq=10000)

HBox(children=(FloatProgress(value=0.0, max=141543.0), HTML(value='')))




### Sanity Check

In [91]:
# Vocabulary size: at most the requested most_freq (10000) tokens.
len(tokens_doc_freq)

10000

In [92]:
# Number of documents that contain the token 'trump'.
tokens_doc_freq['trump']

53824

## BOW

In [None]:
def create_bow(data, tokens):
  ''' Create a bag of words for usage in LDA.

  Args:
    data (series): iterable of documents. Each document is either a list of
      tokens or the string representation of such a list (as read back from
      the CSV file).
    tokens (list): vocabulary, i.e. the tokens to use in the bag of words.
      Any iterable of tokens is accepted (e.g. dict keys).

  Returns:
    documents (list): bag of words, a list with one dict per document (same
      order as data) mapping token -> number of occurrences in that document.
      Tokens that are not part of the vocabulary are left out.
  '''
  # One-time conversion: constant-time membership tests even for a plain list.
  vocabulary = frozenset(tokens)
  documents = []
  for row in tqdm(data):
    # Rows loaded from CSV are strings such as "['a', 'b']": parse them back.
    doc_words = literal_eval(row) if isinstance(row, str) else row
    doc_bag = dict()
    for word in doc_words:
      # Skip out-of-vocabulary words; they were previously stored with count 1.
      if word in vocabulary:
        doc_bag[word] = doc_bag.get(word, 0) + 1
    documents.append(doc_bag)
  return documents

In [None]:
# Keep the bag of words in a variable instead of echoing the whole corpus into
# the cell output; display only the number of documents.
documents_bow = create_bow(data_content_clean, tokens_doc_freq.keys())
len(documents_bow)

## Evaluation

In [None]:
# Let's look at a test document: document 42
# 1) clearly see that the article is about the economy 40%
# 2) US politics 27%
# 3) US Justice Department 8%
# 4) US Congress 6%
# The (topic index, score) pairs of the document are printed from the highest
# to the lowest score.
# NOTE(review): `lda_model` and `bow_corpus` are not defined in the visible part
# of this notebook, so this cell fails on a fresh kernel unless they are created
# elsewhere — confirm. `print_topic(index, 10)` presumably lists the 10 top
# words of a topic (gensim-style API) — TODO confirm.
for index, score in sorted(lda_model[bow_corpus[42]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))