<a href="https://colab.research.google.com/github/Djensonsan/Information_Retrieval_Assignment_2/blob/main/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Information Retrieval Assignment 2: LDA


## Runtime specs

In [None]:
# Show the CPU details of the machine this notebook is running on.
!cat /proc/cpuinfo

In [None]:
# Show the memory details of the machine this notebook is running on.
!cat /proc/meminfo

## Imports

In [None]:
# Install your required packages here
!pip install pandas numpy matplotlib fsspec gcsfs dask
!pip install -q tqdm

In [None]:
import pandas as pd
import numpy as np
# NOTE: this binds the name `tqdm` to the tqdm.notebook module, but the
# `from tqdm import tqdm` line further down rebinds it to the tqdm class.
# The rest of the notebook (tqdm.pandas(), tqdm(...)) therefore uses the class.
import tqdm.notebook as tqdm
import dask.dataframe as dd

from ast import literal_eval  # used to parse the token lists read back from CSV as strings
import gensim
from gensim import corpora, models
from tqdm import tqdm  # rebinds `tqdm`, see the note above
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import; none of the cells visible here appear to need a name from it.
from nltk.stem.porter import *
from nltk.corpus import stopwords
import nltk
# Download the NLTK resources this notebook relies on.
nltk.download('wordnet')    # WordNet data for WordNetLemmatizer
nltk.download('punkt')      # tokenizer models for nltk.word_tokenize
nltk.download('stopwords')  # stop-word lists for nltk.corpus.stopwords

In [None]:
# Mount Google Drive in Colab:
# NOTE(review): `storage` is not referenced in any of the cells visible here.
from google.cloud import storage
from google.colab import drive
# Mount point is /content/drive; the dataset paths used below live under it.
drive.mount('/content/drive')

## Preprocessing

In [None]:
# PyCharm (local run):
# data = pd.read_csv('data/news_dataset.csv')

In [None]:
# Colab: load the news dataset from the mounted Google Drive.
news_dataset_path = '/content/drive/MyDrive/IR-Assignment-2/data/news_dataset.csv'
data = pd.read_csv(news_dataset_path)

### Exploration

In [None]:
# Summarize the loaded frame: columns, dtypes and non-null counts.
data.info()

In [None]:
# Show the first 43 rows (positions 0-42) so that document 42, the running example, is included.
data.head(n=43)

In [None]:
# Use document 42 as the running example: display the 'content' value of the row labelled 42.
data.loc[42, 'content']

### Keep document content

In [None]:
# Only the 'content' column is processed from here on.
data_content = data['content']

In [None]:
# Check what kind of object the column selection returned.
type(data_content)

In [None]:
# Peek at the first three documents.
data_content.head(n=3)

### Tokenization, Stemming and Lemmatization

In [None]:
# Hook tqdm into pandas; the cells below rely on the resulting `progress_apply`.
tqdm.pandas()
# Note (Jens): we might want to use Dask to speed things up.
# As far as I know, tqdm cannot be used together with Dask.

In [None]:
# There are NaN values in the dataset; drop them before tokenizing.
# Reassign instead of using inplace=True: data_content was selected from `data`,
# and mutating such a selection in place is discouraged by pandas.
# The plain reassignment is also safe to re-run.
data_content = data_content.dropna()

In [None]:
# Sanity check: evaluates to True if any missing value is still present.
data_content.isna().any()

In [None]:
# Tokenization: split every document into a list of tokens with NLTK.
data_content_tokenized = data_content.progress_apply(nltk.word_tokenize)

In [None]:
# Remove words shorter than 3 characters
def _drop_short_tokens(tokens):
    """Return the tokens that are longer than two characters."""
    return [token for token in tokens if len(token) > 2]


data_content_tokenized = data_content_tokenized.progress_apply(_drop_short_tokens)

In [None]:
# Stemming and Lemmatization
# Each token is first lemmatized with WordNet; the lemma is then stemmed with Snowball.
stemmer = SnowballStemmer("english")
# Build the lemmatizer once instead of constructing a new one for every token.
lemmatizer = WordNetLemmatizer()
data_content_stemmed = data_content_tokenized.progress_apply(
    lambda tokens: [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]
)

In [None]:
# Remove stop words
# The tokens have already been lemmatized and stemmed, so a stop word whose
# normalized form differs from its dictionary form (e.g. "because" -> "becaus")
# would slip through a comparison against the raw stop-word list alone.
# Filter against the raw stop words and their normalized forms, using the same
# lemmatize-then-stem steps as the cell above.
raw_stop_words = set(stopwords.words('english'))
stop_word_lemmatizer = WordNetLemmatizer()
stop_words = raw_stop_words | {
    stemmer.stem(stop_word_lemmatizer.lemmatize(word)) for word in raw_stop_words
}
data_content_clean = data_content_stemmed.progress_apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Peek at the first cleaned documents.
data_content_clean.head()

In [None]:
# data_content_clean contains the cleaned 'content' column of the news dataset.
# Write it to Google Drive so the LDA part can start from this file.
clean_dataset_path = '/content/drive/MyDrive/IR-Assignment-2/data/new_dataset_clean.csv'
data_content_clean.to_csv(clean_dataset_path)

# LDA

The following part uses Python libraries to fit an LDA (Latent Dirichlet Allocation) topic model. The result can be used for comparison with our own model.

The code follows a tutorial:
[LDA Tutorial](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

In [None]:
# Load the cleaned dataset that the preprocessing part wrote to Google Drive.
clean_dataset_path = '/content/drive/MyDrive/IR-Assignment-2/data/new_dataset_clean.csv'
data_content_clean = pd.read_csv(clean_dataset_path)

In [None]:
# Keep only the 'content' column. This rebinds data_content_clean from the frame
# returned by read_csv to that single column, so re-run the read_csv cell before
# running this cell a second time.
# The entries are token lists stored as strings; later cells parse them with literal_eval.
data_content_clean = data_content_clean['content']

In [None]:
# Peek at the first cleaned documents as read back from the CSV file.
data_content_clean.head()

In [None]:
# Build the gensim dictionary one document at a time, with a progress bar.
dictionary = gensim.corpora.Dictionary()
for serialized_tokens in tqdm(data_content_clean):
    # Each entry is a token list stored as a string; parse it back into a list.
    document_tokens = literal_eval(serialized_tokens)
    dictionary.add_documents([document_tokens])

## Sanity check

In [None]:
# Print a short summary of the dictionary built above.
print(dictionary)

In [None]:
# Print the first 11 (id, token) pairs of the dictionary as a sanity check.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

In [None]:
# Prune the vocabulary:
#   - drop tokens that appear in fewer than 15 documents,
#   - drop tokens that appear in more than 0.5 of the documents
#     (a fraction of the total corpus size, not an absolute number),
#   - of the remaining tokens, keep only the 100000 most frequent ones.
vocabulary_limits = {'no_below': 15, 'no_above': 0.5, 'keep_n': 100000}
dictionary.filter_extremes(**vocabulary_limits)

## doc2bow

For each document we create a bag-of-words representation: a list of (token id, count) pairs reporting which dictionary words appear in the document and how many times each of them appears.

In [None]:
# Convert every document (a token list stored as a string) into its bag-of-words form.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in map(literal_eval, data_content_clean)]
# NOTE(review): 42 is a list position here. Rows with missing content were dropped
# during preprocessing and the CSV was read back with a fresh index, so this is the
# same article as data.loc[42] only if no earlier row was dropped - TODO confirm.
bow_corpus[42]

In [None]:
# List every dictionary word of the example document together with its count.
bow_doc_42 = bow_corpus[42]
for token_id, occurrences in bow_doc_42:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

## TF-IDF

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that same corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
# Print only the first weighted document as a preview.
first_tfidf_doc = next(iter(corpus_tfidf), None)
if first_tfidf_doc is not None:
    print(first_tfidf_doc)

## LDA Library: gensim.models.LdaMulticore

In [None]:
# Train a 20-topic LDA model on the bag-of-words corpus (2 passes, 2 worker processes).
# random_state seeds the model's random number generator so that repeated runs start
# from the same initial state; without it every run produces different topics.
# NOTE(review): with several workers some run-to-run variation may remain.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=20,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [None]:
# Print each topic returned by the trained model together with its word description.
topic_line = 'Topic: {} \nWords: {}'
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(topic_line.format(topic_id, topic_terms))

## Evaluation

In [None]:
# Inspect the topic distribution of a test document: document 42.
# Topic mix noted for this document:
# 1) the economy: 40% (clearly the main subject of the article)
# 2) US politics: 27%
# 3) US Justice Department: 8%
# 4) US Congress: 6%
# Topics are listed from the highest score to the lowest.
doc_42_topics = sorted(lda_model[bow_corpus[42]], key=lambda pair: pair[1], reverse=True)
for topic_id, topic_score in doc_42_topics:
    print("\nScore: {}\t \nTopic: {}".format(topic_score, lda_model.print_topic(topic_id, 10)))