# Latent Dirichlet Allocation (LDA)

## Imports

In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
url = 'https://wagon-public-datasets.s3.amazonaws.com/05-Machine-Learning/10-Natural-Language-Processing/lda_data'
data = pd.read_csv(url, sep=",", header=None)
data.columns = ['text']
data.head()

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...


In [3]:
data.shape

(1199, 1)

## Preprocessing 

In [4]:
def preprocessing(text:str):
    text = text.strip()
    text = text.lower()
    text = "".join(char for char in text if not char.isdigit())
    text = "".join([char for char in text if char not in string.punctuation])
    tokenized = word_tokenize(text) 
    stop_words = set(stopwords.words("english"))
    tokens_cleaned = [word for word in tokenized if not word in stop_words]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="v") for token in tokens_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="s") for token in tokens_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="n") for token in tokens_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="a") for token in tokens_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="r") for token in tokens_cleaned]
    
    return " ".join(word for word in lemmatized)

In [5]:
data["clean_text"] = data["text"].apply(preprocessing)
data

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,gldcunixbcccolumbiaedu gary l dare subject sta...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlepvelaacsoaklandedu cardinal ximenez sub...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,minerkuhubccukansedu subject ancient books org...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlepvelaacsoaklandedu cardinal ximenez sub...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,vzhivovsuperiorcarletonca vladimir zhivov subj...
...,...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,jerrybeskimocom jerry kaufman subject prayers ...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,golchowyalchemychemutorontoca gerald olchowy s...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,jaynemmaltguildorg jayne kulikauskas subject q...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,sclarkepasutorontoca susan clark subject picks...


## Latent Dirichlet Allocation model

In [6]:
vectorizer = TfidfVectorizer()
vectorized_documents = vectorizer.fit_transform(data["clean_text"])
vectorized_documents = pd.DataFrame(vectorized_documents.toarray(), 
                                    columns = vectorizer.get_feature_names_out())
vectorized_documents

Unnamed: 0,aa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaauuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuugggggggggggggggg,aacc,aadams,aafreenetcarletonca,aargh,aaron,aaronbinahccbrandeisedu,aaroncathenamitedu,aarons,...,zone,zones,zoo,zoomed,zorasterism,zubov,zupancic,zurich,zwart,zzzzzz
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.084477,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.072591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1195,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1196,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1197,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
lda_model = LatentDirichletAllocation(
    n_components=2,
    max_iter=100
)
lda_model.fit(vectorized_documents)

## Visualize potential topics

In [8]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])

In [9]:
print_topics(lda_model, vectorizer)

Topic 0:
[('god', 29.935881156678292), ('would', 25.820957995858624), ('one', 23.03529629264687), ('subject', 22.444315889159515), ('organization', 21.561790770846976), ('university', 21.489052532123623), ('lines', 21.48783691504057), ('writes', 20.41482239110005), ('people', 20.384879800379434), ('game', 19.570655693280912)]
Topic 1:
[('testing', 1.4443946707915405), ('rfl', 1.111981862579948), ('tennessee', 1.111981862579948), ('khettryrwpubutkedu', 1.111981862579948), ('dee', 0.8977828370784797), ('howell', 0.8890814303934456), ('utk', 0.7803468811935178), ('addresses', 0.7542750155386009), ('basingstoke', 0.6970077875746488), ('peterborough', 0.6970077875746488)]


## Predict the document-topic mixture of a new text

In [10]:
example = ["My team performed poorly last season. Their best player was out injured and only played one game"]

In [11]:
vectorizer = TfidfVectorizer()
vectorized_document = vectorizer.fit_transform(example)
vectorized_document = pd.DataFrame(vectorized_document.toarray(), 
                                    columns = vectorizer.get_feature_names_out())

vectorized_document

Unnamed: 0,and,best,game,injured,last,my,one,only,out,performed,played,player,poorly,season,team,their,was
0,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536,0.242536


In [12]:
lda_model.fit_transform(vectorized_document)

array([[0.1358838, 0.8641162]])