# 0.5 LDA

This notebook builds an LDA (Latent Dirichlet Allocation) topic model from the tokenized statements.

In [6]:
import os

import pandas as pd
import engarde.decorators as ed
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from joblib import dump

In [7]:
# Project root is the parent of the directory the notebook runs from.
PROJ_ROOT = os.pardir

In [10]:
def load_data(filename="tokenized.feather"):
    """Load a processed dataset from ``<PROJ_ROOT>/data/processed``.

    Parameters
    ----------
    filename : str, optional
        Name of the feather file inside the processed-data directory.
        Defaults to ``"tokenized.feather"``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``pandas.read_feather`` for that file.
    """
    # Pass each component separately so os.path.join supplies the separator,
    # rather than concatenating strings with hard-coded "/" characters.
    read_path = os.path.join(PROJ_ROOT, "data", "processed", filename)

    return pd.read_feather(read_path)

In [11]:
# Read the tokenized statements from disk once; later cells reuse this frame.
tokenized = load_data()

In [12]:
# Sanity-check the load: column names, dtypes and non-null counts.
tokenized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124089 entries, 0 to 124088
Data columns (total 7 columns):
site_name           124089 non-null object
documentid          124089 non-null object
customquestionid    124089 non-null int64
questiontext        124089 non-null object
answertext          124089 non-null object
submissiondate      124089 non-null datetime64[ns]
proc_answers        124089 non-null object
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 6.6+ MB


In [13]:
# NOTE: despite the "tfidf" names (kept so later cells keep working), this is a
# CountVectorizer, so the matrix holds raw term counts rather than TF-IDF weights.
MIN_DF = 5    # ignore terms that appear in fewer than 5 documents
MAX_DF = 0.9  # ignore terms that appear in more than 90% of documents

tfidf_vectorizer = CountVectorizer(min_df=MIN_DF, max_df=MAX_DF)

tfidf = tfidf_vectorizer.fit_transform(tokenized["proc_answers"])

In [14]:
# Build the path from separate components so os.path.join supplies the
# separator instead of relying on hard-coded "/" in concatenated strings.
write_path = os.path.join(PROJ_ROOT, "models", "tfidf.joblib")
# Persist the document-term matrix; dump() stays the last expression so the
# cell still displays the list of written file names.
dump(tfidf, write_path)

['../models/tfidf.joblib']

In [15]:
%%time

N_TOPICS = 20

lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=42)

topic_model = lda.fit(tfidf)

CPU times: user 3min 23s, sys: 2.36 s, total: 3min 26s
Wall time: 3min 35s


In [16]:
# Build the path from separate components so os.path.join supplies the
# separator instead of relying on hard-coded "/" in concatenated strings.
# The topic count is embedded in the file name (e.g. lda_20.joblib).
write_path = os.path.join(PROJ_ROOT, "models", "lda_{}.joblib".format(N_TOPICS))
# Persist the fitted model; dump() stays the last expression so the cell
# still displays the list of written file names.
dump(topic_model, write_path)

['../models/lda_20.joblib']