In [14]:
import os
import pandas as pd
import numpy as np

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [8]:
aclImdb_path = os.path.join("D:\\", "program", "programming", "study", "data", "aclImdb_v1")
reviews_train = load_files(os.path.join(aclImdb_path, "train"))
reviews_test = load_files(os.path.join(aclImdb_path, "test"))

In [9]:
df_train = pd.DataFrame({"data": reviews_train["data"],
                         "target": reviews_train["target"]})

df_train = df_train.drop(df_train[df_train["target"] == 2].index, axis=0)
df_train["data"] = df_train["data"].str.decode('utf-8').str.replace("b['\"]|'|<br />", "", regex=True)

In [10]:
df_test = pd.DataFrame({"data": reviews_test["data"],
                        "target": reviews_test["target"]})
df_test = df_test.drop(df_test[df_test["target"] == 2].index, axis=0)
df_test["data"] = df_test["data"].str.decode('utf-8').str.replace("b['\"]|'|<br />", "", regex=True)

In [11]:
vect = CountVectorizer(max_features=10000, max_df=0.15)

X = vect.fit_transform(df_train["data"])

In [12]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                max_iter=25, random_state=0)

document_topics = lda.fit_transform(X)

In [13]:
lda.components_.shape

(10, 10000)

In [15]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names_out())

In [16]:
import mglearn
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
performance   work          world         im            john          
role          director      us            didnt         role          
actors        seems         war           worst         cast          
actor         interesting   our           thing         wife          
cast          rather        family        nothing       woman         
performances  however       real          actors        young         
excellent     quite         years         actually      murder        
played        audience      american      minutes       plays         
plays         doesnt        own           ive           played        
play          between       documentary   cant          michael       


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
game

In [17]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [41]:
pipe_latent = make_pipeline(
    CountVectorizer(stop_words="english"),
    LatentDirichletAllocation(n_components=10, random_state=42),
    LogisticRegression(max_iter=300)
)

pipe_latent.fit(df_train["data"], df_train["target"])

In [39]:
pipe_no_latent = make_pipeline(
    CountVectorizer(stop_words="english"),
    LogisticRegression(max_iter=300)
)

pipe_no_latent.fit(df_train["data"], df_train["target"])

In [37]:
pipe_ngram = make_pipeline(
    CountVectorizer(stop_words="english", ngram_range=(1, 2)),
    LogisticRegression(max_iter=300)
)

pipe_ngram.fit(df_train["data"], df_train["target"])

In [42]:
score_latent = pipe_latent.score(df_test["data"], df_test["target"])
score_no_latent = pipe_no_latent.score(df_test["data"], df_test["target"])
score_ngram = pipe_ngram.score(df_test["data"], df_test["target"])

print("score_latent", score_latent)
print("score_no_latent", score_no_latent)
print("score_ngram", score_ngram)

score_latent 0.70184
score_no_latent 0.86048
score_ngram 0.88
