In [63]:
%%capture
# Run the project setup script with its output suppressed by %%capture.
# NOTE(review): `df` and `path`, used by later cells, are not defined anywhere
# in the cells shown here - presumably full_setup.py provides them; confirm.
%run full_setup.py

In [64]:
from tfidf_corpus_dictionary import get_tfidf_tokendocs_corpus_dict
from gensim.models import LdaModel, LsiModel, CoherenceModel
from sklearn.decomposition import NMF, PCA
from sklearn.random_projection import GaussianRandomProjection
import numpy as np
from scipy import sparse

In [65]:
# Derive every shared input of the comparison from `df` in one call: the
# TF-IDF matrix and its feature names (used by the scikit-learn models below)
# plus the tokenised documents, corpus and dictionary (used by the gensim
# models below).
# NOTE(review): max_df / min_df / max_features look like the usual TF-IDF
# vocabulary filters - confirm against tfidf_corpus_dictionary.
(
    tfidf_matrix,
    feature_names,
    tokenized_docs,
    corpus,
    dictionary,
) = get_tfidf_tokendocs_corpus_dict(
    df,
    max_df=0.5,
    min_df=5,
    max_features=1000,
)

In [4]:
from coherence_by_topics import coherence_by_topics
from coherence_by_words import coherence_by_words

In [5]:
# Topic counts to sweep; each value is passed as `n` to coherence_by_topics.
topics = [5, 10, 20, 50]

In [6]:
# Sweep the topic counts in `topics`, scoring the models on the same corpus,
# dictionary, tokenised texts and TF-IDF matrix at every setting.
# Result: {topic count -> list of (model label, score) pairs}.
evaluation_by_topics = {}

for n_topics in topics:
    # Scores for this topic count only. Kept under its own name so it cannot
    # be confused with the `metrics_words` variable of the words sweep.
    metrics_topics = coherence_by_topics(
        n=n_topics,
        corpus=corpus,
        dictionary=dictionary,
        texts=tokenized_docs,
        feature_names=feature_names,
        tfidf=tfidf_matrix,
    )

    # Filled incrementally so an interrupted run keeps the finished settings.
    evaluation_by_topics[n_topics] = metrics_topics

In [7]:
# Scores for the 5-topic run: one (model label, score) pair per method.
evaluation_by_topics[5]

[('LDA', 0.4505),
 ('LSA', 0.4028),
 ('NMF', 0.6861),
 ('PCA', 0.4724),
 ('RP', 0.219)]

In [8]:
# Scores for the 10-topic run: one (model label, score) pair per method.
evaluation_by_topics[10]

[('LDA', 0.4327),
 ('LSA', 0.3684),
 ('NMF', 0.6761),
 ('PCA', 0.4148),
 ('RP', 0.2127)]

In [9]:
# Scores for the 20-topic run: one (model label, score) pair per method.
evaluation_by_topics[20]

[('LDA', 0.402),
 ('LSA', 0.3546),
 ('NMF', 0.6258),
 ('PCA', 0.3384),
 ('RP', 0.2095)]

In [10]:
# Scores for the 50-topic run: one (model label, score) pair per method.
evaluation_by_topics[50]

[('LDA', 0.3503),
 ('LSA', 0.3283),
 ('NMF', 0.5271),
 ('PCA', 0.3126),
 ('RP', 0.2034)]

In [11]:
# Values swept as `n` in coherence_by_words.
# NOTE(review): presumably the vocabulary size (n=1000 reproduces the 5-topic
# scores obtained with max_features=1000) - confirm against coherence_by_words.
words = [10, 100, 1000, 10000]

In [12]:
# Sweep the values in `words`, scoring the models at each setting.
# Result: {n -> list of (model label, score) pairs}.
evaluation_by_words = {}

for n_words in words:
    # The helper receives the raw `df`, not the precomputed TF-IDF artefacts.
    # NOTE(review): presumably it rebuilds them per setting - confirm.
    metrics_words = coherence_by_words(df, n = n_words)
    evaluation_by_words[n_words] = metrics_words

In [13]:
# Scores for n=10: one (model label, score) pair per method.
# NOTE(review): all five methods report exactly the same score here - worth
# confirming this is expected at n=10 rather than a degenerate run.
evaluation_by_words[10]

[('LDA', 0.4636),
 ('LSA', 0.4636),
 ('NMF', 0.4636),
 ('PCA', 0.4636),
 ('RP', 0.4636)]

In [14]:
# Scores for n=100: one (model label, score) pair per method.
evaluation_by_words[100]

[('LDA', 0.4312),
 ('LSA', 0.4129),
 ('NMF', 0.4591),
 ('PCA', 0.3411),
 ('RP', 0.4035)]

In [15]:
# Scores for n=1000: one (model label, score) pair per method.
# These values match the 5-topic results of the topics sweep.
evaluation_by_words[1000]

[('LDA', 0.4505),
 ('LSA', 0.4028),
 ('NMF', 0.6861),
 ('PCA', 0.4724),
 ('RP', 0.219)]

In [16]:
# Scores for n=10000: one (model label, score) pair per method.
evaluation_by_words[10000]

[('LDA', 0.5758),
 ('LSA', 0.6189),
 ('NMF', 0.7256),
 ('PCA', 0.6201),
 ('RP', 0.6917)]

In [20]:
from tables import tables

In [22]:
# Pass the topics-sweep scores to the project `tables` helper.
# NOTE(review): presumably writes a table under `path` - confirm. `path` is not
# defined in the cells shown here; assumed to come from full_setup.py.
tables(evaluation_by_topics, 'topics', path)

In [23]:
# Pass the words-sweep scores to the project `tables` helper.
# NOTE(review): presumably writes a table under `path` - confirm. `path` is not
# defined in the cells shown here; assumed to come from full_setup.py.
tables(evaluation_by_words, 'words', path)

In [None]:
from plots import plots

In [None]:
# Pass the topics-sweep scores to the project `plots` helper.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# had not been run; `path` is assumed to come from full_setup.py - confirm.
plots(evaluation_by_topics, 'topics', path)

In [None]:
# Pass the words-sweep scores to the project `plots` helper.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# had not been run; `path` is assumed to come from full_setup.py - confirm.
plots(evaluation_by_words, 'words', path)

In [66]:
from display_topics import display_topics

In [67]:
# Fit a 5-topic LDA model on the gensim corpus; the fixed random_state keeps
# the run repeatable.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    alpha='symmetric',
    eta='auto',
    passes=5,
    random_state=1,
)

In [68]:
# List the terms display_topics reports for each of the five LDA topics.
display_topics('LDA', lda_model, feature_names)

Topic 1: government, gun, state, year, apr, law, car, clinton, tax, drug, netcomcom, ca, case, work, distribution
Topic 2: god, israel, christian, israeli, armenian, jew, jesus, believe, turkish, arab, church, bible, kill, religion, apr
Topic 3: space, email, post, xnewsreader, computer, pl, bank, tin, science, gordon, distribution, information, version, mail, system
Topic 4: window, drive, bike, card, work, henry, problem, system, sale, program, file, run, computer, email, monitor
Topic 5: game, team, player, pat, play, win, hockey, season, baseball, year, fan, score, league, wing, playoff


In [69]:
# Fit a 5-topic LSI (LSA) model on the same gensim corpus, seeded for
# repeatability.
lsi_model = LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    random_seed=1,
)

In [70]:
# List the terms display_topics reports for each of the five LSA topics.
display_topics('LSA', lsi_model, feature_names)

Topic 1: apr, system, work, year, problem, window, distribution, god, computer, drive, run, file, car, state, world
Topic 2: window, god, file, card, drive, driver, christian, program, jesus, believe, system, email, disk, problem, video
Topic 3: game, team, god, player, win, play, year, season, hockey, christian, jesus, score, baseball, fan, file
Topic 4: window, god, key, chip, file, car, clipper, game, encryption, government, christian, team, jesus, win, program
Topic 5: drive, car, key, file, window, clipper, chip, encryption, scsi, card, sale, god, program, bike, government


In [71]:
# Fit a 5-component NMF directly on the TF-IDF matrix, seeded for
# repeatability.
nmf_model = NMF(
    n_components=5,
    random_state=1,
).fit(tfidf_matrix)

In [72]:
# List the terms display_topics reports for each of the five NMF topics.
display_topics('NMF', nmf_model, feature_names)

Topic 1: key, apr, government, state, year, car, gun, chip, system, distribution, law, work, clipper, ca, space
Topic 2: window, file, program, run, version, help, problem, application, image, graphic, driver, email, display, server, manager
Topic 3: game, team, win, player, play, year, season, hockey, score, baseball, fan, nhl, playoff, league, run
Topic 4: god, christian, jesus, believe, bible, christ, faith, church, atheist, life, religion, belief, truth, christianity, sin
Topic 5: drive, card, sale, scsi, mb, disk, monitor, mac, driver, video, system, problem, hard, controller, work


In [73]:
# PCA is fitted on a dense array here, so densify the TF-IDF matrix first.
# `toarray()` returns a plain ndarray, whereas `todense()` returns the legacy
# `np.matrix` type.
tfidf_matrix_dense = tfidf_matrix.toarray() if sparse.issparse(tfidf_matrix) else tfidf_matrix

# No-op for an ndarray; still normalises any other array-like input.
tfidf_matrix_array = np.asarray(tfidf_matrix_dense)

# Column-wise centring. scikit-learn's PCA already centres its input before
# the decomposition, so this step does not change the fitted components; it is
# kept so `mean_tfidf` and `centered_tfidf_matrix` stay available to any later
# cell.
mean_tfidf = tfidf_matrix_array.mean(axis=0)
centered_tfidf_matrix = tfidf_matrix_array - mean_tfidf

# Fit a 5-component PCA, seeded for repeatability.
pca_model = PCA(n_components=5, random_state=1).fit(centered_tfidf_matrix)

In [74]:
# List the terms display_topics reports for each of the five PCA components.
display_topics('PCA', pca_model, feature_names)

Topic 1: god, christian, jesus, believe, year, law, bible, team, life, jew, church, religion, christ, faith, game
Topic 2: game, team, player, win, play, year, season, hockey, score, baseball, fan, nhl, playoff, league, toronto
Topic 3: key, chip, car, clipper, encryption, government, gun, system, phone, drive, algorithm, buy, netcomcom, public, law
Topic 4: drive, car, scsi, sale, card, god, bike, mb, price, buy, hard, ide, speed, controller, disk
Topic 5: god, key, chip, card, game, drive, scsi, clipper, encryption, system, team, mb, jesus, christian, disk


In [75]:
# Fit a 5-component Gaussian random projection on the TF-IDF matrix.
# NOTE(review): per the scikit-learn docs the projection matrix is drawn at
# random and fit() only uses the input's shape - confirm. That would explain
# the unrelated terms in the RP topics listed below.
rp_model = GaussianRandomProjection(
    n_components=5,
    random_state=1,
).fit(tfidf_matrix)

In [76]:
# List the terms display_topics reports for each of the five RP components.
display_topics('RP', rp_model, feature_names)

Topic 1: modem, recent, explain, communication, power, decide, step, side, spend, comment, claim, attack, georgia, hardware, application
Topic 2: output, definition, fit, hand, american, however, political, average, certainly, create, replace, wm, price, illinois, select
Topic 3: handle, sit, single, describe, red, design, center, development, range, score, charge, later, bike, report, email
Topic 4: apply, tim, software, tin, suspect, wm, express, inside, uunet, feel, chicago, suppose, aid, memory, video
Topic 5: build, hell, defense, muslim, month, radio, men, several, depend, company, data, half, dave, flame, yous
