In [None]:
from topicsfinder import TopicsFinder
from textfilereader import TextFileReader

In [None]:
# Load the sample workbook through TextFileReader and keep the resulting frame in `data`.
# NOTE(review): the string handed to get_dataframe is presumably the column to extract — confirm
# against TextFileReader.
workbook_path = './sample_data/CSS_Hiring_Data_FedEmployee_Reason_OTHER_v1.xlsx'
field_name = 'Reason for filling position(s) with Federal Government Employee -OTHER'

reader = TextFileReader(workbook_path)
data = reader.get_dataframe(field_name)

# Last expression of the cell: show the first rows as the cell output.
data.head()

In [None]:
# Training settings passed unchanged to every LDA fit in the tuning cell.
# The number of topics is intentionally not fixed here (it used to be 10); the
# tuning cell sweeps it instead.
chunksize = 2_000
passes = 20
iterations = 400

# None disables model perplexity evaluation during training, which takes too much time.
eval_every = None


In [None]:
# Build the TopicsFinder over the loaded data. The tuning cell reads its `corpus`
# attribute and calls its `fit_LDA_model` method.
finder = TopicsFinder(data)


In [None]:
import numpy as np
import pandas as pd
import tqdm
import gensim

# Exhaustive grid search over LDA hyperparameters (number of topics, alpha, eta).
# Every combination is fitted through `finder.fit_LDA_model`, its coherence score is
# recorded, and the full table is written to 'lda_tuning_results.csv'.

# NOTE(review): `grid` is not read anywhere in the visible notebook; it is kept only
# in case another cell relies on it.
grid = {}
grid['Validation_Set'] = {}

# Topics range (max_topics is exclusive, so this sweeps 2..10).
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: a coarse numeric grid plus the named priors.
# (Use `alpha = [0.01]` for a quick smoke run.)
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter (passed to the model as `eta`).
# (Use `beta = [0.01]` for a quick smoke run.)
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets. Smaller subsets (25%, 50%) can be added the same way.
corpus = finder.corpus
num_of_docs = len(corpus)
corpus_sets = [
    # ClippedCorpus takes a document *count*; cast to int because the product with
    # 0.75 is a float and a float limit breaks iteration over the clipped corpus.
    gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.75)),
    corpus,
]
corpus_title = ['75% Corpus', '100% Corpus']

# One list per output column; rows are appended in lock-step inside the loop.
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run; set to False to skip the search.
RUN_GRID_SEARCH = True
if RUN_GRID_SEARCH:
    # Derive the progress-bar total from the grids so it stays correct when they change.
    total_fits = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    # The context manager closes the bar even if one of the fits raises.
    with tqdm.tqdm(total=total_fits) as pbar:
        # iterate through validation corpuses
        for i in range(len(corpus_sets)):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # NOTE(review): corpus_sets[i] is never handed to fit_LDA_model,
                        # so every "validation set" appears to be fitted on the finder's
                        # own corpus and only the label differs. Confirm whether
                        # fit_LDA_model can accept a corpus and pass corpus_sets[i] if so.
                        mod, cv = finder.fit_LDA_model(
                            random_state=100,
                            chunksize=chunksize,
                            passes=passes,
                            iterations=iterations,
                            eval_every=eval_every,
                            num_topics=k,
                            alpha=a,
                            eta=b,
                        )
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv.get_coherence())

                        pbar.update(1)

    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

In [1]:
from topicsfinder import TopicsFinder
from textfilereader import TextFileReader
import optuna
import numpy as np

# Load the workbook that the Optuna objective trains on; the result is kept in `data`.
# An alternative sample was previously used here instead:
#   path:  './sample_data/data.xlsx'
#   field: 'Please briefly describe an example of one burdensome administrative task or process which you believe is "low value"'
workbook_path = './sample_data/CSS_Hiring_Data_FedEmployee_Reason_OTHER_v1.xlsx'
field_name = 'Reason for filling position(s) with Federal Government Employee -OTHER'

reader = TextFileReader(workbook_path)
data = reader.get_dataframe(field_name)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kapangyu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kapangyu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kapangyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
class StopWhenTrialKeepBeingPrunedCallback:
    """Study callback that stops the study after a run of consecutive pruned trials.

    An instance is meant to be passed in the ``callbacks`` list of
    ``study.optimize``; each call receives the study and one trial.
    """

    def __init__(self, threshold: int):
        # Length of the pruned streak that triggers the stop.
        self.threshold = threshold
        # Number of pruned trials seen back-to-back so far.
        self._consecutive_pruned_count = 0

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        """Update the pruned streak from ``trial`` and stop ``study`` once it reaches the threshold."""
        was_pruned = trial.state == optuna.trial.TrialState.PRUNED
        # A pruned trial extends the streak; any other outcome resets it.
        self._consecutive_pruned_count = self._consecutive_pruned_count + 1 if was_pruned else 0

        if self._consecutive_pruned_count < self.threshold:
            return
        study.stop()

In [3]:
def objective(trial):
    """Optuna objective: fit one LDA model with sampled settings and return its coherence.

    Samples ``num_topics``, ``alpha``, ``eta``, ``chunksize``, ``passes`` and
    ``iterations`` from ``trial``, builds a fresh ``TopicsFinder`` over the
    module-level ``data`` and fits it with those settings (fixed
    ``random_state=100``, ``eval_every=None``).

    Args:
        trial: the Optuna trial used to sample parameters and report the score.

    Returns:
        The value of ``get_coherence()`` on the coherence object returned by the fit.

    Raises:
        optuna.exceptions.TrialPruned: if ``trial.should_prune()`` is true after
            the score has been reported.
    """
    # Numeric candidates shared by both priors; each prior adds its named options.
    numeric_priors = list(np.arange(0.01, 1, 0.3))

    # Dict entries are evaluated top to bottom, so parameters are sampled in this order.
    sampled = {
        'num_topics': trial.suggest_int('num_topics', 1, 10),
        'alpha': trial.suggest_categorical('alpha', numeric_priors + ['symmetric', 'asymmetric']),
        'eta': trial.suggest_categorical('eta', numeric_priors + ['symmetric']),
        'chunksize': trial.suggest_int('chunksize', 100, 2000, step=100),
        'passes': trial.suggest_int('passes', 1, 10, step=2),
        'iterations': trial.suggest_int('iterations', 50, 500, step=50),
    }

    topic_finder = TopicsFinder(data)
    _, coherence_model = topic_finder.fit_LDA_model(
        random_state=100,
        eval_every=None,
        **sampled,
    )
    coherence = coherence_model.get_coherence()

    # The score is reported once, as step 0, and the pruning decision is taken on it.
    trial.report(coherence, 0)
    if not trial.should_prune():
        return coherence
    raise optuna.exceptions.TrialPruned()


In [6]:
import logging
# `import logging` alone does not load the `handlers` submodule; without this explicit
# import the RotatingFileHandler lookup only works if another library imported it first.
import logging.handlers
import os

# Mirror Optuna's log records into a size-capped rotating file (100000 bytes, 3 backups).
# The handler is attached only once, so re-running this cell does not duplicate log lines.
optuna_logger = optuna.logging.get_logger("optuna")
optuna_log_path = os.path.abspath("optuna.log")
handler_already_attached = any(
    isinstance(handler, logging.handlers.RotatingFileHandler)
    and handler.baseFilename == optuna_log_path
    for handler in optuna_logger.handlers
)
if not handler_already_attached:
    optuna_logger.addHandler(
        logging.handlers.RotatingFileHandler("optuna.log", maxBytes=100000, backupCount=3)
    )

# Stop the whole study once three trials in a row have been pruned.
study_stop_cb = StopWhenTrialKeepBeingPrunedCallback(3)

# Create a study object and optimize the objective function.
# The study is stored in a local SQLite file; load_if_exists=True reuses a study of the
# same name instead of failing when the cell is run again.
study_name = "topicsanalyser-study"
storage_name = f"sqlite:///{study_name}.db"

study = optuna.create_study(direction='maximize', study_name=study_name, storage=storage_name, load_if_exists=True)
study.optimize(objective, n_trials=100, callbacks=[study_stop_cb])

[32m[I 2021-02-09 10:00:55,643][0m A new study created in RDB with name: topicsanalyser-study[0m
[32m[I 2021-02-09 10:01:21,080][0m Trial 0 finished with value: 0.3864179810644952 and parameters: {'num_topics': 8, 'alpha': 0.31, 'eta': 0.9099999999999999, 'chunksize': 1200, 'passes': 7, 'iterations': 350}. Best is trial 0 with value: 0.3864179810644952.[0m
[32m[I 2021-02-09 10:01:40,493][0m Trial 1 finished with value: 0.3044410561921815 and parameters: {'num_topics': 5, 'alpha': 0.01, 'eta': 0.61, 'chunksize': 1200, 'passes': 3, 'iterations': 100}. Best is trial 0 with value: 0.3864179810644952.[0m
[32m[I 2021-02-09 10:02:03,551][0m Trial 2 finished with value: 0.2885890052312071 and parameters: {'num_topics': 5, 'alpha': 0.9099999999999999, 'eta': 0.61, 'chunksize': 1300, 'passes': 7, 'iterations': 150}. Best is trial 0 with value: 0.3864179810644952.[0m
[32m[I 2021-02-09 10:02:30,064][0m Trial 3 finished with value: 0.2911753173800437 and parameters: {'num_topics': 3, 

In [7]:
# Summarise the study: trial counts by state, the best trial with its parameters,
# and the parameter importances. (`study.best_params` / `study.best_value` give the
# same best-trial information in short form.)

def _trials_with_state(wanted_state):
    """Return the trials of `study` whose state equals `wanted_state`."""
    return [candidate for candidate in study.trials if candidate.state == wanted_state]


pruned_trials = _trials_with_state(optuna.trial.TrialState.PRUNED)
complete_trials = _trials_with_state(optuna.trial.TrialState.COMPLETE)

print("Study statistics: ")
for label, count in (
    ("  Number of finished trials: ", len(study.trials)),
    ("  Number of pruned trials: ", len(pruned_trials)),
    ("  Number of complete trials: ", len(complete_trials)),
):
    print(label, count)

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for param_name, param_value in trial.params.items():
    print("    {}: {}".format(param_name, param_value))

print(optuna.importance.get_param_importances(study))

Study statistics: 
  Number of finished trials:  39
  Number of pruned trials:  12
  Number of complete trials:  27
Best trial:
  Value:  0.48458337778563054
  Params: 
    alpha: asymmetric
    chunksize: 900
    eta: 0.61
    iterations: 350
    num_topics: 8
    passes: 3
OrderedDict([('num_topics', 0.5260236281253498), ('iterations', 0.2480903186728618), ('eta', 0.09051292183973614), ('chunksize', 0.07207900047780061), ('alpha', 0.06121455239947173), ('passes', 0.002079578484779893)])
