# Initialization

In [1]:
import os
import re
import numpy as np
import random
import time
import json

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline

### set experiment dates

In [2]:
year = 2005 

### set run

In [3]:
run = 27

In [4]:
run_prefix = 'run_%s_xx' % run

### set directories

In [5]:
dir_root = os.path.join('..')
dir_data_runs = os.path.join(dir_root, 'data_runs')
dir_run = os.path.join(dir_data_runs, run_prefix)
dir_reports =  os.path.join(dir_run, 'reports')
dir_plots = os.path.join(dir_run, 'interpretation', 't_search_plots')

In [6]:
if not os.path.exists(dir_plots):
    os.makedirs(dir_plots)

### set flags

In [7]:
flag_debug = False
flag_serialize_findings = False
flag_print_tables = False

# Read data reports from files

In [8]:
def read_report(dir_name, file_name):
    file_path = os.path.join(dir_name, file_name)
    result = ''
    with open(file_path, 'r') as f_r:
        for text_line in f_r:
            result = result + text_line.strip()
    return result        

In [9]:
reports_list = []

regExp = re.compile('[A-Z\d]+\_[A-Z\d]+\-' + str(year) + '\_[\d]+\.txt$')
reports_of_year = [f for f in os.listdir(dir_reports) if re.search(regExp, f)]
reports_of_year.sort()
# for every reports of the year
for report_name in reports_of_year:
    reports_list.append(read_report(dir_reports, report_name))

print('Done on %s, number of documents: %s' % (year, len(reports_of_year)))

Done on 2005, number of documents: 234


In [10]:
reports_list[0][:30]

'mining headline platinum diamo'

# Corpus construction

In [11]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

## Create Corpus

Shuffle reporst

In [12]:
seed = 99

Best randon seed is 99

In [13]:
random.Random(seed).shuffle(reports_list)

Split between training and test set

In [14]:
testing_size = len(reports_list) // 10
training_size = len(reports_list) - testing_size

In [15]:
print('Total %s reprorts' % len(reports_list))
print('Training %s reprorts' % training_size)
print('Testing %s reprorts' % testing_size)

Total 234 reprorts
Training 211 reprorts
Testing 23 reprorts


## Create vectorize reports 

In [16]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=1,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

In [17]:
reports_vectorized_training = vectorizer.fit_transform(reports_list[:training_size])
reports_vectorized_testing = vectorizer.fit_transform(reports_list[:-testing_size])

# Find optimal topic number

In [18]:
def scan_T_sklearn(training_corpus, 
                   test_corpus, 
                   limit, 
                   start=2, 
                   step=3, 
                   passes=10, 
                   random_state=100):
    
    train_p_values = []
    test_p_values = []
    model_list = []

    i = 0
    
    for num_topics in range(start, limit + 1, step):

        lda_model = LatentDirichletAllocation(batch_size=128,
              doc_topic_prior=None,
              evaluate_every=-1, 
              learning_decay=0.7,
              learning_method='online',
              learning_offset=10.0,
              max_doc_update_iter=100,
              max_iter=passes, 
              mean_change_tol=0.001,
              n_components=num_topics,
              n_jobs=-1,
              perp_tol=0.1,
              random_state=random_state,
              topic_word_prior=None,
              total_samples=1000000.0,
              verbose=0)
        
        model = lda_model.fit_transform(training_corpus)

        model_list.append(model)
                
        train_perplexity = lda_model.perplexity(training_corpus)
        train_p_values.append(train_perplexity)
        
        test_perplexity = lda_model.perplexity(test_corpus)
        test_p_values.append(test_perplexity)
        
        print('Done on indx: %s T: %s, Test perplexity: %s' % (i, num_topics, test_perplexity))
        
        i += 1
        
    
    return model_list, train_p_values, test_p_values

In [19]:
min_T = 1
max_T = 100
step_T = 1
passes_T = 51
random_state = 193748

### Best run params:

- min_T = 1
- max_T = 100
- step_T = 1
- passes_T = 51
- random_state = 193748

In [20]:
model_list = None
train_perplexity_values = None
test_perplexity_values = None

Run with heldout test set

In [None]:
start = time.time()

if True:
    model_list, train_perplexity_values, test_perplexity_values = scan_T_sklearn(
        training_corpus=reports_vectorized_training, 
        test_corpus=reports_vectorized_testing,
        start=min_T, 
        limit=max_T, 
        step=step_T,
        passes=passes_T,
        random_state=random_state)
    
end = time.time()

print('Execution time: %s minutes' % (end - start)/60)

Done on indx: 0 T: 1, Test perplexity: 3056.28085984862
Done on indx: 1 T: 2, Test perplexity: 2469.5003018620955
Done on indx: 2 T: 3, Test perplexity: 2180.813524410251
Done on indx: 3 T: 4, Test perplexity: 1995.437053477579
Done on indx: 4 T: 5, Test perplexity: 1846.5647635321268
Done on indx: 5 T: 6, Test perplexity: 1679.4916970156414
Done on indx: 6 T: 7, Test perplexity: 1566.083050111676
Done on indx: 7 T: 8, Test perplexity: 1496.421698683583
Done on indx: 8 T: 9, Test perplexity: 1430.635875790689
Done on indx: 9 T: 10, Test perplexity: 1347.7329497769658
Done on indx: 10 T: 11, Test perplexity: 1314.1259852339397
Done on indx: 11 T: 12, Test perplexity: 1289.7431908536016
Done on indx: 12 T: 13, Test perplexity: 1309.7411195588218
Done on indx: 13 T: 14, Test perplexity: 1220.6905613614665
Done on indx: 14 T: 15, Test perplexity: 1239.0359361599806
Done on indx: 15 T: 16, Test perplexity: 1207.0335155374983
Done on indx: 16 T: 17, Test perplexity: 1262.1486047463131




Done on indx: 17 T: 18, Test perplexity: 1147.9825915943968
Done on indx: 18 T: 19, Test perplexity: 1140.2421563423566




Done on indx: 19 T: 20, Test perplexity: 1176.6691187449162




Done on indx: 20 T: 21, Test perplexity: 1180.2194610156791




Done on indx: 21 T: 22, Test perplexity: 1152.0914542522469




In [None]:
import math
for i in range(len(coherence_values)):
    if math.isnan(coherence_values[i]):
        coherence_values[i] = 0

### serialize findings

In [None]:
scat_T_results = {
    'train_perplexity_values': train_perplexity_values,
    'test_perplexity_values': test_perplexity_values,
}

if flag_serialize_findings:
    with open(os.path.join(dir_run, 'scan_T_values_sklearn.json'), 'w') as f_w:
        f_w.write(json.dumps(scat_T_results))

# Results

In [None]:
best_T_on_perplexity = None


In [None]:
topics_range = np.array(range(min_T, max_T + 1, step_T))
best_T_on_perplexity = topics_range[np.argmin(train_perplexity_values)]


In [None]:
color_coherence = '#1f94f9'#hsl(208, 95%, 55%) # #1f94f9
color_perplexity = '#f9591f' #hsl(16, 95%, 55%) # #f9591f

## Plot

In [None]:
def plot_topics_metric(ax, data, metric = '', label = '', color = 'red', best_is_max = True, with_interest_area = False, interest_area_center = 0, interest_area_size = 5):
    x = np.array(range(min_T, max_T + 1, step_T))
    y = np.array(data)

    ax.set_xlabel('Number of Topics, T')
    ax.set_ylabel(metric, color=color)
    ax.plot(x, y, color=color, label = label)   
    
    x_best = None
    y_best = None
    if with_interest_area:
        interest_area_min = interest_area_center - interest_area_size;
        interest_area_max = interest_area_center + interest_area_size;
        ax.axvspan(interest_area_min, interest_area_max, color='green', alpha=0.2) 

        x_best = interest_area_min + np.argmax(y[interest_area_min - min_T : (interest_area_max + 1) - min_T])
        y_best = y[interest_area_min - min_T : (interest_area_max + 1) - min_T].max()
    else: 
        if best_is_max:
            x_best = x[np.argmax(y)]
            y_best = y.max()
        else:
            x_best = x[np.argmin(y)]
            y_best = y.min()            

    text = 'Num of Topics=%s, score=%f' % (x_best, y_best)

    bbox_props = dict(boxstyle='square,pad=0.3', fc='w', ec='k', lw=0.72)
    arrowprops=dict(arrowstyle='->')
    kw = dict(xycoords='data',textcoords='axes fraction',
          arrowprops=arrowprops, bbox=bbox_props, ha='right', va='top')
    
    ax.annotate(text, xy=(x_best, y_best), xytext=(0.99,1.1), **kw)


### Plot Perplexity test and training

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(10,5))

plot_topics_metric(
    axs[0], 
    train_perplexity_values,
    metric = 'Perplexity',
    label = 'Training Perplexity',
    color = color_perplexity,
    best_is_max = False
)
plot_topics_metric(
    axs[1], 
    test_perplexity_values,
    metric = 'Perplexity',
    label = 'Test Perplexity',
    color = color_perplexity,
    best_is_max = False
)

plt.show()
plt.close()

# END