## Topic models for the tool  

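 This notebook creates the topic models that the tool uses for computing communities.  
 Five files are needed, all of them written by the cells below:
- topic_2.model
- topic_5.model
- topic_10.model
- topic_20.model
- topic_reduction.model (the same vectorizer and 20-topic model as topic_20.model)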

In [11]:
# Imports
import numpy as np
import pandas as pd

import lzma
import pickle

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from collections import OrderedDict, Counter

import random

In [2]:
# Read data
# Both chant tables are read with the same column subset; cantus_id is read as str.
chant_csv_options = dict(
    usecols=['cantus_id', 'incipit', 'siglum', 'source_id', 'feast_id'],
    dtype={'cantus_id': "str"},
)
responsories_all = pd.read_csv('../data/all-ci-responsories.csv', **chant_csv_options)
antiphons_all = pd.read_csv('../data/all-ci-antiphons.csv', **chant_csv_options)

# Source metadata and feast names
sources = pd.read_csv(
    '../data/sources-with-provenance-ids-and-two-centuries.csv',
    usecols=['provenance_id', 'drupal_path', 'siglum', 'cursus', 'num_century'],
)
feasts = pd.read_csv(
    '../data/feast.csv',
    usecols=['id', 'name'],
)

# Responsories followed by antiphons in one table (the row index is not reset)
chants = pd.concat([responsories_all, antiphons_all])

In [5]:
# Construct dict to index sources: drupal_path -> position of that path in
# sources['drupal_path'] (0-based). If a path occurs more than once, the last
# position is kept.
# Built with enumerate so that no counter or loop variable leaks into the
# notebook namespace (the previous loop variable shadowed the builtin `id`).
source_dict = OrderedDict(
    (drupal_path, position)
    for position, drupal_path in enumerate(sources['drupal_path'])
)

In [7]:
# Transform chant data into a document-like structure:
# every source becomes one string of space-separated cantus_ids.
source_all_chants_dict = {}
used_cantus_ids = []
for drupal_path in sources['drupal_path'].tolist():
    # cantus_ids of all chants whose source_id equals this source's drupal_path
    is_from_source = chants['source_id'] == drupal_path
    source_cids = chants[is_from_source]['cantus_id'].tolist()
    used_cantus_ids.extend(source_cids)
    source_all_chants_dict[drupal_path] = ' '.join(source_cids)

# Occurrence count of every cantus_id over all collected chants
freq_CIDs = Counter(used_cantus_ids)

print("Number of chants in our sources", len(used_cantus_ids))
print('Number of CIDs used in our sources', len(set(used_cantus_ids)))

Number of chants in our sources 362632
Number of CIDs used in our sources 17599


In [8]:
# Construct the widest [sources x chants] matrix (document-word matrix).
# max_df is given as an absolute document count and min_df=0.0 sets no lower
# bound, so no token is removed because of its document frequency.
# NOTE(review): CountVectorizer's default tokenization lowercases the text and
# keeps only tokens of two or more word characters, so cantus_ids containing
# punctuation are presumably split or merged - likely the reason some CIDs are
# still "lost" below; confirm before relying on a 1:1 CID <-> column mapping.
all_count_vec = CountVectorizer(
    max_df=len(sources),
    min_df=0.0,
)
all_count_vec_data = all_count_vec.fit_transform(source_all_chants_dict.values())

# freq_CIDs has one key per distinct cantus_id in used_cantus_ids
n_lost_cids = len(freq_CIDs) - all_count_vec_data.shape[1]
print("Most wide [sources x chants] matrix shape:", all_count_vec_data.shape)
print("Loosing", n_lost_cids, "CIDs")

Most wide [sources x chants] matrix shape: (250, 17217)
Loosing 382 CIDs


In [9]:
# Construct a narrower [sources x chants] matrix (document-word matrix):
# keep only tokens that occur in at most 250 documents and in at least
# two documents (both thresholds are absolute document counts).
less_count_vec = CountVectorizer(
    max_df=250,
    min_df=2,
)
less_count_vec_data = less_count_vec.fit_transform(source_all_chants_dict.values())

# freq_CIDs has one key per distinct cantus_id in used_cantus_ids
n_lost_cids = len(freq_CIDs) - less_count_vec_data.shape[1]
print("Less wide [sources x chants] matrix shape:", less_count_vec_data.shape)
print("Loosing", n_lost_cids, "CIDs")

Less wide [sources x chants] matrix shape: (250, 10182)
Loosing 7417 CIDs


In [10]:
# Construct the narrowest [sources x chants] matrix (document-word matrix):
# keep only tokens that occur in at most 250 documents and in at least
# eight documents (both thresholds are absolute document counts).
smallest_count_vec = CountVectorizer(
    max_df=250,
    min_df=8,
)
smallest_count_vec_data = smallest_count_vec.fit_transform(source_all_chants_dict.values())

# freq_CIDs has one key per distinct cantus_id in used_cantus_ids
n_lost_cids = len(freq_CIDs) - smallest_count_vec_data.shape[1]
print("The least wide [sources x chants] matrix shape:", smallest_count_vec_data.shape)
print("Loosing", n_lost_cids, "CIDs")

The least wide [sources x chants] matrix shape: (250, 4925)
Loosing 12674 CIDs


### Creating models

#### Dimension reduction

In [14]:
# Seed search for the model used for dimension reduction and, further on, for
# computing distances between chant sets.
# Uses all_count_vec, 20 topics, max_iter = 40, evaluate_every = 1.
random_state_red = list(range(1, 21))  # candidate seeds; only the first 10 are tried below
random.seed(42)
compare_sources = random.sample(sources['drupal_path'].tolist(), 20)

# The comparison matrix does not depend on the fitted model, so it is built
# once here instead of being rebuilt on every iteration.
# NOTE(review): compare_sources are sampled from the same sources the models
# are fitted on, so this is not a held-out evaluation set.
compare_data = all_count_vec.transform([source_all_chants_dict[s] for s in compare_sources])

# run index -> perplexity of that run's model on compare_data
perplexities_20 = {}
for i in range(10):
    model = LatentDirichletAllocation(n_components=20, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_red[i])
    model.fit(all_count_vec_data)
    perplexities_20[i] = model.perplexity(compare_data)

iteration: 1 of max_iter: 40, perplexity: 14723.2959
iteration: 2 of max_iter: 40, perplexity: 6959.8877
iteration: 3 of max_iter: 40, perplexity: 5819.2489
iteration: 4 of max_iter: 40, perplexity: 5418.7358
iteration: 5 of max_iter: 40, perplexity: 5182.2591
iteration: 6 of max_iter: 40, perplexity: 5043.8578
iteration: 7 of max_iter: 40, perplexity: 4960.7202
iteration: 8 of max_iter: 40, perplexity: 4902.9725
iteration: 9 of max_iter: 40, perplexity: 4858.0262
iteration: 10 of max_iter: 40, perplexity: 4819.1409
iteration: 11 of max_iter: 40, perplexity: 4786.1536
iteration: 12 of max_iter: 40, perplexity: 4757.8748
iteration: 13 of max_iter: 40, perplexity: 4733.0729
iteration: 14 of max_iter: 40, perplexity: 4711.4471
iteration: 15 of max_iter: 40, perplexity: 4692.2730
iteration: 16 of max_iter: 40, perplexity: 4675.3656
iteration: 17 of max_iter: 40, perplexity: 4660.6230
iteration: 18 of max_iter: 40, perplexity: 4648.1186
iteration: 19 of max_iter: 40, perplexity: 4636.6838
i

In [20]:
# Lowest perplexity of the 20-topic seed search, then the full {run index: perplexity} mapping
print(min(perplexities_20.values()), perplexities_20, sep="\n")

50671.66685336492
{0: 68941.3903679552, 1: 56462.1555912349, 2: 57817.28057275, 3: 62472.2605989467, 4: 56058.7955533459, 5: 55256.06188335232, 6: 51236.081307034394, 7: 50671.66685336492, 8: 52963.240551489755, 9: 70547.5488082292}


In [22]:
# Refit the 20-topic model with the seed of the run that reached the lowest
# perplexity in the seed search. The run index is derived from perplexities_20
# instead of being hard-coded (it was fixed to 7, the minimum of the recorded
# run), so the choice stays correct when the search is re-run.
best_run_20 = min(perplexities_20, key=perplexities_20.get)
model_20 = LatentDirichletAllocation(n_components=20, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_red[best_run_20])
model_20.fit(all_count_vec_data)

iteration: 1 of max_iter: 40, perplexity: 14172.9108
iteration: 2 of max_iter: 40, perplexity: 6402.0810
iteration: 3 of max_iter: 40, perplexity: 5514.1683
iteration: 4 of max_iter: 40, perplexity: 5191.7340
iteration: 5 of max_iter: 40, perplexity: 5027.2402
iteration: 6 of max_iter: 40, perplexity: 4929.9603
iteration: 7 of max_iter: 40, perplexity: 4864.2240
iteration: 8 of max_iter: 40, perplexity: 4819.3117
iteration: 9 of max_iter: 40, perplexity: 4786.1268
iteration: 10 of max_iter: 40, perplexity: 4758.6969
iteration: 11 of max_iter: 40, perplexity: 4735.1273
iteration: 12 of max_iter: 40, perplexity: 4716.0356
iteration: 13 of max_iter: 40, perplexity: 4700.0971
iteration: 14 of max_iter: 40, perplexity: 4685.3891
iteration: 15 of max_iter: 40, perplexity: 4671.1974
iteration: 16 of max_iter: 40, perplexity: 4657.6745
iteration: 17 of max_iter: 40, perplexity: 4645.4670
iteration: 18 of max_iter: 40, perplexity: 4633.9463
iteration: 19 of max_iter: 40, perplexity: 4622.9689
i

In [23]:
# Write the fitted vectorizer and then the 20-topic model as two consecutive
# pickles into one LZMA-compressed file; readers have to load them in this order.
with lzma.open('topic_20.model', "wb") as model_file:
    for artefact in (all_count_vec, model_20):
        pickle.dump(artefact, model_file)

In [24]:
# Write the same two objects (vectorizer first, then the 20-topic model) under
# the file name used for the dimension-reduction model.
with lzma.open('topic_reduction.model', "wb") as model_file:
    for artefact in (all_count_vec, model_20):
        pickle.dump(artefact, model_file)

#### 2 topics

In [26]:
# Seed search for the 2-topic model.
# Uses smallest_count_vec, 2 topics, max_iter = 40, evaluate_every = 1.
random_state_2 = list(range(1, 21))  # candidate seeds; only the first 10 are tried below
random.seed(42)
compare_sources = random.sample(sources['drupal_path'].tolist(), 20)

# The comparison matrix does not depend on the fitted model, so it is built
# once here instead of being rebuilt on every iteration.
# NOTE(review): compare_sources are sampled from the same sources the models
# are fitted on, so this is not a held-out evaluation set.
compare_data = smallest_count_vec.transform([source_all_chants_dict[s] for s in compare_sources])

# run index -> perplexity of that run's model on compare_data
perplexities_2 = {}
for i in range(10):
    model = LatentDirichletAllocation(n_components=2, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_2[i])
    model.fit(smallest_count_vec_data)
    perplexities_2[i] = model.perplexity(compare_data)

iteration: 1 of max_iter: 40, perplexity: 3758.4620
iteration: 2 of max_iter: 40, perplexity: 3475.2840
iteration: 3 of max_iter: 40, perplexity: 3339.1514
iteration: 4 of max_iter: 40, perplexity: 3281.5652
iteration: 5 of max_iter: 40, perplexity: 3251.9288
iteration: 6 of max_iter: 40, perplexity: 3235.2517
iteration: 7 of max_iter: 40, perplexity: 3225.4751
iteration: 8 of max_iter: 40, perplexity: 3219.5626
iteration: 9 of max_iter: 40, perplexity: 3215.9005
iteration: 10 of max_iter: 40, perplexity: 3213.6094
iteration: 11 of max_iter: 40, perplexity: 3212.1545
iteration: 12 of max_iter: 40, perplexity: 3211.2032
iteration: 13 of max_iter: 40, perplexity: 3210.5549
iteration: 14 of max_iter: 40, perplexity: 3210.0922
iteration: 15 of max_iter: 40, perplexity: 3209.7464
iteration: 16 of max_iter: 40, perplexity: 3209.4774
iteration: 17 of max_iter: 40, perplexity: 3209.2616
iteration: 18 of max_iter: 40, perplexity: 3209.0842
iteration: 19 of max_iter: 40, perplexity: 3208.9359
it

In [27]:
# Lowest perplexity of the 2-topic seed search, then the full {run index: perplexity} mapping
print(min(perplexities_2.values()), perplexities_2, sep="\n")

5238.684242074966
{0: 5246.931198293515, 1: 5246.441646363305, 2: 5240.825302730147, 3: 5247.713893560956, 4: 5238.684242074966, 5: 5239.304627888648, 6: 5247.126463549619, 7: 5239.580111133097, 8: 5246.725748242739, 9: 5246.298069758841}


In [32]:
# Refit the 2-topic model with the seed of the run that reached the lowest
# perplexity in the seed search. The run index is derived from perplexities_2
# instead of being hard-coded (it was fixed to 4, the minimum of the recorded
# run), so the choice stays correct when the search is re-run.
best_run_2 = min(perplexities_2, key=perplexities_2.get)
model_2 = LatentDirichletAllocation(n_components=2, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_2[best_run_2])
model_2.fit(smallest_count_vec_data)

iteration: 1 of max_iter: 40, perplexity: 3761.6813
iteration: 2 of max_iter: 40, perplexity: 3498.0755
iteration: 3 of max_iter: 40, perplexity: 3348.6009
iteration: 4 of max_iter: 40, perplexity: 3288.5195
iteration: 5 of max_iter: 40, perplexity: 3257.9340
iteration: 6 of max_iter: 40, perplexity: 3240.4411
iteration: 7 of max_iter: 40, perplexity: 3229.8207
iteration: 8 of max_iter: 40, perplexity: 3223.1764
iteration: 9 of max_iter: 40, perplexity: 3218.9323
iteration: 10 of max_iter: 40, perplexity: 3216.1622
iteration: 11 of max_iter: 40, perplexity: 3214.3050
iteration: 12 of max_iter: 40, perplexity: 3213.0275
iteration: 13 of max_iter: 40, perplexity: 3212.1241
iteration: 14 of max_iter: 40, perplexity: 3211.4639
iteration: 15 of max_iter: 40, perplexity: 3210.9634
iteration: 16 of max_iter: 40, perplexity: 3210.5714
iteration: 17 of max_iter: 40, perplexity: 3210.2561
iteration: 18 of max_iter: 40, perplexity: 3209.9974
iteration: 19 of max_iter: 40, perplexity: 3209.7819
it

In [33]:
# Write the fitted vectorizer and then the 2-topic model as two consecutive
# pickles into one LZMA-compressed file; readers have to load them in this order.
with lzma.open('topic_2.model', "wb") as model_file:
    for artefact in (smallest_count_vec, model_2):
        pickle.dump(artefact, model_file)

#### 5 topics

In [30]:
# Seed search for the 5-topic model.
# Uses smallest_count_vec, 5 topics, max_iter = 40, evaluate_every = 1.
random_state_5 = list(range(1, 21))  # candidate seeds; only the first 10 are tried below
random.seed(42)
compare_sources = random.sample(sources['drupal_path'].tolist(), 20)

# The comparison matrix does not depend on the fitted model, so it is built
# once here instead of being rebuilt on every iteration.
# NOTE(review): compare_sources are sampled from the same sources the models
# are fitted on, so this is not a held-out evaluation set.
compare_data = smallest_count_vec.transform([source_all_chants_dict[s] for s in compare_sources])

# run index -> perplexity of that run's model on compare_data
perplexities_5 = {}
for i in range(10):
    model = LatentDirichletAllocation(n_components=5, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_5[i])
    model.fit(smallest_count_vec_data)
    perplexities_5[i] = model.perplexity(compare_data)

iteration: 1 of max_iter: 40, perplexity: 4078.6343
iteration: 2 of max_iter: 40, perplexity: 3495.8005
iteration: 3 of max_iter: 40, perplexity: 3272.2593
iteration: 4 of max_iter: 40, perplexity: 3192.9319
iteration: 5 of max_iter: 40, perplexity: 3152.3063
iteration: 6 of max_iter: 40, perplexity: 3127.6380
iteration: 7 of max_iter: 40, perplexity: 3111.2907
iteration: 8 of max_iter: 40, perplexity: 3099.3121
iteration: 9 of max_iter: 40, perplexity: 3089.4704
iteration: 10 of max_iter: 40, perplexity: 3080.9190
iteration: 11 of max_iter: 40, perplexity: 3072.9314
iteration: 12 of max_iter: 40, perplexity: 3065.0279
iteration: 13 of max_iter: 40, perplexity: 3056.8814
iteration: 14 of max_iter: 40, perplexity: 3049.1309
iteration: 15 of max_iter: 40, perplexity: 3041.7710
iteration: 16 of max_iter: 40, perplexity: 3034.6918
iteration: 17 of max_iter: 40, perplexity: 3027.8952
iteration: 18 of max_iter: 40, perplexity: 3021.6124
iteration: 19 of max_iter: 40, perplexity: 3016.0197
it

In [31]:
# Lowest perplexity of the 5-topic seed search, then the full {run index: perplexity} mapping
print(min(perplexities_5.values()), perplexities_5, sep="\n")

6888.239625476936
{0: 7218.211843482752, 1: 6888.239625476936, 2: 7582.03236875475, 3: 7089.943291275749, 4: 7159.575814101979, 5: 7065.411632134728, 6: 7638.453724089115, 7: 7005.880118646162, 8: 7328.532847699104, 9: 6947.402025717157}


In [34]:
# Refit the 5-topic model with the seed of the run that reached the lowest
# perplexity in the seed search. The run index is derived from perplexities_5
# instead of being hard-coded (it was fixed to 1, the minimum of the recorded
# run), so the choice stays correct when the search is re-run.
best_run_5 = min(perplexities_5, key=perplexities_5.get)
model_5 = LatentDirichletAllocation(verbose=1, n_components=5, max_iter=40, evaluate_every=1, random_state=random_state_5[best_run_5])
model_5.fit(smallest_count_vec_data)

iteration: 1 of max_iter: 40, perplexity: 4098.7233
iteration: 2 of max_iter: 40, perplexity: 3451.4506
iteration: 3 of max_iter: 40, perplexity: 3250.9777
iteration: 4 of max_iter: 40, perplexity: 3174.8376
iteration: 5 of max_iter: 40, perplexity: 3125.2608
iteration: 6 of max_iter: 40, perplexity: 3089.8911
iteration: 7 of max_iter: 40, perplexity: 3064.4630
iteration: 8 of max_iter: 40, perplexity: 3045.9158
iteration: 9 of max_iter: 40, perplexity: 3032.0284
iteration: 10 of max_iter: 40, perplexity: 3021.6253
iteration: 11 of max_iter: 40, perplexity: 3013.9221
iteration: 12 of max_iter: 40, perplexity: 3008.2128
iteration: 13 of max_iter: 40, perplexity: 3003.8273
iteration: 14 of max_iter: 40, perplexity: 3000.3021
iteration: 15 of max_iter: 40, perplexity: 2997.3374
iteration: 16 of max_iter: 40, perplexity: 2994.8058
iteration: 17 of max_iter: 40, perplexity: 2992.6580
iteration: 18 of max_iter: 40, perplexity: 2990.8058
iteration: 19 of max_iter: 40, perplexity: 2989.2475
it

In [35]:
# Write the fitted vectorizer and then the 5-topic model as two consecutive
# pickles into one LZMA-compressed file; readers have to load them in this order.
with lzma.open('topic_5.model', "wb") as model_file:
    for artefact in (smallest_count_vec, model_5):
        pickle.dump(artefact, model_file)

#### 10 topics

In [36]:
# Seed search for the 10-topic model.
# Uses all_count_vec, 10 topics, max_iter = 40, evaluate_every = 1.
random_state_10 = list(range(1, 21))  # candidate seeds; only the first 10 are tried below
random.seed(42)
compare_sources = random.sample(sources['drupal_path'].tolist(), 20)

# The comparison matrix does not depend on the fitted model, so it is built
# once here instead of being rebuilt on every iteration.
# NOTE(review): compare_sources are sampled from the same sources the models
# are fitted on, so this is not a held-out evaluation set.
compare_data = all_count_vec.transform([source_all_chants_dict[s] for s in compare_sources])

# run index -> perplexity of that run's model on compare_data
perplexities_10 = {}
for i in range(10):
    model = LatentDirichletAllocation(n_components=10, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_10[i])
    model.fit(all_count_vec_data)
    perplexities_10[i] = model.perplexity(compare_data)

iteration: 1 of max_iter: 40, perplexity: 8667.3210
iteration: 2 of max_iter: 40, perplexity: 5982.8738
iteration: 3 of max_iter: 40, perplexity: 5291.9804
iteration: 4 of max_iter: 40, perplexity: 5039.2865
iteration: 5 of max_iter: 40, perplexity: 4919.1511
iteration: 6 of max_iter: 40, perplexity: 4844.8331
iteration: 7 of max_iter: 40, perplexity: 4789.3444
iteration: 8 of max_iter: 40, perplexity: 4742.0891
iteration: 9 of max_iter: 40, perplexity: 4703.1405
iteration: 10 of max_iter: 40, perplexity: 4674.2048
iteration: 11 of max_iter: 40, perplexity: 4654.0379
iteration: 12 of max_iter: 40, perplexity: 4639.2225
iteration: 13 of max_iter: 40, perplexity: 4627.8718
iteration: 14 of max_iter: 40, perplexity: 4618.9106
iteration: 15 of max_iter: 40, perplexity: 4611.3047
iteration: 16 of max_iter: 40, perplexity: 4604.6420
iteration: 17 of max_iter: 40, perplexity: 4598.8926
iteration: 18 of max_iter: 40, perplexity: 4593.7089
iteration: 19 of max_iter: 40, perplexity: 4588.8356
it

In [38]:
# Lowest perplexity of the 10-topic seed search, then the full {run index: perplexity} mapping
print(min(perplexities_10.values()), perplexities_10, sep="\n")

32454.00100416716
{0: 36717.773841535694, 1: 38346.96888843639, 2: 33802.883321747344, 3: 32701.389209965633, 4: 32454.00100416716, 5: 39020.47740794555, 6: 32782.88021174183, 7: 33098.64514322037, 8: 33206.062298654455, 9: 36838.030473124265}


In [39]:
# Refit the 10-topic model with the seed of the run that reached the lowest
# perplexity in the seed search. The run index is derived from perplexities_10
# instead of being hard-coded (it was fixed to 4, the minimum of the recorded
# run), so the choice stays correct when the search is re-run.
best_run_10 = min(perplexities_10, key=perplexities_10.get)
model_10 = LatentDirichletAllocation(verbose=1, n_components=10, max_iter=40, evaluate_every=1, random_state=random_state_10[best_run_10])
model_10.fit(all_count_vec_data)

iteration: 1 of max_iter: 40, perplexity: 8594.3044
iteration: 2 of max_iter: 40, perplexity: 5925.6052
iteration: 3 of max_iter: 40, perplexity: 5315.3832
iteration: 4 of max_iter: 40, perplexity: 5068.9928
iteration: 5 of max_iter: 40, perplexity: 4935.5650
iteration: 6 of max_iter: 40, perplexity: 4849.7713
iteration: 7 of max_iter: 40, perplexity: 4788.5546
iteration: 8 of max_iter: 40, perplexity: 4744.5983
iteration: 9 of max_iter: 40, perplexity: 4715.5858
iteration: 10 of max_iter: 40, perplexity: 4696.8813
iteration: 11 of max_iter: 40, perplexity: 4684.4792
iteration: 12 of max_iter: 40, perplexity: 4675.0044
iteration: 13 of max_iter: 40, perplexity: 4667.6549
iteration: 14 of max_iter: 40, perplexity: 4661.9138
iteration: 15 of max_iter: 40, perplexity: 4657.0133
iteration: 16 of max_iter: 40, perplexity: 4652.7678
iteration: 17 of max_iter: 40, perplexity: 4649.0150
iteration: 18 of max_iter: 40, perplexity: 4645.6545
iteration: 19 of max_iter: 40, perplexity: 4642.5736
it

In [40]:
# Write the fitted vectorizer and then the 10-topic model as two consecutive
# pickles into one LZMA-compressed file; readers have to load them in this order.
with lzma.open('topic_10.model', "wb") as model_file:
    for artefact in (all_count_vec, model_10):
        pickle.dump(artefact, model_file)