Notebook for training topic model on chant data  
That topic model will possibly be used for reduction of dimension of [sources x chants] matrix
and as projection to new space for distance counting

* sklearn.decomposition.LatentDirichletAlocation

In [1]:
# Imports
import numpy as np
import pandas as pd

import lzma
import pickle

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from collections import Counter

In [48]:
# Read data
responsories_all = pd.read_csv('../data/all-ci-responsories.csv', usecols=['cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})
antiphons_all = pd.read_csv('../data/all-ci-antiphons.csv', usecols=['cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})

sources = pd.read_csv('../data/sources-with-provenance-ids-and-two-centuries.csv', usecols=['provenance_id', 'drupal_path'])
feasts = pd.read_csv('../data/feast.csv', usecols=['id', 'name'])

chants = pd.concat([responsories_all, antiphons_all])

In [49]:
# Add info about cursus
row_sources = pd.read_csv('../data/sources-of-all-ci-antiphons_OPTIONAL-CENTURY.CSV', usecols=['drupal_path', 'cursus'])
row_sources['cursus'] = row_sources['cursus'].fillna('Unknown')
sources = pd.merge(left=sources, right=row_sources, how='inner')

791
791
224
224


In [3]:
# Transform chant data into document like structure
source_chants_dict = {}
for source_id in sources['drupal_path'].tolist():
    filt_source = chants['source_id'] == source_id
    source_chants_dict[source_id] = ' '.join((chants[filt_source]['cantus_id']).tolist())

In [4]:
# Construct [sources x chants] matrix (document word matrix)
count_vec = CountVectorizer(max_df=1, min_df=1)
count_vec_data = count_vec.fit_transform(source_chants_dict.values())


In [5]:
print(len(count_vec.get_feature_names_out()))

6886


In [6]:
# LDA for 5 topics
lda_model_5 = LatentDirichletAllocation(n_components=5)
lda_model_5.fit(count_vec_data)

In [7]:
for index,topic in enumerate(lda_model_5.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([count_vec.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['205119', '203951', '601870', '204035', '207066', '603121', '205070', 'a02290', 'a01486', '203860', '205835', '602783', '006073', '200286', '205823']


THE TOP 15 WORDS FOR TOPIC #1
['602109', '600965', '601181', '202151', '602220', '200850', '200233', '600224', '203036', '601942', '204430', '200234', '204432', '204543', '204431']


THE TOP 15 WORDS FOR TOPIC #2
['007610', '204746', '206803', '605055', '203286', '601044', '007845', '002124', '206671', '603053', '206660', '205089', '601865', '201212', '206607']


THE TOP 15 WORDS FOR TOPIC #3
['202509', '006134', '602402', '201438', '006085', 'a06353', '204169', '601789', '007602', '601397', '601629', '201896', '602354', '201728', 'a01487']


THE TOP 15 WORDS FOR TOPIC #4
['600128', '600047', 'a07394', 'a07395', '006247', 'a07557', 'a07619', 'a07553', 'a07551', '202324', '201817', '202469', '002905', 'a07552', '600714']




In [8]:
# Get results out
topic_results_5 = lda_model_5.transform(count_vec_data)
topics_5 = topic_results_5.argmax(axis=1)
print(topics_5)

[2 2 0 1 0 0 4 0 4 0 0 4 3 1 3 3 1 2 1 4 3 2 0 4 0 0 0 1 0 3 0 4 0 4 3 0 0
 2 1 0 4 3 4 3 1 3 4 0 4 1 2 2 3 0 0 4 1 4 0 4 3 2 0 0 4 2 1 1 2 4 1 2 3 2
 4 1 4 0 3 0 0 0 4 0 0 1 0 2 2 3 1 1 0 0 1 3 0 0 1 0 2 2 3 0 0 3 0 4 0 4 2
 0 0 0 0 2 4 0 0 4 1 0 0 1 0 1 1 0 0 3 3 1 3 0 0 0 0 4 0 1 2 1 2 2 1 2 1 3
 4 1 4 2 2 1 3 2 2 0 4 0 0 0 0 1 3 0 1 0 4 3 0 1 0 0 0 0 4 3 3 0 3 2 2 0 1
 1 1 0 0 0 0 2 0 4 3 2 0 3 0 4 4 4 0 2 1 0 1 3 0 3 1 4 4 4 0 0 1 0 1 4 3 4
 2 0]


In [24]:
# Labels distribution
freq_5 = Counter(topics_5)
print(freq_5.most_common())

[(0, 82), (1, 40), (4, 38), (2, 32), (3, 32)]


In [10]:
MODEL_FILE = 'topic_model_5.model'

In [11]:
# Serialize our models
with lzma.open(MODEL_FILE, "wb") as model_file:
    pickle.dump(count_vec, model_file)
    pickle.dump(lda_model_5, model_file)

In [12]:
# Deserialization
with lzma.open(MODEL_FILE, "rb") as model_file:
    trans = pickle.load(model_file)
    model = pickle.load(model_file)

In [13]:
# LDA for 20 topics
lda_model_20 = LatentDirichletAllocation(n_components=20)
lda_model_20.fit(count_vec_data)

In [66]:
# Get results out
topic_results_20 = lda_model_20.transform(count_vec_data)
topics_20 = topic_results_20.argmax(axis=1)
print(topics_20)

[14 10  9  8 19  4 19  0  1  0 18  7  2 17 12  4 19  6  0 10  4 10  0 13
  0  0  0  1  2 11  0  4  9 11  3  1  0 12 13  9 15 11 12  1 18  9 17  2
 17  4  3  6 12 12 14 13 11 17  0 19 12  6  0  4 13  1  3 17 18  5 13 15
 16  7 17 17 15  0 14 19  0  5 14  0 15 18 10 16  8 11 11 19 15  8 11  1
 13  0  0 18 15 15 11 10  5  0  5  2  0 14  8  0  0 15  0 11  7  0  0 11
 14  0  0 16  2  5 15  8  7 17  4  3  5  0  0 10  0 13  7 19  7 15 14  1
 11 14 13 12  1 15  2  5 19 18  7 10 12  0  2  0  0  0  0  7 11  5  4  0
 10  0  0 16  7  0  0  0  5  6 13  4 15  0 15  0 13  0 15  0  0  0  0 13
  0  9  2 13  0 10  0 17 17 14  4  6 10 16  6 17  0 10  6  5 17  5  0  0
 12  0  6 16 17  7 13  3]


In [30]:
# Print the topics with their terms
lda_20_components = lda_model_20.components_
terms = count_vec.get_feature_names_out() 
for index, component in enumerate(lda_20_components):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:5]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['601942', '600965', '601181', '602109', '201775']
Topic 1:  ['002124', '007845', '007610', '202010', '204746']
Topic 2:  ['a07394', 'a07395', '206496', '605028', 'a01217']
Topic 3:  ['203655', '204837', '205007', '601234', '601580']
Topic 4:  ['600714', '201817', '202469', '204169', '200316']
Topic 5:  ['200548', '602029', 'a06537', 'a06575', 'a07104']
Topic 6:  ['601865', '201016', '200588', '201463', '202351']
Topic 7:  ['206607', '206660', '206671', '603053', '206803']
Topic 8:  ['201316', '204418', '601501', '601713', 'a05954']
Topic 9:  ['200286', '203860', '200167', '200202', '200227']
Topic 10:  ['205089', '203214', '204147', '204260', 'a06353']
Topic 11:  ['a01487', '007602', '201728', '201896', '601397']
Topic 12:  ['201212', '601044', '002905', '005201', '202932']
Topic 13:  ['202324', '206293', '206294', '206295', '206296']
Topic 14:  ['203286', 'a07552', 'a07551', 'a07553', 'a07557']
Topic 15:  ['006073', 'a02055', 'a02082', '006303', 'a07245']
Topic 16:  ['00129

In [23]:
# Labels distribution
freq_20 = Counter(topics_20)
print(freq_20.most_common())

[(0, 53), (15, 14), (17, 13), (13, 13), (11, 12), (10, 11), (5, 11), (4, 10), (7, 10), (14, 9), (12, 9), (19, 8), (1, 8), (2, 8), (6, 8), (18, 6), (16, 6), (9, 5), (8, 5), (3, 5)]


In [22]:
# LDA for 2 topics
lda_model_2 = LatentDirichletAllocation(n_components=2)
lda_model_2.fit(count_vec_data)

In [59]:
# Get results out
topic_results_2 = lda_model_2.transform(count_vec_data)
topics_2 = topic_results_2.argmax(axis=1)
print(topics_2)

[0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 1 0 0
 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1
 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 0 1 1 0
 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 1
 1 0 0 0 1 1 1 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 0
 0 0]


In [58]:
labeled_docs = [[], []]
for labeled in zip(topics_2, sources['drupal_path'].tolist()):
    labeled_docs[labeled[0]].append(sources[sources['drupal_path'] == labeled[1]]['cursus'].values[0])

In [57]:
print(len(labeled_docs[0]), Counter(labeled_docs[0]).most_common())
print(len(labeled_docs[1]), Counter(labeled_docs[1]).most_common())

139 [('Secular', 66), ('Unknown', 51), ('Monastic', 20), ('Romanum', 2)]
85 [('Secular', 40), ('Unknown', 25), ('Monastic', 19), ('Romanum', 1)]
