## Topic models for tool  

 Here is code that creates topic models to be used for computing communities inside tool.  
 Five files are needed:
- topic_2.model -> smallest_count_vec
- topic_5.model -> less_count_vec
- topic_10.model -> less_count_vec
- topic_20.model -> less_count_vec
- topic_reduction.model (copy of topic_20.model)

In [1]:
# Imports
import numpy as np
import pandas as pd

import lzma
import pickle

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from collections import OrderedDict, Counter

import random

In [2]:
# Read data
responsories_all = pd.read_csv('../data/all-ci-responsories.csv', usecols=['cantus_id', 'incipit', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})
antiphons_all = pd.read_csv('../data/all-ci-antiphons.csv', usecols=['cantus_id', 'incipit', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})

sources = pd.read_csv('../data/sources-with-provenance-ids-and-two-centuries.csv', usecols=['provenance_id', 'drupal_path', 'siglum', 'cursus', 'num_century'])
feasts = pd.read_csv('../data/feast.csv', usecols=['id', 'name'])

chants = pd.concat([responsories_all, antiphons_all])

In [3]:
# Construct dict to index sources
source_dict = OrderedDict()
i = 0
for id in sources['drupal_path']:
    source_dict[id] = i
    i += 1

In [4]:
# Transform chant data into document like structure
source_all_chants_dict = {}
used_cantus_ids = []
for source_id in sources['drupal_path'].tolist():
    filt_source = chants['source_id'] == source_id
    used_cantus_ids += (chants[filt_source]['cantus_id']).tolist()
    source_all_chants_dict[source_id] = ' '.join((chants[filt_source]['cantus_id']).tolist())

print("Number of chants in our sources", len(used_cantus_ids))
freq_CIDs = Counter(used_cantus_ids)
print('Number of CIDs used in our sources', len(set(used_cantus_ids)))

Number of chants in our sources 362632
Number of CIDs used in our sources 17599


In [5]:
# Construct [sources x chants] matrix (document word matrix) for almost all data
all_count_vec = CountVectorizer(max_df=len(sources), min_df=0.0, token_pattern='\\b(\\w+[\\.:]?\\w+)\\b')
all_count_vec_data = all_count_vec.fit_transform(source_all_chants_dict.values())
print("Most wide [sources x chants] matrix shape:", all_count_vec_data.shape)
print("Loosing", len(set(used_cantus_ids)) - all_count_vec_data.shape[1], "CIDs")

Most wide [sources x chants] matrix shape: (250, 17599)
Loosing 0 CIDs


In [6]:
# Construct [sources x chants] matrix (document word matrix) for choosen data
# we use only words which are in 250 or less docs and also it at least two docs
less_count_vec = CountVectorizer(max_df=250, min_df=2, token_pattern='\\b(\\w+[\\.:]?\\w+)\\b')
less_count_vec_data = less_count_vec.fit_transform(source_all_chants_dict.values())
print("Less wide [sources x chants] matrix shape:", less_count_vec_data.shape)
print("Loosing", len(set(used_cantus_ids)) - less_count_vec_data.shape[1], "CIDs")

Less wide [sources x chants] matrix shape: (250, 10368)
Loosing 7231 CIDs


In [7]:
# Construct [sources x chants] matrix (document word matrix) for choosen data
# we use only words which are in 250 or less docs and also it at least eight docs
smallest_count_vec = CountVectorizer(max_df=250, min_df=8, token_pattern='\\b(\\w+[\\.:]?\\w+)\\b')
smallest_count_vec_data = smallest_count_vec.fit_transform(source_all_chants_dict.values())
print("The least wide [sources x chants] matrix shape:", smallest_count_vec_data.shape)
print("Loosing", len(set(used_cantus_ids)) - smallest_count_vec_data.shape[1], "CIDs")

The least wide [sources x chants] matrix shape: (250, 4924)
Loosing 12675 CIDs


### Creating models

#### Dimension reduction

In [13]:
# Model for dimension reduction and further counting distance between chants sets
# uses all_count_vec, max_iter = 40, evaluate_every = 1, 20 topics
random_state_red = [i for i in range(1, 21)]
random.seed(42)
compare_sources = random.sample(sources['drupal_path'].tolist(), 20)

perplexities_20 = {}
for i in range(10):
    print(i)
    model = LatentDirichletAllocation(n_components=20, evaluate_every=1, max_iter=40, random_state=random_state_red[i])
    model.fit(less_count_vec_data)
    compare_data = less_count_vec.transform([source_all_chants_dict[s] for s in compare_sources])
    perplexities_20[i] = model.perplexity(compare_data)

0
1
2
3
4
5
6
7
8
9


In [14]:
print(min(perplexities_20.values()))
print(perplexities_20)

31769.229719598163
{0: 39828.7980955025, 1: 36856.97205927947, 2: 33994.29272628802, 3: 33598.63686882714, 4: 44800.10066494636, 5: 38153.11834538742, 6: 31853.28652568756, 7: 31769.229719598163, 8: 41968.79895148937, 9: 34329.50208995581}


In [None]:
model_20 = LatentDirichletAllocation(n_components=20, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_red[7])
model_20.fit(less_count_vec_data)

In [15]:
#with lzma.open('topic_20.model', "wb") as model_file:
#    pickle.dump(all_count_vec, model_file)
#    pickle.dump(model_20, model_file)

In [16]:
#with lzma.open('topic_reduction.model', "wb") as model_file:
#    pickle.dump(all_count_vec, model_file)
#    pickle.dump(model_20, model_file)

#### 2 topics

In [None]:
random_state_2 = [i for i in range(1, 21)]
random.seed(42)
compare_sources = random.sample(sources['drupal_path'].tolist(), 20)

perplexities_2 = {}
for i in range(10):
    model = LatentDirichletAllocation(n_components=2, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_2[i])
    model.fit(smallest_count_vec_data)
    compare_data = smallest_count_vec.transform([source_all_chants_dict[s] for s in compare_sources])
    perplexities_2[i] = model.perplexity(compare_data)

In [9]:
print(min(perplexities_2.values()))
print(perplexities_2)

5236.587591633831
{0: 5243.6288023577345, 1: 5237.336296994617, 2: 5236.587591633831, 3: 5244.458623793637, 4: 5243.340633690007, 5: 5236.703858324916, 6: 5242.975047236425, 7: 5243.9313452122715, 8: 5243.016821268301, 9: 5243.149331856894}


In [None]:
model_2 = LatentDirichletAllocation(n_components=2, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_2[2])
model_2.fit(smallest_count_vec_data)

In [11]:
#with lzma.open('topic_2.model', "wb") as model_file:
#    pickle.dump(smallest_count_vec, model_file)
#    pickle.dump(model_2, model_file)

#### 5 topics

In [None]:
random_state_5 = [i for i in range(1, 21)]
random.seed(42)
compare_sources = random.sample(sources['drupal_path'].tolist(), 20)

perplexities_5 = {}
for i in range(10):
    model = LatentDirichletAllocation(n_components=5, verbose=1, evaluate_every=1, max_iter=40, random_state=random_state_5[i])
    model.fit(less_count_vec_data)
    compare_data = smallest_count_vec.transform([source_all_chants_dict[s] for s in compare_sources])
    perplexities_5[i] = model.perplexity(compare_data)

In [13]:
print(min(perplexities_5.values()))
print(perplexities_5)

51478.15555992069
{0: 51478.15555992069, 1: 52225.4696988275, 2: 58524.881422371705, 3: 57762.97239178083, 4: 54959.91432748364, 5: 61054.94187125619, 6: 52872.45259469045, 7: 58771.73939111505, 8: 56512.79218479323, 9: 58007.35859670519}


In [None]:
model_5 = LatentDirichletAllocation(verbose=1, n_components=5, max_iter=40, evaluate_every=1, random_state=random_state_5[0])
model_5.fit(smallest_count_vec_data)

In [15]:
#with lzma.open('topic_5.model', "wb") as model_file:
#    pickle.dump(smallest_count_vec, model_file)
#    pickle.dump(model_5, model_file)

#### 10 topics

In [None]:
random_state_10 = [i for i in range(1, 21)]
random.seed(42)
compare_sources = random.sample(sources['drupal_path'].tolist(), 20)

perplexities_10 = {}
for i in range(10):
    print(i)
    model = LatentDirichletAllocation(n_components=10, evaluate_every=1, max_iter=40, random_state=random_state_10[i])
    model.fit(less_count_vec_data)
    compare_data = less_count_vec.transform([source_all_chants_dict[s] for s in compare_sources])
    perplexities_10[i] = model.perplexity(compare_data)

In [10]:
print(min(perplexities_10.values()))
print(perplexities_10)

19283.908158166683
{0: 22221.28680063228, 1: 20350.356529784898, 2: 21678.813526315622, 3: 23243.557698737804, 4: 23610.793130490492, 5: 23379.85579399877, 6: 19978.44829975222, 7: 19487.23741825979, 8: 19283.908158166683, 9: 23271.235943281205}


In [None]:
model_10 = LatentDirichletAllocation(verbose=1, n_components=10, max_iter=40, evaluate_every=1, random_state=random_state_10[8])
model_10.fit(less_count_vec_data)

In [12]:
#with lzma.open('topic_10.model', "wb") as model_file:
#    pickle.dump(all_count_vec, model_file)
#    pickle.dump(model_10, model_file)