### Example of SuperCTM Model
- In SuperCTM, you can label the documents to aid the clustering
- One way would be to utilise the pre-provided categories (used here)
- One could also label a subset of data by hand as a seed data

### CTM Model Training

In [1]:
### GENERAL VARIABLES:
sample_size = 100
epochs = 10

#What categories to use, either main/10 or sub_cat/43 if you want proper fit for comparison
#This is really for the extra stuff at the end
topics = 10
compare_to = "main_category" #main_category#sub_category

In [2]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
import pandas as pd

In [3]:
import sys
sys.path.append("../")
from src import puhti_files

In [4]:
### Load in data
df = puhti_files.genre_data_to_pandas(data="train", add_labels=True, merge_ecco=True, better_subcat_names=True)

Read in dataset train.csv. Set param 'data' to 'dev', 'test' or 'train' if you want another dataset.


In [5]:
#Sample data, ensure that every category is represented

import math
n_categories = len(df[compare_to].unique())
take_per_category = math.ceil(sample_size / n_categories)


# Group by sub_category and take one random sample from each group
df = df.groupby(compare_to, group_keys=False).apply(lambda x: x.sample(take_per_category, replace=True))
df = df.drop_duplicates()

print(len(df[compare_to].unique()))
print(len(df))

10
100


In [6]:
### Read in texts

df["text"] = df["document_id"].apply(lambda x: puhti_files.read_text_file(x))
df.head(3)

Unnamed: 0,document_id,work_id,main_category,sub_category,main_category_label,sub_category_label,publication_year,gatherings,total_price,publication_place,author_id,other_actors,text
23320,569102600,10017-battle of hastings,0,0,Arts,"Theatre, plays, opera",1778,12mo,,Dublin,76451457,robertmarchbank_2,"TIHE //""/*: - /s/ /'\nBattle of Hastings,\n\nA..."
1494,1096600100,9-hymns,0,1,Arts,"Music, hymns, songs",1790,12mo,,London,19686646,68548720,IH Y N S\n\nO F\nINTE RCESSION\n\nFOR\nALL MAN...
6931,733800500,513-funeral or grief a-la-mode,0,0,Arts,"Theatre, plays, opera",1790,12mo,,London,22167754,NV4420,"THE FUtNERAL;\n\nGRIEF A-L4- AI D R,\n\nC OMED..."


In [7]:
### Create dataset with splitted documents. Document_id is kept for later merge.

def split_text(text, num_words):
    words = text.split()
    chunks = [' '.join(words[i:i + num_words]) for i in range(0, len(words), num_words)]
    return chunks

num_words = 128
split_data = []
for _, row in df.iterrows():
    doc_id = row['document_id']
    main_category_label = row["main_category_label"]
    sub_category_label = row["sub_category_label"]
    
    text = row['text']
    chunks = split_text(text, num_words)
    
    for chunk in chunks:
        split_data.append({
            'document_id': doc_id,
            "main_category_label": main_category_label,
            "sub_category_label": sub_category_label,
            'text': chunk
        })

split_df = pd.DataFrame(split_data)
len(split_df)

66782

In [9]:
from nltk.corpus import stopwords as stop_words
nltk.download('stopwords')
stopwords = list(stop_words.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/tturpein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
sp = WhiteSpacePreprocessingStopwords(list(split_df["text"]), stopwords_list=stopwords, vocabulary_size=2000)#vocabulary_size=2000
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()
print(len(preprocessed_documents))
print(len(unpreprocessed_corpus))

66765
66765


In [11]:
# We use "_label" column as labels.
# I'm not sure if the actual names really matter. Could be that you can use numeric ones.
labels = split_df[compare_to + "_label"].to_numpy()

In [12]:
#We have to drop the labels for the rows that the preprocessing drops
labels = labels[retained_indices]
labels = list(labels)
len(labels)

66765

In [14]:
tp = TopicModelDataPreparation("paraphrase-multilingual-mpnet-base-v2")

training_dataset = tp.fit(
    text_for_contextual = unpreprocessed_corpus,
    text_for_bow = preprocessed_documents,
    labels = labels
)

Batches:   0%|          | 0/334 [00:00<?, ?it/s]

In [15]:
### Checking how labels are encoded
### They should be one-hotted
training_dataset.__getitem__(0)["labels"]

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [16]:
### Training the model with labels

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Gets rid of warnings

ctm = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    n_components=topics,
    num_epochs=epochs,
    label_size=len(set(labels))
)

In [17]:
ctm.fit(training_dataset)

Epoch: [10/10]	 Seen Samples: [667520/667650]	Train Loss: 246.76384887987908	Time: 0:00:13.549540: : 10it [02:19, 13.92s/it]
100%|██████████| 1044/1044 [00:11<00:00, 92.58it/s]


In [18]:
### Topic predictions as percentages for each topic for each document chunk

topics_predictions = ctm.get_thetas(training_dataset, n_samples=5)
topics_predictions.shape

100%|██████████| 1044/1044 [00:12<00:00, 83.29it/s] 


(66765, 10)

### Comparing to premade categories
- The rest of the code is just a accuracy comparison to the older categories

In [19]:
### Adding the predictions to a dataframe
topic_columns = [f"topic_{i}_prob" for i in range(0, topics)]
predictions_df = pd.DataFrame(topics_predictions, columns=topic_columns)

In [20]:
### !!!NOTE: Because of dropped rows, the precictions dataframe doesn't match split_df
### retained_indices has the actual indices of the data. We use that to reset the index after which we can concat correctly

#Resetting the index 
predictions_df = predictions_df.set_index(pd.Index(retained_indices))
split_df = pd.concat([split_df, predictions_df], axis=1)
split_df.shape

(66782, 14)

In [21]:
### Calculate mean probability from chunks for each document
### Then take the best probability as 'best_topic'
### This reduces the dataframe back to 1 document_id per row

mean_prob_df = split_df.groupby('document_id')[topic_columns].mean()
mean_prob_df['best_topic'] = mean_prob_df.idxmax(axis=1).str.replace('_prob', '').str.replace('topic_', '').astype(int)
print(len(mean_prob_df))
mean_prob_df.head(3)

100


Unnamed: 0_level_0,topic_0_prob,topic_1_prob,topic_2_prob,topic_3_prob,topic_4_prob,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob,best_topic
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
36000301,0.039599,0.053394,0.141056,0.052451,0.060308,0.183007,0.285934,0.084858,0.048649,0.050744,6
54600105,0.049573,0.049268,0.489403,0.053756,0.043055,0.046321,0.081896,0.086069,0.054584,0.046075,2
54902900,0.053652,0.062471,0.168312,0.059912,0.055716,0.086901,0.261915,0.079225,0.111514,0.060382,6


In [22]:
### Finally merge to original dataframe
df = pd.merge(df, mean_prob_df, on='document_id', how='left')
df.head(3)

Unnamed: 0,document_id,work_id,main_category,sub_category,main_category_label,sub_category_label,publication_year,gatherings,total_price,publication_place,...,topic_1_prob,topic_2_prob,topic_3_prob,topic_4_prob,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob,best_topic
0,569102600,10017-battle of hastings,0,0,Arts,"Theatre, plays, opera",1778,12mo,,Dublin,...,0.056752,0.482701,0.045842,0.055106,0.043572,0.048222,0.089947,0.085164,0.051805,2
1,1096600100,9-hymns,0,1,Arts,"Music, hymns, songs",1790,12mo,,London,...,0.050728,0.50478,0.038393,0.039043,0.051786,0.029318,0.160318,0.047461,0.041301,2
2,733800500,513-funeral or grief a-la-mode,0,0,Arts,"Theatre, plays, opera",1790,12mo,,London,...,0.168704,0.151562,0.058825,0.04264,0.079528,0.106274,0.084968,0.20986,0.055933,8


In [24]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [25]:
cm = confusion_matrix(df[compare_to], df['best_topic'])
cm

array([[0, 0, 8, 0, 0, 0, 0, 0, 2, 0],
       [2, 0, 0, 0, 3, 0, 1, 0, 0, 4],
       [0, 0, 7, 0, 0, 0, 1, 0, 1, 1],
       [0, 2, 0, 0, 0, 2, 2, 0, 3, 1],
       [2, 2, 0, 0, 0, 0, 5, 0, 0, 1],
       [3, 0, 0, 7, 0, 0, 0, 0, 0, 0],
       [0, 0, 5, 0, 0, 0, 2, 0, 3, 0],
       [5, 0, 1, 0, 0, 0, 0, 0, 3, 1],
       [0, 0, 0, 0, 0, 5, 5, 0, 0, 0],
       [0, 0, 1, 0, 0, 3, 1, 3, 2, 0]])

In [28]:
# Find the best mapping
best_mapping = np.argmax(cm, axis=0)
best_mapping

array([7, 3, 0, 5, 1, 8, 4, 9, 3, 1])

In [29]:
mapping_dict = {i: category for i, category in enumerate(best_mapping)}
mapping_dict
len(mapping_dict)

10

In [30]:
df['best_fit_category'] = df['best_topic'].map(mapping_dict)

In [31]:
accuracy = accuracy_score(df['main_category'], df['best_fit_category'])
accuracy

0.45