### Standard Model Setup

In [1]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
import pandas as pd

In [2]:
import sys
sys.path.append("../")
from src import puhti_files

In [3]:
### Load in data
df = puhti_files.genre_data_to_pandas(data="dev", add_labels=True, merge_ecco=True, better_subcat_names=True)

Read in dataset dev.csv. Set param 'data' to 'dev', 'test' or 'train' if you want another dataset.


In [4]:
### Sample data for faster training
### Stratified with main_category to ensure that all categories are included

from sklearn.model_selection import train_test_split
df, _ = train_test_split(df, train_size=30, stratify=df['main_category'], random_state=123)

print(len(df["main_category"].unique()))
print(len(df))

10
30


In [5]:
### Read in texts

df["text"] = df["document_id"].apply(lambda x: puhti_files.read_text_file(x))

In [6]:
### Create dataset with splitted documents. Document_id is kept for later merge.

def split_text(text, num_words):
    words = text.split()
    chunks = [' '.join(words[i:i + num_words]) for i in range(0, len(words), num_words)]
    return chunks

num_words = 128
split_data = []
for _, row in df.iterrows():
    doc_id = row['document_id']
    text = row['text']
    chunks = split_text(text, num_words)
    
    for chunk in chunks:
        split_data.append({'document_id': doc_id, 'text': chunk})

split_df = pd.DataFrame(split_data)
len(split_df)

25312

In [7]:
### Setting up stuff for the model. Details not important for this example.

from nltk.corpus import stopwords as stop_words
nltk.download('stopwords')
stopwords = list(stop_words.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/tturpein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
### Preprocess texts
### !!!NOTE: This function drops rows if processed document is empty. We need to keep this in mind when joining data back later
### We'll use the "retained_indices" later

sp = WhiteSpacePreprocessingStopwords(list(split_df["text"]), stopwords_list=stopwords, vocabulary_size=2000)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

In [9]:
### As can be seen from the lenghts, the preprocessing function has dropped rows
### The original data now has more rows
### Because the rows can be dropped from anywhere, the data don't match directly anymore

print(len(split_df))
print(len(preprocessed_documents))
print(len(unpreprocessed_corpus))

25312
25310
25310


In [10]:
### Data Preparation

tp = TopicModelDataPreparation("all-mpnet-base-v2")
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/127 [00:00<?, ?it/s]

In [11]:
### Fitting the model. Using 10 topics here. Number not important for the example but variable used later.

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Gets rid of warnings

topics = 10
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=topics, num_epochs=10)
ctm.fit(training_dataset)

Epoch: [10/10]	 Seen Samples: [252800/253100]	Train Loss: 250.24247251100178	Time: 0:00:07.497389: : 10it [01:14,  7.41s/it]
100%|██████████| 396/396 [00:06<00:00, 59.28it/s] 


### Combining the Predictions for the Original Documents

In [12]:
### Topic predictions as percentages for each topic for each document chunk

topics_predictions = ctm.get_thetas(training_dataset, n_samples=5)
topics_predictions.shape

100%|██████████| 396/396 [00:06<00:00, 58.51it/s] 


(25310, 10)

In [13]:
### Adding the predictions to a dataframe
topic_columns = [f"topic_{i}_prob" for i in range(0, topics)]
predictions_df = pd.DataFrame(topics_predictions, columns=topic_columns)

In [14]:
### !!!NOTE: Because of dropped rows, the precictions dataframe doesn't match split_df
### retained_indices has the actual indices of the data. We use that to reset the index after which we can concat correctly

#Resetting the index 
predictions_df = predictions_df.set_index(pd.Index(retained_indices))
split_df = pd.concat([split_df, predictions_df], axis=1)
split_df.shape

(25312, 12)

In [15]:
### There shouldn't be prediction data for the missing rows
### First we just check the tail, so that the predictions didn't just concat from the top
### The tail should have proper data provided that they weren't the rows that were dropped

split_df.tail(5)

Unnamed: 0,document_id,text,topic_0_prob,topic_1_prob,topic_2_prob,topic_3_prob,topic_4_prob,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob
25307,229201100,"height: They steal wine who take it, When he's...",0.197272,0.079634,0.02855,0.038391,0.030669,0.303124,0.051662,0.029097,0.203182,0.038421
25308,229201100,"do its duty. Wine was the only Helicon, Whence...",0.020584,0.018548,0.036269,0.040286,0.03653,0.396715,0.037054,0.063699,0.336634,0.01368
25309,229201100,"thesis allow, You're a cuckold, fays the, do I...",0.297308,0.035864,0.024246,0.034924,0.101646,0.056621,0.036701,0.037634,0.168228,0.206828
25310,229201100,"Beauty by constraint poffefling, You enjoy but...",0.01174,0.083752,0.01972,0.016546,0.005298,0.467611,0.089696,0.020196,0.266411,0.019028
25311,229201100,but ill occasion; We only meet to celebrate Th...,0.199441,0.108153,0.03222,0.02707,0.052341,0.17278,0.087749,0.029083,0.185904,0.105259


In [16]:
### We obtain the indices of the rows that were dropped
### If there is a gap in retained_indices, then that row was dropped
### Note: this function doesn't work for sequential missing rows, but that's not a huge concern here

missing_rows = []
last = -1
for i in retained_indices:
    if last != (i-1):
        missing_rows.append(i-1)
    last = i

In [17]:
### Checking the indices where rows were dropped. These shouldn't have prediction data

split_df.loc[missing_rows]

Unnamed: 0,document_id,text,topic_0_prob,topic_1_prob,topic_2_prob,topic_3_prob,topic_4_prob,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob
1243,581801300,"viginti, r. triginta. F IN IS.",,,,,,,,,,
21241,644700500,"I I S. ·.:, ;' "". ..",,,,,,,,,,


In [18]:
### Now we can continue with ordinary matters
### Calculate mean probability from chunks for each document
### Then take the best probability as 'best_topic'
### This reduces the dataframe back to 1 document_id per row

mean_prob_df = split_df.groupby('document_id')[topic_columns].mean()
mean_prob_df['best_topic'] = mean_prob_df.idxmax(axis=1).str.replace('_prob', '').str.replace('topic_', '').astype(int)
print(len(mean_prob_df))
mean_prob_df.head(3)

30


Unnamed: 0_level_0,topic_0_prob,topic_1_prob,topic_2_prob,topic_3_prob,topic_4_prob,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob,best_topic
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5100101,0.075802,0.045416,0.066427,0.051594,0.042904,0.132047,0.352835,0.085567,0.103216,0.044192,6
11900103,0.058763,0.055537,0.085053,0.04343,0.041303,0.055415,0.061347,0.513766,0.045237,0.040149,7
18900102,0.084326,0.053208,0.094333,0.06236,0.050234,0.124165,0.333803,0.096352,0.051314,0.049904,6


In [19]:
### Finally merge to original dataframe
df = pd.merge(df, mean_prob_df, on='document_id', how='left')
df.head(3)

Unnamed: 0,document_id,work_id,main_category,sub_category,main_category_label,sub_category_label,publication_year,gatherings,total_price,publication_place,...,topic_1_prob,topic_2_prob,topic_3_prob,topic_4_prob,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob,best_topic
0,373000300,174-lucubrations of isaac bickerstaff,2,18,Literature,Other fiction,1710,12mo,,London,...,0.071519,0.101755,0.0649,0.06092,0.158996,0.223581,0.095156,0.063433,0.068418,6
1,128501700,60-gentle shepherd,0,1,Arts,"Music, hymns, songs",1769,12mo,,Edinburgh,...,0.075439,0.060078,0.055638,0.035147,0.408412,0.063354,0.040143,0.107057,0.060486,5
2,581801300,15584-life of william of wykeham bishop of win...,4,10,History,Biographical History,1759,8vo,,London,...,0.178303,0.131627,0.047478,0.052422,0.082215,0.06223,0.227451,0.060814,0.052888,7


### Extra: Comparing to premade categories

In [20]:
### We can also try fit our own topics to the premade categories in a way which results in greatest accuracy
### In my tests the accuracy is usually quite low, not that it necessarily matters

### Note: There might be also an error in the code here because there's a bit of copy-paste

In [21]:
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

In [22]:
cm = confusion_matrix(df['main_category'], df['best_topic'])
cm

array([[0, 0, 0, 0, 0, 2, 0, 0, 2, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 2, 5, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 3, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 2, 0, 0, 0, 1, 0, 0]])

In [23]:
### Algorithm which determines best fit
row_ind, col_ind = linear_sum_assignment(-cm)

In [24]:
### Map from created topics to old
topic_mapping = {i: df['main_category'].unique()[j] for i, j in zip(col_ind, row_ind)}
topic_mapping

{5: 2, 0: 0, 6: 4, 9: 6, 7: 8, 2: 1, 1: 9, 4: 3, 8: 7, 3: 5}

In [25]:
### Add the mapped topics to dataframe
df['mapped_topic'] = df['best_topic'].map(topic_mapping)

In [26]:
### Gets accuracy of how many new topics == old topics
accuracy = np.mean(df['main_category'] == df['mapped_topic'])
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.10
