In [1]:
import sys
import tomotopy as tp
import numpy as np
import re
import nltk
import pandas as pd
from bs4 import BeautifulSoup

def train(model, save_path):
    """Run the fixed training schedule on ``model`` and save it to ``save_path``.

    Schedule (300 iterations requested in total):
      * 20 rounds of 7 regular iterations followed by 3 iterations with
        ``freeze_topics=True``;
      * 10 rounds of 10 iterations with ``freeze_topics=True``.

    Corpus statistics and one progress line per round are printed to stdout;
    the 'Training...' and 'Saving...' markers are printed to stderr.
    Returns ``None``.
    """
    def log_progress():
        # Step counter and per-word log-likelihood as currently reported by the model.
        print('Iteration: {:05}\tll per word: {:.5f}'.format(model.global_step, model.ll_per_word))

    # Zero-iteration call made before the statistics are read
    # (presumably this initialises the model so they are populated -- confirm in tomotopy docs).
    model.train(0)
    doc_count = len(model.docs)
    vocab_count = len(model.used_vocabs)
    print('Num docs:', doc_count, ', Vocab size:', vocab_count, ', Num words:', model.num_words)
    print('Removed top words:', model.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    # Phase 1: alternate regular and topic-frozen iterations.
    for _ in range(20):
        model.train(7)
        model.train(3, freeze_topics=True)
        log_progress()

    # Phase 2: topic-frozen iterations only.
    for _ in range(10):
        model.train(10, freeze_topics=True)
        log_progress()

    print('Saving...', file=sys.stderr, flush=True)
    model.save(save_path, True)
    
    


# Excel workbook with the tender records (bare file name, so resolved against the working directory).
tenders_structured_path = r"UpdatedAgainTenders.xlsx"

# Keep only the three columns used below, drop rows that have no reference
# number, then drop rows that are exact duplicates of an earlier row.
tenders_structured = (
    pd.read_excel(tenders_structured_path)[["Reference Number", "Contract Title", "Description"]]
    .dropna(subset=["Reference Number"])
    .drop_duplicates()
)

def remove_html_tags(text):
    """Strip HTML markup from ``text`` and normalise its whitespace.

    Parameters
    ----------
    text : str or None
        HTML (or plain text) to clean. ``None`` and float ``NaN`` are
        treated as missing input.

    Returns
    -------
    str
        The text content with non-breaking spaces turned into regular spaces
        and every run of whitespace collapsed to a single space. Missing
        input yields ``""``.
    """
    # Only "Reference Number" is passed to dropna() when the sheet is loaded, so a
    # blank "Description" cell can reach this function as NaN (or None).
    # BeautifulSoup rejects such non-text input, so map it to an empty string.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    soup = BeautifulSoup(text, "html.parser")
    cleaned = soup.get_text().replace('\xa0', ' ')
    # split() with no argument splits on any whitespace run and drops leading/trailing blanks.
    return ' '.join(cleaned.split())


from nltk.corpus import stopwords
try:
    # Reuse the processed corpus written by an earlier run, if it can be loaded.
    corpus = tp.utils.Corpus.load('tender.cached.corpus')
except IOError:
    # Loading failed: rebuild the corpus with one text per tender,
    # made of the contract title followed by the HTML-stripped description.
    docs = []
    for _, tender in tenders_structured.iterrows():
        heading = tender["Contract Title"]
        body = remove_html_tags(tender["Description"])
        docs.append(heading + " " + body)

    stops = set(stopwords.words('english'))
    # A SimpleTokenizer is passed explicitly: the original author noted that
    # processing did not seem to work without a tokenizer.
    # Tokens of length <= 2 and English stop words are filtered out.
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(),
        stopwords=lambda token: len(token) <= 2 or token in stops,
    )
    corpus.process(docs)
    # Cache the processed corpus so later runs can take the fast path above.
    corpus.save('tender.cached.corpus')

# HPA model with 10 super topics (k1) and 100 sub topics (k2); no top words removed (rm_top=0).
model = tp.HPAModel(
    tw=tp.TermWeight.ONE,
    k1=10,
    k2=100,
    rm_top=0,
    corpus=corpus,
)
# Train on the fixed schedule and write the result to disk.
train(model, "struc_tender.hpa.tmm")


KeyboardInterrupt



In [2]:
import sys
import tomotopy as tp
import numpy as np
import re
import nltk
import pandas as pd
from bs4 import BeautifulSoup
def remove_html_tags(text):
    """Strip HTML markup from ``text`` and normalise its whitespace.

    Parameters
    ----------
    text : str or None
        HTML (or plain text) to clean. ``None`` and float ``NaN`` are
        treated as missing input.

    Returns
    -------
    str
        The text content with non-breaking spaces turned into regular spaces
        and every run of whitespace collapsed to a single space. Missing
        input yields ``""``.
    """
    # Only "Reference Number" is passed to dropna() when the sheet is loaded, so a
    # blank "Description" cell can reach this function as NaN (or None).
    # BeautifulSoup rejects such non-text input, so map it to an empty string.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    soup = BeautifulSoup(text, "html.parser")
    cleaned = soup.get_text().replace('\xa0', ' ')
    # split() with no argument splits on any whitespace run and drops leading/trailing blanks.
    return ' '.join(cleaned.split())
# Excel workbook with the tender records (bare file name, so resolved against the working directory).
tenders_structured_path = r"UpdatedAgainTenders.xlsx"

# Keep only the three columns used below, drop rows that have no reference
# number, then drop rows that are exact duplicates of an earlier row.
tenders_structured = (
    pd.read_excel(tenders_structured_path)[["Reference Number", "Contract Title", "Description"]]
    .dropna(subset=["Reference Number"])
    .drop_duplicates()
)

# One text per tender, in row order: contract title, a space, then the HTML-stripped description.
docs = []
for _, tender in tenders_structured.iterrows():
    heading = tender["Contract Title"]
    body = remove_html_tags(tender["Description"])
    docs.append(heading + " " + body)

In [5]:
# Load the HPA model from "struc_tender.hpa.tmm", the path the training cell passes to train() for saving.
model = tp.HPAModel.load("struc_tender.hpa.tmm")


In [6]:
# Print the topic ids and probabilities the model reports for one document.
doc_index = 0
doc = model.docs[doc_index]
topic_probs = doc.get_topics()

header = "Topic probabilities for document {}:".format(doc_index)
print(header)
# Each entry is a (topic id, probability) pair; unpack it straight into the template.
for topic_entry in topic_probs:
    print("Topic {}: {:.4f}".format(*topic_entry))

Topic probabilities for document 0:
Topic 8: 0.3729
Topic 15: 0.2583
Topic 98: 0.1845
Topic 31: 0.1107
Topic 40: 0.0738
Topic 7: 0.0072
Topic 4: 0.0035
Topic 9: 0.0023
Topic 5: 0.0020
Topic 1: 0.0014


In [23]:
# Topic probabilities of the first document (index 0), each topic labelled by its top words.
doc_index = 0
doc = model.docs[doc_index]
topic_probs = doc.get_topics()

print("Topic probabilities for document {}:".format(doc_index))

# topic_id -> comma-separated top words, used as a human-readable topic label
topic_names = {}

for topic_id, prob in topic_probs:
    # top_n controls how many words go into the label; adjust as needed
    top_words = [word for word, _ in model.get_topic_words(topic_id, top_n=5)]
    topic_names[topic_id] = ", ".join(top_words)
    print("Topic {} ({}): {:.4f}".format(topic_id, topic_names[topic_id], prob))

# After this cell, a topic's label can be looked up by id via topic_names[topic_id].

Topic probabilities for document 0:
Topic 8 (pta, required, shall, contractor, scope): 0.3729
Topic 15 (centre, state, health, services, coordination): 0.2583
Topic 98 (road, provision, traffic, services, supply): 0.1845
Topic 31 (supply, installation, delivery, equipment, maintenance): 0.1107
Topic 40 (project, conservation, perth, plan, biodiversity): 0.0738
Topic 7 (government, provide, future, needs, state): 0.0072
Topic 4 (requirements, system, systems, business, current): 0.0035
Topic 9 (project, change, management, manager, business): 0.0023
Topic 5 (program, services, work, personnel, experience): 0.0020
Topic 1 (pta, project, train, rail, signalling): 0.0014


In [38]:
def get_topic_distribution(model, document_index, n, with_topics=True, num_words=5):
    '''
    Return the n most probable topics of a single document of an HPA model.

    model: a trained HPA model (must expose `docs` and `get_topic_words`)
    document_index: position of the document in `model.docs`, i.e. its index in
        the tender list used to train the model
    n: the number of topics to return, ordered from highest to lowest probability
    with_topics: boolean; if True each topic is described by its top words,
        if False the (topic_id, probability) pairs are returned as given by the model
    num_words: number of top words used to describe a topic when with_topics is True

    returns a list of tuples:
        with_topics=True  -> [("word1, word2, ...", probability rounded to 4 decimals), ...]
        with_topics=False -> [(topic_id, probability), ...]
    '''
    # Use the caller's index rather than a notebook-level variable, so the
    # function works for any document and in a fresh kernel.
    doc = model.docs[document_index]
    # Request n topics explicitly: get_topics() without arguments only yields
    # the top 10, which would silently cap larger n. The slice keeps the
    # "at most n" guarantee regardless of what the model returns.
    topic_probs = doc.get_topics(top_n=n)[:n]

    if not with_topics:
        return topic_probs

    return [
        (
            ", ".join(word for word, _ in model.get_topic_words(topic_id, top_n=num_words)),
            round(prob, 4),
        )
        for topic_id, prob in topic_probs
    ]
   

In [39]:
# Example: the two most probable topics of document 0, each described by its top words (with_topics defaults to True).
get_topic_distribution(model,0,2)

[('pta, required, shall, contractor, scope', 0.3729),
 ('centre, state, health, services, coordination', 0.2583)]

In [20]:
# Inspect what the model reports as sub-topics for id 0; the displayed result is a list of (id, weight) pairs.
# NOTE(review): presumably 0 is a super-topic id here -- confirm against the tomotopy get_sub_topics docs.
model.get_sub_topics(0)

[(85, 0.05263254791498184),
 (68, 0.029912302270531654),
 (92, 0.023323440924286842),
 (11, 0.022876355797052383),
 (79, 0.02126314677298069),
 (58, 0.0212631206959486),
 (16, 0.020874321460723877),
 (75, 0.0206608884036541),
 (44, 0.02046625129878521),
 (20, 0.02023320458829403)]

In [9]:
# Print the model's summary report (basic info, training info, initial parameters).
model.summary()

<Basic Info>
| HPAModel (current version: 0.12.5)
| 20625 docs, 614519 words
| Total Vocabs: 24872, Used Vocabs: 24872
| Entropy of words: 7.70177
| Entropy of term-weighted words: 7.70177
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 300, Burn-in steps: 0
| Optimization Interval: 1
| Log-likelihood per word: -9.62379
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k1: 10 (the number of super topics between 1 ~ 32767)
| k2: 100 (the number of sub topics between 1 ~ 32767)
| alpha: [0.1] (initial hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k1 + 1` of `float` in case of asymmetric prior.)
| subalpha: [0.1] (initial hyperparameter of Dirichlet distribution for super-sub topic, given as a single `float` in case of symmetric prior and as