In [1]:
import os
import pickle
import random

def pickler(path):
    pickles_read = {}
    pickles_unread = []

    for file in os.listdir(path):
        if file.endswith('.pickle'):
            file_path = os.path.join(path, file)
            ref = os.path.splitext(os.path.basename(file_path))[0]
            try:
                with open(file_path, "rb") as data:
                    pickles_read[ref] = pickle.load(data)
            except:
                pickles_unread.append(file)
    
    return pickles_read, pickles_unread

pickle_path = "C:/Users/Mitch/pickles/"
pickles, empty_pickles = pickler(pickle_path)

random_pickle_keys = random.sample(list(pickles.keys()), 100)
random_pickles = {key: pickles[key] for key in random_pickle_keys}

In [2]:
def combine(dictionary):
    combined_text = []
    for key in dictionary:
        inner_dict = dictionary[key]
        document_text = ""
        for inner_key in inner_dict:
            inner_text = inner_dict[inner_key]
            if inner_text is not None:
                document_text += inner_text
        combined_text.append(document_text)
    return combined_text

corpus = combine(random_pickles)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mitch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# only include those words that appear in less than 80% of the document and appear in at least 2 documents. 
cv = CountVectorizer(max_df=0.8, min_df=2, stop_words=stopwords.words('english'), lowercase=True)
x = cv.fit_transform(corpus)
x.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [3, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [5]:
# each of the 10 documents is represented as a 1908 dimensional vector, which means that our vocabularly has 1908 words
x

<100x16347 sparse matrix of type '<class 'numpy.int64'>'
	with 91647 stored elements in Compressed Sparse Row format>

In [6]:
cv.get_feature_names_out()

array(['00', '000', '0003', ..., 'zoning', 'zoological', 'zs'],
      dtype=object)

In [7]:
# here we pick 5 topics that we want out text to be divided into
LDA = LatentDirichletAllocation(n_components=10, random_state=42)
LDA.fit(x)

In [8]:
import random

# fetch 10 random words from our vocabularly
for i in range(10):
    random_id = random.randint(0,len(cv.get_feature_names_out()))
    print(cv.get_feature_names_out()[random_id])

echnical
ependent
pigmented
attributed
design
236
ichever
2012
carbonate
plan1


In [9]:
# get 10 words with highest probability for the first topic
first_topic = LDA.components_[0]
top_topic_words = first_topic.argsort()[-10:]
for i in top_topic_words:
    print(cv.get_feature_names_out()[i])

provide
services
new
works
contractor
tender
01
block
shall
existing


In [10]:
# now we print out the 10 words with the highest probabilities for all the five topics we choose
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['provide', 'services', 'new', 'works', 'contractor', 'tender', '01', 'block', 'shall', 'existing']


Top 10 words for topic #1:
['requirements', 'design', 'part', 'services', 'respondent', 'general', 'contract', 'fire', 'shall', 'customer']


Top 10 words for topic #2:
['shall', 'must', 'site', 'general', 'works', 'work', 'principal', 'tender', 'contract', 'contractor']


Top 10 words for topic #3:
['department', 'electrical', 'cable', 'services', 'works', 'general', 'system', 'equipment', 'provide', 'shall']


Top 10 words for topic #4:
['technical', 'door', 'services', 'maintenance', 'mm', 'principal', 'required', 'works', 'contractor', 'must']


Top 10 words for topic #5:
['mid', 'tier', 'laser', 'cockburn', 'fleet', 'facilitation', 'denny', 'assist', 'avenue', 'communities']


Top 10 words for topic #6:
['work', 'must', 'tenderer', 'clause', 'works', 'principal', 'tender', 'contract', 'shall', 'contractor']


Top 10 words for topic #7:
['aug', '50', '10'

In [11]:
document_topic_distributions = LDA.transform(x)

# assign tenders back to the most likely topic
assigned_topics = [np.argmax(doc_topic_dist) for doc_topic_dist in document_topic_distributions]
for i, assigned_topic in enumerate(assigned_topics):
    print(f"Document {i} is assigned to topic {assigned_topic}")

Document 0 is assigned to topic 2
Document 1 is assigned to topic 2
Document 2 is assigned to topic 9
Document 3 is assigned to topic 1
Document 4 is assigned to topic 1
Document 5 is assigned to topic 3
Document 6 is assigned to topic 4
Document 7 is assigned to topic 9
Document 8 is assigned to topic 4
Document 9 is assigned to topic 4
Document 10 is assigned to topic 6
Document 11 is assigned to topic 6
Document 12 is assigned to topic 5
Document 13 is assigned to topic 6
Document 14 is assigned to topic 9
Document 15 is assigned to topic 6
Document 16 is assigned to topic 1
Document 17 is assigned to topic 8
Document 18 is assigned to topic 6
Document 19 is assigned to topic 2
Document 20 is assigned to topic 6
Document 21 is assigned to topic 6
Document 22 is assigned to topic 1
Document 23 is assigned to topic 2
Document 24 is assigned to topic 4
Document 25 is assigned to topic 3
Document 26 is assigned to topic 2
Document 27 is assigned to topic 2
Document 28 is assigned to top