In [1]:
import os
# Data processing
import pandas as pd
import numpy as np
# Text preprocessiong
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
# Topic model
from bertopic import BERTopic
# Dimension reduction
from umap import UMAP
import gensim

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KRG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\KRG\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KRG\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
os.getcwd()

'C:\\Users\\KRG\\Desktop\\github_repo_placement\\placement_analysis\\Kritank'

In [3]:
os.chdir("../theory/OperatingSystems")

In [5]:
content = pd.read_csv('merged_train.csv', sep='\t', names=['review'])
content.head(10)

Unnamed: 0,review
0,INTRODUCTION A modern computer consists of one...
1,All in all a complex system.oo If every applic...
2,Furthermore managing all these components and ...
3,For this reason computers are equipped with a ...
4,Operating systems are the subject of this book
5,Most readers will have had some experience wit...
6,The program that users interact with usually c...
7,A simple overview of the main components under...
8,Here we see the hardware at the bottom
9,The hardware consists of chips boards disks a ...


In [6]:
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47816 entries, 0 to 47815
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  47816 non-null  object
dtypes: object(1)
memory usage: 373.7+ KB


In [7]:
# content['review'] = content['review'].apply(lambda x: ' '.join([w for w in gensim.utils.simple_preprocess(x) if w.lower() not in gensim.parsing.preprocessing.STOPWORDS and len(w) > 3]))
# Lemmatization
content['review'] = content['review'].apply(lambda x: ' '.join([wn.lemmatize(w) for w in gensim.utils.simple_preprocess(x) if w not in gensim.parsing.preprocessing.STOPWORDS and len(w) > 2]))

In [8]:
content.head(10)

Unnamed: 0,review
0,introduction modern consists processor main me...
1,complex application programmer understand thin...
2,furthermore managing component optimally excee...
3,reason computer equipped layer software called...
4,operating system subject book
5,reader experience operating window linux freeb...
6,program user interact usually called shell tex...
7,simple overview main component discussion give...
8,hardware
9,hardware consists chip board disk keyboard mon...


In [26]:
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47816 entries, 0 to 47815
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  47816 non-null  object
dtypes: object(1)
memory usage: 373.7+ KB


In [9]:
umap_model = UMAP(n_neighbors=15, 
                  n_components=5, 
                  min_dist=0.0, 
                  metric='correlation', 
                  random_state=100)
# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, n_gram_range=(1,3), language="english", calculate_probabilities=True)
# Run BERTopic model
topics, probabilities = topic_model.fit_transform(content['review'])

In [10]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,20524,-1_memory_page_process_data
1,0,1087,0_virtual machine_guest_virtualization_hypervisor
2,1,505,1_cache_caching_cached_cache manager
3,2,444,2_deadlock_deadlocked_deadlock occur_resource ...
4,3,383,3_interrupt_interrupt handler_handler_interrup...
...,...,...,...
610,609,10,609_flagi_flagj_critical section_enter critica...
611,610,10,610_backup_making backup_window convenient poi...
612,611,10,611_starvation_starvation problem_farmer_aging
613,612,10,612_multicomputers_multiprocessor multicompute...


In [12]:
from gensim import corpora
from gensim.models import CoherenceModel
documents = pd.DataFrame({"Document": content['review'],
                          "ID": range(len(content['review'])),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='u_mass')
coherence = coherence_model.get_coherence()
print(coherence)

nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [14]:
# Get top 10 terms for a topic
topic_model.get_topic(0)

[('virtual machine', 0.016976794811772496),
 ('guest', 0.016798640939237077),
 ('virtualization', 0.015217240823398214),
 ('hypervisor', 0.012887836888096124),
 ('vmm', 0.011739888133705403),
 ('machine', 0.011348276683754252),
 ('virtual', 0.011233645007261334),
 ('vmware', 0.010297792638238185),
 ('hypervisors', 0.008869048846111295),
 ('guest operating', 0.0073994343528895395)]

In [15]:
# Get probabilities for all topics
topic_model.probabilities_[0]

array([0.00040302, 0.0005517 , 0.0003091 , 0.00047509, 0.00172746,
       0.00035664, 0.00044643, 0.00040021, 0.00053183, 0.00102973,
       0.00046515, 0.00078126, 0.00046194, 0.00031842, 0.00043745,
       0.00037928, 0.00042938, 0.00033521, 0.00048571, 0.00046342,
       0.00064729, 0.0003193 , 0.00090665, 0.00048462, 0.00050724,
       0.00043225, 0.00083264, 0.00104494, 0.00061735, 0.00035804,
       0.00032239, 0.00057781, 0.00069142, 0.00053597, 0.00059149,
       0.00056174, 0.00046636, 0.00064306, 0.00060546, 0.00052581,
       0.00046434, 0.00114451, 0.00036601, 0.00112027, 0.0007442 ,
       0.00048759, 0.0006168 , 0.00055846, 0.00048926, 0.00060205,
       0.00053898, 0.00053862, 0.00074051, 0.00053985, 0.00082998,
       0.00091978, 0.00085356, 0.0008643 , 0.00037927, 0.00042011,
       0.0007122 , 0.00061644, 0.00044495, 0.00060168, 0.00059193,
       0.00063072, 0.00065463, 0.00057856, 0.00080378, 0.00045059,
       0.0003919 , 0.00054343, 0.00045866, 0.00044226, 0.00083

In [16]:
#this saves the predicted topics for each row in the csv file
topic_prediction = topic_model.topics_[:]
print(topic_prediction)

[215, 133, -1, -1, 22, 38, -1, -1, 148, 215, 148, 88, 88, 286, 32, 523, -1, 88, 22, 79, 275, 79, -1, 270, -1, -1, 88, -1, 19, 88, -1, -1, 146, 99, -1, -1, -1, 103, 87, 85, 133, 99, 379, -1, 479, -1, 121, -1, 167, 223, -1, 68, -1, 231, -1, 88, -1, -1, -1, 31, -1, -1, -1, 461, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, 137, -1, -1, 22, 436, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, 41, -1, -1, -1, 22, -1, 346, 215, -1, -1, 43, 286, -1, -1, 43, 43, -1, 323, -1, -1, -1, 12, -1, -1, -1, 43, -1, 12, -1, -1, -1, -1, -1, 33, 231, -1, 22, 231, 275, -1, -1, -1, -1, -1, -1, -1, -1, 314, -1, -1, 481, -1, -1, -1, -1, -1, -1, 103, 22, -1, 47, -1, 26, 26, -1, -1, -1, -1, -1, -1, 416, 26, 43, 26, 416, -1, 60, 238, 26, 126, -1, 26, 26, 26, 26, 26, 238, -1, 26, 26, -1, 26, -1, 26, -1, 538, -1, 416, 586, 26, -1, -1, 41, -1, 416, 126, -1, -1, 89, 416, 275, 26, -1, -1, 47, 318, -1, 32, 344, 295, -1, 126, -1, 126, -1, -1, -1, -1, -1, -1, -1, -1, -1, 60, 126, -1, 103, -1, -1, -1, 5, 22, 22, 22,

In [17]:
#for predicting the topics for any new sentence
new_check = 'Can you explain the concept of file system fragmentation and its impact on system performance? How can it be mitigated?'

# Find topics
num_of_topics = 3
similar_topics, similarity = topic_model.find_topics(new_check, top_n=num_of_topics)

print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

#for printing those 3 topics
for index, top in enumerate(similar_topics):
  keys = [t[0] for t in topic_model.get_topic(top)]
  print(f'{keys} : with probability \n{np.round(similarity,2)[index]}')

The top 3 similar topics are [127, 477, 49], and the similarities are [0.67 0.53 0.5 ]
['fragmentation', 'internal fragmentation', 'external fragmentation', 'internal', 'external', 'fragmentation problem', 'fragmentation external', 'fragmentation external fragmentation', 'fragmentation internal', 'fragmentation fragmentation'] : with probability 
0.67
['internal fragmentation', 'fragmentation', 'internal', 'page size', 'large page', 'large page size', 'size', 'small page size', 'table size', 'small page'] : with probability 
0.53
['file system', 'filesystem', 'filesystem implementation', 'system', 'multiple file', 'multiple file system', 'system file', 'file', 'implementation', 'file system file'] : with probability 
0.5


In [18]:
# Save the topic model
topic_model.save("model_corr_ngram_len_2")

# Load the topic model
# my_model = BERTopic.load("amz_review_topic_model")

  self._set_arrayXarray(i, j, x)
