In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import os

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.cluster import KMeans


# Walk the Kaggle input tree and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)



/kaggle/input/tcd-corpus/TCD_All_Micro.txt
/kaggle/input/es-query-history/ws-query-history-v25.txt
/kaggle/input/es-query-history/es-query-history-v25.txt


In [2]:
start_time = time.time()

# Step 1: Read the corpus from the file.
# Another input that was tried here:
#   /kaggle/input/es-query-history/es-query-history-v25.txt
CORPUS_PATH = "/kaggle/input/tcd-corpus/TCD_All_Micro.txt"

# Quick run: fraction of the file to keep, counted from the first line.
# 1.0 uses the whole corpus; e.g. 0.1 keeps only the first 10% of lines.
CORPUS_FRACTION = 1.0

with open(CORPUS_PATH, "r", encoding="utf-8") as file:
    all_lines = file.readlines()  # Read all lines from the file

percent_count = int(len(all_lines) * CORPUS_FRACTION)
lines = all_lines[:percent_count]

# Tokenize every non-blank line. simple_preprocess discards tokens outside its
# length limits, so a non-blank line can still produce no tokens at all; such
# empty "sentences" carry nothing to train on and are dropped here so that the
# emptiness check below reflects what the model will actually receive.
tokenized_corpus = [
    tokens
    for tokens in (simple_preprocess(line) for line in lines if line.strip())
    if tokens
]

# Fail early with a clear message instead of letting training fail later.
if not tokenized_corpus:
    raise ValueError("The input file is empty or contains no valid text.")

# The ES query history is a tab-separated, two-column file (queries, volume).
# If it is needed as a table, it can be loaded with:
#   es_df = pd.read_csv(
#       "/kaggle/input/es-query-history/es-query-history-v25.txt",
#       sep="\t", names=["queries", "volume"], header=None,
#   )

print("This step took --- %.1f seconds ---" % (time.time() - start_time))

This step took --- 115.9 seconds ---


In [3]:
start_time = time.time()

# Step 2: Fit a Word2Vec model to the tokenized corpus.
# min_count=1 keeps every token that occurs at least once in the vocabulary.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Vocabulary of the trained model, in the model's own index order.
words = list(model.wv.index_to_key)

print("This step took --- %.1f seconds ---" % (time.time() - start_time))

This step took --- 307.8 seconds ---


In [4]:
# Step 3: Extract the corresponding word embeddings.
# Row i of model.wv.vectors is the embedding of model.wv.index_to_key[i], which
# is the order `words` was built in, so the matrix is taken in one step instead
# of looking up each word in a Python loop. The copy keeps word_vectors
# independent of the model's own storage, as the np.array(...) build did.
word_vectors = model.wv.vectors.copy()

# Guard the row <-> word alignment that the clustering step relies on.
assert word_vectors.shape[0] == len(words), "word_vectors rows must match words"



In [5]:
start_time = time.time()

# Step 4: Partition the word embeddings into a fixed number of K-Means clusters.
# Earlier notes say trial and error favoured 2000 clusters (about 3 or 4 words
# in each); this run uses 1000.
num_clusters = 1000
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto').fit(word_vectors)
labels = kmeans.labels_  # labels[i] is the cluster id assigned to words[i]

# Group the vocabulary by cluster id: {cluster id: [words in that cluster]}.
clusters = {}
for word, label in zip(words, labels):
    if label not in clusters:
        clusters[label] = []
    clusters[label].append(word)

print("This step took --- %.1f seconds ---" % (time.time() - start_time))

This step took --- 384.4 seconds ---


In [6]:
# Step 5: Show the cluster that contains a chosen word.
# (To dump every cluster instead, loop over clusters.items().)
def lookup_cluster(target_word, vocabulary, cluster_labels, clusters_by_label):
    """Return the words that share a cluster with ``target_word``.

    Parameters:
        target_word: the word to look up.
        vocabulary: list of words; position i corresponds to cluster_labels[i].
        cluster_labels: sequence of cluster ids, one per vocabulary entry.
        clusters_by_label: mapping from cluster id to the list of its words.

    Returns:
        The list of words in the target word's cluster, or None when the word
        is not in the vocabulary.
    """
    try:
        # A single scan both tests membership and finds the position.
        position = vocabulary.index(target_word)
    except ValueError:
        return None
    return clusters_by_label[cluster_labels[position]]


target_word = "geology"  # Change this to the word you want to look up.

cluster_members = lookup_cluster(target_word, words, labels, clusters)
if cluster_members is None:
    print(f"'{target_word}' not found in the vocabulary.")
else:
    print(f"Cluster containing '{target_word}': {cluster_members}")


Cluster containing 'geology': ['history', 'humanities', 'geography', 'medieval', 'ancient', 'classics', 'geology', 'histories', 'clu', 'archaeology']


In [7]:
# Step 5 (repeated for another word): show the cluster containing the word.
# (To dump every cluster instead, loop over clusters.items().)
target_word = "vacancies"  # Change this to the word you want to look up.

try:
    # Position of the word in the vocabulary; ValueError means it is absent.
    target_index = words.index(target_word)
except ValueError:
    print(f"'{target_word}' not found in the vocabulary.")
else:
    # The label at the same position identifies the word's cluster.
    target_label = labels[target_index]
    print(f"Cluster containing '{target_word}': {clusters[target_label]}")


Cluster containing 'vacancies': ['queries', 'calls', 'instructions', 'lists', 'regularly', 'spam', 'incoming', 'contacts', 'checks', 'anonymous', 'alerts', 'commands', 'enquiries', 'registers', 'emails', 'advertisement', 'tweets', 'logs', 'credentials', 'electronically', 'subscribers', 'notices', 'webpage', 'searches', 'periodically', 'advertisements', 'vacancies', 'tokens', 'releases', 'permissions', 'replies', 'announcements', 'passwords', 'reminders', 'cookies', 'subscriptions', 'mails', 'urls', 'mailing', 'webpages']


# Elbow Method

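The elbow method plots the clustering objective (for K-Means, the inertia, i.e. the within-cluster sum of squared distances) against a range of values of k. The goal is to identify the 'elbow point' beyond which increasing k yields diminishing returns. While this method is widely used, choosing the elbow is subjective and the curve does not always show a clear one. The notes below record, for different settings of k, corpus fraction (percent) and embedding dimensions, the cluster that contains 'geology'.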


k=100
Cluster containing 'geology': ['si', 'water', 'novel', 'highly', 'effects', 'single', 'attention', 'effect', 'metal', 'dna', 'food', 'investigation', 'large', 'series', 'pain', 'scale', 'induced', 'spectroscopy', 'power', 'electrochemical', 'light', 'chiral', 'elguero', 'reactions', 'sea', 'surfaces', 'cell', 'electron', 'loss', 'brown', 'nanowires', 'phase', 'fetal', 'microscopy', 'magnetic', 'crystal', 'devices', 'geology', 'films', 'enhanced', 'permit', 'derivatives', 'cost', 'resolution', 'cl', 'hydrogen', 'species', 'ru', 'structures', 'reaction', 'silver', 'polymer', 'visual', 'oxide', 'observational', 'alkorta', 'poor', 'evolution', 'cells', 'ap', 'controlled', 'dynamics', 'growth', 'ruthenium', 'diseases', 'mapping', 'nanotubes', 'bulletin', 'inflammatory', 'solution', 'imaging', 'compounds', 'laser', 'rate', 'gold', 'nanoscale', 'graphene', 'scanning', 'latin', 'iv', 'ar', 'anti', 'ward', 'structural', 'formation', 'freshwater', 'clothing', 'silicon', 'phys', 'acid', 'ion', 'micro', 'bound', 'countr1ow', 'nanoparticle', 'mo', 'bond', 'spatial', 'african', 'liquid', 'catalysts', 'containing', 'thin', 'efficient', 'dot', 'double', 'unwin', 'mediated', 'ray', 'freud', 'spectroscopic', 'substituted', 'nanowire', 'bis', 'cdte', 'absorption', 'forest', 'composites', 'poly', 'roman', 'kim', 'porous', 'transient', 'gaponik', 'resonance', 'ag', 'oliver', 'sarah', 'feeling']



k=1000 and percent=0.01
Cluster containing 'geology': ['series', 'inorganic', 'childhood', 'earth', 'geology', 'plant']


k=1000 and percent=0.05
Cluster containing 'geology': ['medicine', 'botany', 'geology', 'genetics', 'pharmacy', 'faculty', 'humanities', 'microbiology', 'mathematics', 'histories', 'biochemistry', 'sociology', 'biomedical', 'zoology']


k=1000 and percent=0.1 
Cluster containing 'geology': ['botany', 'microbiology', 'geology', 'genetics', 'immunology', 'biochemistry', 'zoology']

k=1000 and percent=0.5 (i.e. 50%)
Cluster containing 'geology': ['geography', 'geology', 'botany', 'zoology']

k=1000 and percent=1.0 (i.e. 100%)
Cluster containing 'geology': ['earth', 'geology', 'coronal', 'corona', 'dating', 'rocks', 'astronomy', 'carboniferous', 'atmospheric', 'giant', 'mineral', 'stellar', 'quaternary', 'extinction', 'universe', 'terrestrial', 'sediment', 'vegetation', 'oceans', 'mantle', 'planetary', 'offshore', 'zircon', 'apatite', 'sedimentary', 'climatic', 'geophysical', 'isotope', 'geochemistry', 'volcanic', 'winds', 'karst', 'planets', 'crust', 'geomorphology', 'sediments', 'glacial', 'metamorphism', 'igneous', 'metamorphic', 'geochemical', 'oceanic', 'magma', 'jurassic', 'tectonic', 'holocene', 'triassic', 'magmatic']

k=2000 and 100%
Cluster containing 'geology': ['botany', 'geology', 'zoology']

k=1000 and 100% 
Cluster containing 'geology': ['botany', 'geology', 'zoology']

k=2000 and 100% dimensions = 200
Cluster containing 'geology': ['botany', 'geology', 'museum', 'zoology', 'geological', 'fagel', 'garden', 'gardens', 'herbarium', 'botanic', 'botanical']

k=3000 and 100% dimensions = 200
Cluster containing 'geology': ['geology']

k=1500 and 100% dimensions = 200
Cluster containing 'geology': ['tr', 'geography', 'geology', 'nanoscience', 'geoscience', 'ggu']

k=1500 and 100% dimensions = 50
Cluster containing 'geology': ['humanities', 'geography', 'botany', 'geology', 'zoology']

k=3000 and 100% dimensions = 50





