In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import os

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.cluster import KMeans


# Print the full path of every file found under /kaggle/input so the
# dataset paths used in later cells can be checked by eye.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)



/kaggle/input/tcd-corpus/TCD_All_Micro.txt
/kaggle/input/es-query-history/ws-query-history-v25.txt
/kaggle/input/es-query-history/es-query-history-v25.txt


In [2]:
start_time = time.time()

# Step 1: Read the corpus from the file and tokenize it.
#
# CORPUS_PATH     - text file with one document/sentence per line.
#                   Alternative input used previously:
#                   "/kaggle/input/es-query-history/es-query-history-v25.txt"
# SAMPLE_FRACTION - quick run: share of the file (taken from the top) that is
#                   kept; set to 1.0 to use every line.
CORPUS_PATH = "/kaggle/input/tcd-corpus/TCD_All_Micro.txt"
SAMPLE_FRACTION = 0.1

with open(CORPUS_PATH, "r", encoding="utf-8") as file:
    all_lines = file.readlines()  # Read all lines from the file

percent_count = int(len(all_lines) * SAMPLE_FRACTION)  # number of leading lines to keep
lines = all_lines[:percent_count]

# Preprocess each line into tokens and keep only lines that actually yield
# tokens. Filtering on the token list (rather than on line.strip()) also drops
# lines for which simple_preprocess returns nothing (e.g. digits or
# punctuation only), so the emptiness check below reflects what the model
# will really be trained on.
tokenized_corpus = [tokens for tokens in map(simple_preprocess, lines) if tokens]

# Check if we have any data
if not tokenized_corpus:
    raise ValueError("The input file is empty or contains no valid text.")

# Disabled: import of the ES two-column query history.
#query_history_file="/kaggle/input/es-query-history/es-query-history-v25.txt"
#column_names = ['queries', 'volume']
#es_df = pd.read_csv(
#    query_history_file,
#    sep ="\t",
#    names=column_names,
#    header=None
#)

print("This step took --- %s seconds ---" % (time.time() - start_time))

This step took --- 26.31045913696289 seconds ---


In [3]:
start_time = time.time()

# Step 2: Fit a Word2Vec model to the tokenized corpus.
# Hyper-parameters are collected in one place so they are easy to spot and tune.
w2v_params = {
    "vector_size": 100,  # dimensionality of each word embedding
    "window": 5,         # context window size
    "min_count": 1,      # keep every word, including those seen only once
    "workers": 4,        # training threads
}
model = Word2Vec(sentences=tokenized_corpus, **w2v_params)

# Snapshot of the model vocabulary, in the model's own index order.
words = list(model.wv.index_to_key)
#print("Vocabulary:", words)

elapsed = time.time() - start_time
print("This step took --- %s seconds ---" % elapsed)

This step took --- 35.07724380493164 seconds ---


In [4]:
# Step 3: Build the embedding matrix.
# Row i of word_vectors is the embedding of words[i], so positions in
# `words` and rows of `word_vectors` can be used interchangeably.
keyed_vectors = model.wv
word_vectors = np.array([keyed_vectors[token] for token in words])



In [5]:
start_time = time.time()
# Step 4: Use K-Means clustering to cluster the word embeddings into a predefined number of clusters.
num_clusters = 1000  # Trial and error suggests 1000 is a good compromise (about 3 or 4 words per cluster).

# KMeans raises a ValueError when asked for more clusters than there are
# samples, which happens as soon as the vocabulary is smaller than
# num_clusters (e.g. a smaller quick-run sample). Clamp the value actually
# used and say so, instead of failing at the end of the pipeline.
n_clusters_used = min(num_clusters, len(word_vectors))
if n_clusters_used < num_clusters:
    print(
        f"Only {len(word_vectors)} word vectors available; "
        f"using {n_clusters_used} clusters instead of {num_clusters}."
    )

# random_state is fixed so the cluster assignment is repeatable for a given
# embedding matrix.
kmeans = KMeans(n_clusters=n_clusters_used, random_state=42)
labels = kmeans.fit_predict(word_vectors)  # labels[i] is the cluster id of words[i]

# Organize words by their assigned cluster: {cluster label -> [words]}.
clusters = {}
for word, label in zip(words, labels):
    clusters.setdefault(label, []).append(word)

print("This step took --- %s seconds ---" % (time.time() - start_time))



This step took --- 1467.7195312976837 seconds ---


In [6]:
start_time = time.time()
# Step 5: Show the cluster that a single word of interest belongs to.
# (To dump every cluster instead, loop over clusters.items():)
#for cluster_id, word_list in clusters.items():
#    print(f"Cluster {cluster_id}: {word_list}")
target_word = "geology"  # Change this to the word you want to look up.

try:
    # Position of the word in the vocabulary; labels uses the same ordering.
    target_index = words.index(target_word)
except ValueError:
    # list.index raises ValueError when the word is not in the vocabulary.
    print(f"'{target_word}' not found in the vocabulary.")
else:
    target_label = labels[target_index]
    # Print out the cluster that contains the target word.
    print(f"Cluster containing '{target_word}': {clusters[target_label]}")


elapsed = time.time() - start_time
print("This step took --- %s seconds ---" % elapsed)

Cluster containing 'geology': ['botany', 'geology', 'classics', 'zoology']
This step took --- 0.0007977485656738281 seconds ---
