In [1]:
# ====================== Base Installations ======================
!pip install gensim tensorflow plotly

import os, requests, re, shutil, random, warnings
import numpy as np, pandas as pd, nltk
import matplotlib.pyplot as plt, plotly.express as px
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
import tensorflow as tf

# reproducibility
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# ----------------------  NLTK download  ----------------------
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

lemmatizer = WordNetLemmatizer()
stop_words  = set(stopwords.words('english'))

# ============================================================
# 1) Text Files – Relatively Large Project Gutenberg IDs List
# ============================================================
GUTENBERG_IDS = [
    1342, 84, 98, 2701, 1661, 11, 1080, 174, 25344, 28054,
    1232, 2554, 160, 2852, 345, 5200, 100
]
BOOK_SOURCE_URLS = [f'https://www.gutenberg.org/files/{gid}/{gid}-0.txt'
                    for gid in GUTENBERG_IDS]

DOWNLOAD_DIR = '/content/processed_text_data'
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def download_text(url, path):
    r = requests.get(url, stream=True); r.raise_for_status()
    with open(path, 'wb') as f:
        for chunk in r.iter_content(8192):
            f.write(chunk)

# ============================================================
# 2) Cleaning Gutenberg Headers/Copyright Boilerplate
# ============================================================
def strip_gutenberg_boilerplate(text: str) -> str:
    start = text.find('*** START OF')
    end   = text.find('*** END OF')
    if start != -1 and end != -1 and end > start:
        return text[start:end]
    return text

# ---------- NLP Helpers ----------
def wn_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return {'J': wordnet.ADJ, 'N': wordnet.NOUN,
            'V': wordnet.VERB, 'R': wordnet.ADV}.get(tag, wordnet.NOUN)

def preprocess(dir_path):
    sentences, labels = [], []
    for fn in os.listdir(dir_path):
        label = 0 if '100-' in fn else 1   # Example of rough labeling
        with open(os.path.join(dir_path, fn), 'r', encoding='utf-8', errors='ignore') as f:
            raw = strip_gutenberg_boilerplate(f.read())
        txt = re.sub(r'[^a-zA-Z\s.!?]+', ' ', raw.lower())
        for sent in (s.strip() for s in sent_tokenize(txt) if s.strip()):
            toks = [lemmatizer.lemmatize(w, wn_pos(w))
                    for w in word_tokenize(sent)
                    if w.isalpha() and w not in stop_words]
            if toks:
                sentences.append(toks)
                labels.append(label)
    return sentences, labels

# ------------- Download + Clean Working Directory ------------
for x in os.listdir(DOWNLOAD_DIR):
    p = os.path.join(DOWNLOAD_DIR, x)
    shutil.rmtree(p) if os.path.isdir(p) else os.unlink(p)

for i, url in enumerate(BOOK_SOURCE_URLS):
    fname = url.split('/')[-1]
    download_text(url, os.path.join(DOWNLOAD_DIR, fname))

sentences, labels = preprocess(DOWNLOAD_DIR)
print(f'Sentences kept: {len(sentences):,d}')

# ============================================================
# 3) Word2Vec-CBOW (min_count=5 – Reduces Noise and Memory)
# ============================================================
w2v = Word2Vec(
    sentences=sentences,
    vector_size=150,
    window=6,
    min_count=5,
    sg=0, epochs=105,
    workers=os.cpu_count(),
    seed=SEED
)
w2v.wv.fill_norms(force=True)

# ============================================================
# 4) t-SNE – Only Words with Frequency ≥ 30 (Optional but Saves Time)
# ============================================================
freq_thresh = 30
keep_words   = [w for w in w2v.wv.index_to_key
                if w2v.wv.get_vecattr(w, 'count') >= freq_thresh]
vec_matrix   = np.vstack([w2v.wv[w] for w in keep_words])

vecs_2d = TSNE(
    n_components=2, perplexity=30,
    random_state=SEED, max_iter=3000,
    learning_rate=205
).fit_transform(vec_matrix)

df = pd.DataFrame(vecs_2d, columns=['x', 'y'])
df['Word'] = keep_words

# ============================================================
# 5) Clustering – Number of Clusters Chosen by BIC (Prevents Silhouette-OOM)
# ============================================================
bic_scores, gmm_models = {}, {}
for n in range(5, 21):
    gmm = GaussianMixture(n, n_init=5, random_state=SEED).fit(vecs_2d)
    bic_scores[n] = gmm.bic(vecs_2d)
    gmm_models[n] = gmm

best_n = min(bic_scores, key=bic_scores.get)
print('Best N (by BIC):', best_n)

df['Cluster'] = gmm_models[best_n].predict(vecs_2d)

# ----------- 5 Largest Clusters -----------
largest5 = df['Cluster'].value_counts().head(5).index
print("\n--- Largest Clusters ---")
for cid in largest5:
    sub = df[df['Cluster'] == cid]
    sample = sub['Word'].sample(min(25, len(sub)), random_state=SEED)
    print(f"\nCluster {cid}  (size={len(sub)}):")
    print("  Sample words:", ', '.join(sample))

# ----------- Interactive Plot -----------
fig = px.scatter(
    df[df['Cluster'].isin(largest5)],
    x='x', y='y', color='Cluster',
    hover_name='Word', title='t-SNE – 5 largest clusters'
)
fig.update_traces(marker=dict(size=4, opacity=0.7))
fig.update_layout(width=1000, height=800, hovermode='closest')
fig.show()



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Sentences kept: 209,471
Best N (by BIC): 5

--- Largest Clusters ---

Cluster 0  (size=1659):
  Sample words: whereon, branch, forthwith, oberon, favourable, agatha, obeyed, christmas, nurse, gower, provoke, emigrant, shift, doubtless, willow, richer, enforce, executioner, spoil, marya, inheritance, harmony, forbear, detest, scold

Cluster 3  (size=1145):
  Sample words: trouble, alarmed, acquaint, behaviour, hide, reflect, suffering, kind, repent, prove, single, accord, happens, refuse, punish, precisely, anything, performance, neither, karamazov, clearly, punishment, right, design, infinitely

Cluster 2  (size=844):
  Sample words: coal, fury, perfume, ox, pale, wolf, porch, alice, hook, step, knot, whip, twould, wail, skirt, huge, cotton, blush, starve, hiss, salt, deck, eat, gloom, drown

Cluster 4  (size=824):
  Sample words: impressed, laid, morning, instant, remove, awake, hotly, raskolnikov, clutch, vexed, grind, spoken, shudder, saw, absorbed, eagerness, arm, knock, shaken, af