# **Problem Statement for Students**
You're tasked with analyzing the text of Alice in Wonderland to understand the structure and meaning of its words using natural language processing (NLP) and data visualization techniques.

## Goals:
- Clean and preprocess the text

- Visualize word frequencies

- Word cloud

- Bar chart

- Plot semantic relationships

- Use GloVe embeddings + PCA

- Display word similarities

- Heatmap

# Install Library

In [None]:
# Install gensim (imported below as gensim.downloader); -q keeps pip's output quiet.
!pip install gensim -q


In [None]:
# Upgrade SciPy to the newest available release; -q keeps pip's output quiet.
# NOTE(review): presumably needed for compatibility with the gensim install above — confirm.
!pip install scipy --upgrade -q

# Import Library

In [None]:
import requests
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter

import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.patches as mpatches

import plotly.express as px
import plotly.graph_objects as go

import gensim.downloader as api

# Load and Clean the Text

In [None]:
# load the text
url = "https://www.gutenberg.org/files/11/11-0.txt"

# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Fail loudly on a 4xx/5xx response instead of silently analysing an error
# page as if it were the book.
response.raise_for_status()

text = response.text
# Bare expression: the notebook displays the downloaded text as the cell output.
text

In [None]:
# Clean text
# Lowercase first so the character classes below only have to deal with a-z.
text = text.lower()

# Drop apostrophes (straight and typographic) so a contraction collapses into
# one token ("don't" -> "dont"); the custom stopword list later in this file
# is written in exactly that apostrophe-free form.
text = re.sub(r"['\u2019]", '', text)

# Replace every other non-letter with a space instead of deleting it.
# Deleting fused the words on either side of dashes and other punctuation
# (e.g. "rabbit-hole" -> "rabbithole"), creating tokens that are not real words.
text = re.sub(r'[^a-z\s]', ' ', text)

# Whitespace split gives a quick word count for the cleaned text.
words = text.split()
print(f"total words: {len(words)}")

In [None]:
# Preview the first 500 characters of the cleaned text as the cell output.
text[:500]

# Preprocessing

## Get tokenization

In [None]:
# Fetch the Punkt tokenizer tables that NLTK's word tokenizer depends on
nltk.download('punkt_tab')

# Split the cleaned text into a flat list of tokens.
# The text has already been reduced to lowercase letters and whitespace,
# so the tokens produced here are plain words.
word_tokens = nltk.word_tokenize(text)

In [None]:
# Print the complete token list (long output), then the total token count.
print(f'{word_tokens} \n')
print(f'total tokens: {len(word_tokens)}')

## stopwords

In [None]:
# Fetch the NLTK stopword corpus
nltk.download('stopwords')

# Contractions written without their apostrophe, because the cleaning step
# strips punctuation before tokenization ("don't" arrives here as "dont").
custom_stopwords = {
    # negations (n't)
    "arent", "cant", "couldnt", "didnt", "doesnt", "dont", "hadnt", "hasnt",
    "havent", "isnt", "mightnt", "mustnt", "shant", "shouldnt", "werent",
    "wont", "wouldnt",
    # pronoun + auxiliary
    "hed", "hell", "hes", "id", "ill", "im", "ive", "shed", "shell", "shes",
    "theyd", "theyll", "theyre", "theyve", "wed", "were", "weve",
    "youd", "youll", "youre", "youve",
    # demonstratives, question words and "lets"
    "lets", "thats", "theres", "whatll", "whatre", "whats", "whatve",
    "wheres", "whod", "wholl", "whore", "whos", "whove",
}

# Start from the English NLTK list, then merge in the custom entries
stop_words = set(stopwords.words('english'))
stop_words |= custom_stopwords
print(f'stopwords: {stop_words} \n')


In [None]:
# Remove stopwords and very short tokens (only tokens of 3+ characters are kept)
clean_tokens = [
    token
    for token in word_tokens
    if len(token) > 2 and token not in stop_words
]
print(f"clean tokens: {clean_tokens} \n")
print(f'total tokens:{len(clean_tokens)}')

# Visualization

## word cloud

In [None]:
# Frequency of every cleaned token; drives the word sizes in the cloud
word_freq = Counter(clean_tokens)

# Word Cloud: configure the renderer first, then feed it the frequencies
cloud_renderer = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_renderer.generate_from_frequencies(word_freq)

# Show the rendered cloud as an image without axis ticks
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud of Alice in Wonderland")
plt.show()

## bar chart

In [None]:
# Bar chart of the 20 most frequent cleaned tokens
most_common = word_freq.most_common(20)

# Transpose [(word, count), ...] into two parallel tuples for plotting
words_, freqs = zip(*most_common)

plt.figure(figsize=(10, 5))
plt.bar(x=words_, height=freqs)
# Tilt the labels so neighbouring words do not overlap
plt.xticks(rotation=45)
plt.title("Top 20 Most Frequent Words")
plt.show()

## GloVe Embeddings

In [None]:
# Load GloVe embeddings (50-d)
# The "glove-wiki-gigaword-50" model is obtained through gensim's downloader API;
# later cells use `word in glove_vectors` and `glove_vectors[word]` for lookups.
glove_vectors = api.load("glove-wiki-gigaword-50")

In [None]:
# Get the unique tokens in order of first appearance in the text.
# dict.fromkeys de-duplicates while keeping a deterministic order, whereas
# list(set(...)) yields a different order on every interpreter run (string
# hash randomization), which made every later step that depends on word
# order non-reproducible.
unique_tokens = list(dict.fromkeys(clean_tokens))
print(f'Total unique tokens: {len(unique_tokens)}\n')

# Keep only the tokens that exist in the GloVe vocabulary
filtered_words = [word for word in unique_tokens if word in glove_vectors]
print(f'Filtered words (found in GloVe): {filtered_words}\n')
print(f'total filtered words: {len(filtered_words)} \n')

# Create a NumPy array of word embeddings (vectors) for the filtered words.
# Row i holds the vector for filtered_words[i], so the two stay index-aligned
# (e.g., 50 values per row if using GloVe-50D).
embeddings = np.array([glove_vectors[word] for word in filtered_words])
print(f'Embeddings array (vector representation of words):\n{embeddings}')


## Part-of-Speech Tagging

In [None]:
# Download the tagger resource (ENG)
nltk.download('averaged_perceptron_tagger_eng')

# Tag POS for each word; the result is a list of (word, tag) pairs.
# NOTE(review): filtered_words is a de-duplicated word list passed as a single
# sequence, not a real sentence — presumably the tagger uses neighbouring
# tokens as context, so tags may depend on list order. Confirm.
pos_tags = nltk.pos_tag(filtered_words)
print(f'Tag POS: {pos_tags} \n')

In [None]:
# Simplify POS tag
def simplify_pos(tag):
    """Collapse a fine-grained POS tag into a coarse category.

    The tag is matched by prefix: 'NN*' -> 'noun', 'VB*' -> 'verb',
    'JJ*' -> 'adjective', 'RB*' -> 'adverb'. Any other tag yields 'other'.

    Args:
        tag: POS tag string (e.g. 'NNS', 'VBD').

    Returns:
        One of 'noun', 'verb', 'adjective', 'adverb', 'other'.
    """
    prefix_labels = (
        ('NN', 'noun'),
        ('VB', 'verb'),
        ('JJ', 'adjective'),
        ('RB', 'adverb'),
    )
    for prefix, label in prefix_labels:
        if tag.startswith(prefix):
            return label
    return 'other'

# Map each tagged word to its simplified POS category (noun/verb/adjective/adverb/other)
word_pos = {word: simplify_pos(tag) for word, tag in pos_tags}
print(f'word_pos: {word_pos} \n')

In [None]:
# Marker colour for each simplified part-of-speech category
color_map = dict(
    noun='red',
    verb='blue',
    adjective='green',
    adverb='purple',
    other='gray',
)

## PCA

In [None]:
# PCA to 2D: fit on the embedding matrix and project every word vector onto
# two components, so row i of `reduced` is the coordinate pair for row i of `embeddings`.
pca = PCA(n_components=2)
reduced = pca.fit_transform(embeddings)

In [None]:
# word frequency: occurrences of each cleaned token
# (same computation as in the word-cloud cell, repeated here before plotting)
word_freq = Counter(clean_tokens)
print(f'word freq: {word_freq}')

In [None]:
# Prepare data for Plotly: every list below is index-aligned with filtered_words
x_vals, y_vals = reduced[:, 0], reduced[:, 1]

# Colour by simplified POS category ('black' if a category has no entry in color_map)
colors = [color_map.get(word_pos[token], 'black') for token in filtered_words]

# Marker size grows with frequency (3 per occurrence) and is capped at 40
sizes = [min(3 * word_freq[token], 40) for token in filtered_words]

# Hover label: the word, its POS category and its frequency
hover_texts = [
    f"{token}<br>POS: {word_pos[token]}<br>Freq: {word_freq[token]}"
    for token in filtered_words
]

In [None]:
# Build the interactive PCA scatter plot
fig = go.Figure()

# One trace per POS category, so each category gets its own legend entry and colour
for pos_type, color in color_map.items():
    # Positions (within filtered_words) of the words belonging to this category
    indices = [
        idx
        for idx, token in enumerate(filtered_words)
        if word_pos[token] == pos_type
    ]
    # Categories without any word get no trace (and no legend entry)
    if not indices:
        continue

    marker_style = {
        'size': [sizes[idx] for idx in indices],
        'color': color,
        'opacity': 0.7,
        'line': {'width': 0.5, 'color': 'black'},
    }
    trace = go.Scatter(
        x=[x_vals[idx] for idx in indices],
        y=[y_vals[idx] for idx in indices],
        mode='markers',
        name=pos_type.capitalize(),
        marker=marker_style,
        hovertext=[hover_texts[idx] for idx in indices],
        hoverinfo='text',
        showlegend=True,
    )
    fig.add_trace(trace)

# Legend box anchored to the top-left corner of the plotting area
legend_style = {
    'title': 'Part of Speech',
    'x': 0.01,
    'xanchor': 'left',
    'y': 1,
    'yanchor': 'top',
    'bgcolor': 'rgba(255,255,255,0.8)',
    'bordercolor': 'black',
    'borderwidth': 1,
}

fig.update_layout(
    title='PCA of Word Embeddings<br><sup>Color = POS Type | Size = Word Frequency</sup>',
    xaxis_title='PC1',
    yaxis_title='PC2',
    template='plotly_white',
    width=950,
    height=700,
    legend=legend_style,
)

fig.show()

# Heatmap

In [None]:
# How many of the most frequent words appear in the similarity heatmap
top_words_count = 20

# Take three times as many candidates as needed, so that enough remain
# after dropping words that GloVe does not know
frequent_words = [entry[0] for entry in word_freq.most_common(3 * top_words_count)]

# Keep the GloVe-covered candidates, then trim to the requested number
words_in_glove = [candidate for candidate in frequent_words if candidate in glove_vectors]
words_in_glove = words_in_glove[:top_words_count]

# A similarity matrix needs at least two words to be meaningful
enough_words = len(words_in_glove) >= 2

if enough_words:
    # One embedding row per selected word, in the same order as words_in_glove
    word_embeddings = np.array([glove_vectors[token] for token in words_in_glove])

    # Pairwise cosine similarity between all selected words
    similarity_matrix = cosine_similarity(word_embeddings)

    # Annotated heatmap, labelled with the words on both axes
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        similarity_matrix,
        xticklabels=words_in_glove,
        yticklabels=words_in_glove,
        cmap='coolwarm',
        annot=True,
        fmt=".2f",
    )
    plt.title(f"Cosine Similarity Heatmap of Top {len(words_in_glove)} Frequent Words")
    plt.tight_layout()
    plt.show()
else:
    print("Not enough valid words found in GloVe to plot the heatmap")
