In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import umap
import pandas as pd
from typing import Dict, List, Tuple
import re

# Load the Gemma model and tokenizer
print("Loading Gemma model...")
model_name = "google/gemma-2-2b"  # Using Gemma-2 2B model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Get the token embedding matrix via the public accessor instead of the
# architecture-specific `embed_tokens` attribute, so changing `model_name`
# to a checkpoint that names its embedding layer differently still works.
embedding_matrix = model.get_input_embeddings().weight  # Shape: [vocab_size, hidden_size]
print(f"Embedding matrix shape: {embedding_matrix.shape}")
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Hidden size: {model.config.hidden_size}")


In [None]:
def hex_to_rgb(hex_color: str) -> List[int]:
    """Convert a hex color string to a list of three RGB integers.

    Args:
        hex_color: Color written as six hex digits (e.g. ``"acc2d9"``) or as
            three-digit shorthand (e.g. ``"fff"``), optionally prefixed with
            ``#``. Surrounding whitespace is ignored.

    Returns:
        ``[r, g, b]`` where each channel is an int in the range 0-255.

    Raises:
        ValueError: If, after stripping whitespace and ``#``, the string is
            not exactly 3 or 6 hexadecimal digits.
    """
    # Remove surrounding whitespace and the leading # if present
    digits = hex_color.strip().lstrip('#')

    # Expand CSS-style shorthand: "abc" -> "aabbcc"
    if len(digits) == 3:
        digits = ''.join(ch * 2 for ch in digits)

    # Validate explicitly: int(..., 16) alone would accept signs/whitespace,
    # and slicing a too-short string would silently yield a wrong channel.
    is_hex = all(ch in '0123456789abcdefABCDEF' for ch in digits)
    if len(digits) != 6 or not is_hex:
        raise ValueError(f"Expected 3 or 6 hex digits, got {hex_color!r}")

    # Convert each two-digit pair to an integer channel
    return [int(digits[i:i + 2], 16) for i in (0, 2, 4)]

def parse_colors_file(file_path: str) -> Dict[str, List[int]]:
    """Parse a tab-separated colors file into a name -> RGB mapping.

    Each usable line has the form ``<color name>\\t<hex color>[\\t...]``.
    Lines that contain no tab after stripping (blank lines included) are
    skipped. If a color name appears more than once, the last occurrence wins.

    Args:
        file_path: Path to the colors text file.

    Returns:
        Dict mapping each color name to its ``[r, g, b]`` list, in file order.

    Raises:
        OSError: If the file cannot be opened.
        Any exception raised by ``hex_to_rgb`` for a malformed hex field
        propagates to the caller.
    """
    colors_dict = {}

    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            # A tab is required to separate name from hex value; this also
            # skips empty lines, so at least two fields exist after the split.
            if '\t' not in line:
                continue

            fields = line.split('\t')
            color_name = fields[0].strip()
            hex_color = fields[1].strip()
            # Convert hex to RGB
            colors_dict[color_name] = hex_to_rgb(hex_color)

    return colors_dict

# Parse the colors file
colors_file_path = "/users/sboppana/data/sboppana/multimodal_concept_learning/random_experiments/multi_token_embedding/colors.txt"
colors_dict = parse_colors_file(colors_file_path)

print(f"Parsed {len(colors_dict)} colors")
print("Sample colors:")
# Preview the first five entries (dict insertion order); no index is needed.
for name, rgb in list(colors_dict.items())[:5]:
    print(f"  {name}: RGB{rgb}")


Parsed 949 colors
Sample colors:
  cloudy blue: RGB[172, 194, 217]
  dark pastel green: RGB[86, 174, 87]
  dust: RGB[178, 153, 110]
  electric lime: RGB[168, 255, 4]
  fresh green: RGB[105, 216, 79]


In [None]:
def get_averaged_embedding(text: str, tokenizer, embedding_matrix: torch.Tensor) -> torch.Tensor:
    """Return the mean of the embedding rows for the tokens of ``text``.

    Args:
        text: String to embed.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a sequence of token ids.
        embedding_matrix: Tensor indexed by token id along dim 0
            (``[vocab_size, hidden_size]``).

    Returns:
        Tensor with the token dimension averaged out (``[hidden_size]``).

    Raises:
        ValueError: If ``text`` tokenizes to zero tokens. Averaging over an
            empty set would otherwise silently produce a NaN vector.
    """
    # Tokenize the text without special tokens so only the text's own tokens count
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    if len(token_ids) == 0:
        raise ValueError(f"Text {text!r} produced no tokens; cannot average embeddings")

    # Look up one embedding row per token: [num_tokens, hidden_size]
    token_embeddings = embedding_matrix[token_ids]

    # Average across tokens
    return torch.mean(token_embeddings, dim=0)

# Compute averaged embeddings for all colors
color_embeddings = {}
print("Computing averaged embeddings for color names...")

# Only the names are needed here, so iterate over the dict keys directly.
for i, color_name in enumerate(colors_dict):
    # Report progress on every 100th color
    if i % 100 == 0:
        print(f"Processing color {i+1}/{len(colors_dict)}: {color_name}")

    # Get averaged embedding for this color name
    color_embeddings[color_name] = get_averaged_embedding(color_name, tokenizer, embedding_matrix)

print(f"Computed embeddings for {len(color_embeddings)} colors")
# Peek at the first embedding without materializing the whole list of values
print(f"Embedding dimension: {next(iter(color_embeddings.values())).shape[0]}")


Computing averaged embeddings for color names...
Processing color 1/949: cloudy blue
Processing color 101/949: burnt siena
Processing color 201/949: pinky
Processing color 301/949: medium pink
Processing color 401/949: deep orange
Processing color 501/949: muted green
Processing color 601/949: dirt
Processing color 701/949: pastel pink
Processing color 801/949: powder blue
Processing color 901/949: rose
Computed embeddings for 949 colors
Embedding dimension: 2304


In [None]:
# Prepare data for PCA
# Fix one ordering of color names so embeddings, names and RGB rows stay aligned
color_names_list = list(color_embeddings)
embeddings_list = [color_embeddings[name].detach().numpy() for name in color_names_list]
rgb_values_list = [colors_dict[name] for name in color_names_list]

# Convert to numpy arrays
embeddings_array = np.array(embeddings_list)
rgb_array = np.array(rgb_values_list)

# Normalize embeddings to unit norm (one L2 norm per row)
row_norms = np.linalg.norm(embeddings_array, axis=1, keepdims=True)
embeddings_array = embeddings_array / row_norms

print(f"Embeddings array shape: {embeddings_array.shape}")
print(f"RGB array shape: {rgb_array.shape}")
print("Embeddings normalized to unit norm")

# Perform PCA to reduce to 3D
# Seed the decomposition: scikit-learn's automatic solver choice can select a
# randomized SVD for an input of this size, which would make the projection
# vary between runs. 42 matches the seed used for the UMAP reducer.
pca = PCA(n_components=3, random_state=42)
pca_result = pca.fit_transform(embeddings_array)

print(f"PCA result shape: {pca_result.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {sum(pca.explained_variance_ratio_):.3f}")

# Create DataFrame for plotting: one row per color with its 3D PCA
# coordinates, its name, and its RGB channels
df = pd.DataFrame({
    'x': pca_result[:, 0],
    'y': pca_result[:, 1], 
    'z': pca_result[:, 2],
    'color_name': color_names_list,
    'r': rgb_array[:, 0],
    'g': rgb_array[:, 1],
    'b': rgb_array[:, 2]
})

# Create RGB color strings for plotly.
# Built directly from the integer channel columns rather than a row-wise
# apply(axis=1), which is slow and whose integer formatting depends on how
# pandas coerces each mixed-dtype row.
df['rgb_color'] = [f'rgb({r}, {g}, {b})' for r, g, b in zip(df['r'], df['g'], df['b'])]

# Create 3D scatter plot; every marker is drawn in the color it names
fig = go.Figure(data=go.Scatter3d(
    x=df['x'],
    y=df['y'],
    z=df['z'],
    mode='markers',
    marker=dict(
        size=3,
        color=df['rgb_color'],
        opacity=0.8,
        line=dict(width=0.5, color='black')
    ),
    text=df['color_name'],
    # customdata carries the r/g/b columns so the hover box can show them
    hovertemplate='<b>%{text}</b><br>' +
                  'PCA: (%{x:.2f}, %{y:.2f}, %{z:.2f})<br>' +
                  'RGB: (%{customdata[0]}, %{customdata[1]}, %{customdata[2]})<br>' +
                  '<extra></extra>',
    customdata=df[['r', 'g', 'b']]
))

# Update layout; axis titles report each component's explained variance
fig.update_layout(
    title='3D PCA of Color Name Embeddings (Colored by RGB Values)',
    scene=dict(
        xaxis_title=f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)',
        yaxis_title=f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)',
        zaxis_title=f'PC3 ({pca.explained_variance_ratio_[2]:.1%} variance)',
        camera=dict(
            eye=dict(x=1.5, y=1.5, z=1.5)
        )
    ),
    width=800,
    height=600
)

# Save the plot to HTML file
output_file = "/users/sboppana/data/sboppana/multimodal_concept_learning/random_experiments/multi_token_embedding/color_embeddings_pca.html"
fig.write_html(output_file)

print("3D PCA visualization created successfully!")
print(f"Plot saved to: {output_file}")
print(f"Total explained variance by first 3 components: {sum(pca.explained_variance_ratio_):.3f}")


Embeddings array shape: (949, 2304)
RGB array shape: (949, 3)
Embeddings normalized to unit norm
PCA result shape: (949, 3)
Explained variance ratio: [0.10126032 0.06294682 0.05138734]
Total explained variance: 0.216
3D PCA visualization created successfully!
Plot saved to: /users/sboppana/data/sboppana/multimodal_concept_learning/random_experiments/multi_token_embedding/color_embeddings_pca.html
Total explained variance by first 3 components: 0.216


In [None]:
# Create UMAP visualization
# NOTE(review): the recorded output of this cell is a NameError for `umap`
# even though the notebook's first cell imports it — presumably the import
# cell was not re-run after the import was added. Confirm with a fresh
# Restart Kernel -> Run All.
print("Computing UMAP embedding...")
umap_reducer = umap.UMAP(
    n_components=3,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
# Project the unit-normalized embeddings down to three dimensions
umap_result = umap_reducer.fit_transform(embeddings_array)

print(f"UMAP result shape: {umap_result.shape}")

# Create DataFrame for UMAP plotting: one row per color with its 3D UMAP
# coordinates, its name, and its RGB channels
df_umap = pd.DataFrame({
    'x': umap_result[:, 0],
    'y': umap_result[:, 1], 
    'z': umap_result[:, 2],
    'color_name': color_names_list,
    'r': rgb_array[:, 0],
    'g': rgb_array[:, 1],
    'b': rgb_array[:, 2]
})

# Create RGB color strings for plotly.
# Built directly from the integer channel columns rather than a row-wise
# apply(axis=1), which is slow and whose integer formatting depends on how
# pandas coerces each mixed-dtype row.
df_umap['rgb_color'] = [
    f'rgb({r}, {g}, {b})'
    for r, g, b in zip(df_umap['r'], df_umap['g'], df_umap['b'])
]

# Create 3D UMAP scatter plot; every marker is drawn in the color it names
fig_umap = go.Figure(data=go.Scatter3d(
    x=df_umap['x'],
    y=df_umap['y'],
    z=df_umap['z'],
    mode='markers',
    marker=dict(
        size=3,
        color=df_umap['rgb_color'],
        opacity=0.8,
        line=dict(width=0.5, color='black')
    ),
    text=df_umap['color_name'],
    # customdata carries the r/g/b columns so the hover box can show them
    hovertemplate='<b>%{text}</b><br>' +
                  'UMAP: (%{x:.2f}, %{y:.2f}, %{z:.2f})<br>' +
                  'RGB: (%{customdata[0]}, %{customdata[1]}, %{customdata[2]})<br>' +
                  '<extra></extra>',
    customdata=df_umap[['r', 'g', 'b']]
))

# Update layout for UMAP
fig_umap.update_layout(
    title='3D UMAP of Color Name Embeddings (Colored by RGB Values)',
    scene=dict(
        xaxis_title='UMAP 1',
        yaxis_title='UMAP 2',
        zaxis_title='UMAP 3',
        camera=dict(
            eye=dict(x=1.5, y=1.5, z=1.5)
        )
    ),
    width=800,
    height=600
)

# Save the UMAP plot to HTML file
umap_output_file = "/users/sboppana/data/sboppana/multimodal_concept_learning/random_experiments/multi_token_embedding/color_embeddings_umap.html"
fig_umap.write_html(umap_output_file)

print("3D UMAP visualization created successfully!")
print(f"UMAP plot saved to: {umap_output_file}")


Computing UMAP embedding...


NameError: name 'umap' is not defined