In [None]:
!pip install tqdm


In [None]:
# Cell 1: Import libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import os

# Settings
%matplotlib inline
pd.set_option('display.max_columns', None)

print(" Basic libraries loaded!")
print(f" Working directory: {os.getcwd()}")

In [None]:
!pip install transformers torch sentence-transformers tqdm


In [None]:
# Cell 3: Import ML libraries
print(" Loading ML libraries...")

from transformers import AutoTokenizer, AutoModel
import torch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Query CUDA support once and derive everything else from that answer.
has_cuda = torch.cuda.is_available()

print(f" PyTorch version: {torch.__version__}")
print(f" CUDA available: {has_cuda}")

# 'cuda' when a GPU is visible to PyTorch, otherwise fall back to 'cpu'.
device = 'cuda' if has_cuda else 'cpu'
device_note = f"    GPU: {torch.cuda.get_device_name(0)}" if has_cuda else "    Using CPU "
print(device_note)

print(f"\n  Computing device: {device}")

In [None]:
# Cell 4: Load preprocessed papers
print(" Loading preprocessed papers...\n")

# Load the 10K sample we created in Step 2.
# Despite the .json extension the file is read as JSON Lines: one JSON
# object per line, each parsed with json.loads.
papers_path = 'data/processed/papers_sample_10k.json'

papers = []
# Explicit UTF-8 so decoding does not depend on the platform's default
# locale encoding (titles/abstracts may contain non-ASCII characters).
with open(papers_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank or trailing empty lines
        papers.append(json.loads(line))

df = pd.DataFrame(papers)

# Fail with a clear message instead of an IndexError on df.iloc[0] below.
if df.empty:
    raise ValueError(f"No papers found in {papers_path}")

print(f" Loaded {len(df):,} papers")
print(f"\n Dataset info:")
print(f"   Columns: {len(df.columns)}")
print(f"   Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Show sample (first row) as a quick check that the expected columns exist
print(f"\n Sample paper:")
sample = df.iloc[0]
print(f"   ID: {sample['id']}")
print(f"   Title: {sample['title_clean'][:80]}...")
print(f"   Abstract length: {len(sample['abstract_clean'])} chars")
print(f"   Year: {sample['year']}")
print(f"   Categories: {sample['categories']}")

In [None]:
# Cell 5: Load embedding model
print(" Loading SentenceTransformer model...")

# Model checkpoint name; kept in a variable because later cells record it
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

embedding_dim = model.get_sentence_embedding_dimension()
print(f" Model loaded: {model_name}")
print(f"   Embedding dimension: {embedding_dim}")
print(f"   Max sequence length: {model.max_seq_length} tokens")

# Smoke test: embed the first paper's title + abstract
print(f"\n Testing model...")
first_paper = df.iloc[0]
test_text = " ".join([first_paper['title_clean'], first_paper['abstract_clean']])
test_embedding = model.encode(test_text, show_progress_bar=False)

smoke_report = [
    f"   Input text length: {len(test_text)} characters",
    f"   Output embedding shape: {test_embedding.shape}",
    f"   Sample values: {test_embedding[:5]}",
]
print("\n".join(smoke_report))


In [None]:
# Cell 6: Create function to generate embeddings
def generate_paper_embedding(paper, model, max_chars=2000):
    """
    Generate the embedding for a single paper.

    The embedded text is the cleaned title, one space, then the cleaned
    abstract, cut to at most ``max_chars`` characters.

    Args:
        paper: Mapping-like object exposing ``.get`` (a dict or a pandas
            Series row) with 'title_clean' and 'abstract_clean' entries.
            A missing key, ``None`` or float NaN is treated as empty text
            rather than being embedded as the literal word "None"/"nan".
        model: Object with an ``encode(text, show_progress_bar=...)`` method,
            e.g. a SentenceTransformer model.
        max_chars: Maximum number of characters passed to the model.

    Returns:
        Whatever ``model.encode`` returns for the text (the embedding vector).
    """
    def _as_text(value):
        # None and float NaN (the only float for which value != value)
        # both mean "field missing".
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    # Combine title and abstract
    title = _as_text(paper.get('title_clean', ''))
    abstract = _as_text(paper.get('abstract_clean', ''))
    text = f"{title} {abstract}"

    # Truncate if too long (slicing is a no-op for shorter text)
    text = text[:max_chars]

    # Generate embedding
    return model.encode(text, show_progress_bar=False)

# Smoke-test the helper on the first row of the DataFrame
print(" Testing embedding generation function...\n")

test_paper = df.iloc[0]
test_emb = generate_paper_embedding(test_paper, model)

# Collect the report first, then emit it in one go
lowest, highest = test_emb.min(), test_emb.max()
report_lines = [
    "   Function test successful!",
    f"   Input paper: {test_paper['id']}",
    f"   Input title: {test_paper['title_clean'][:60]}...",
    f"   Output shape: {test_emb.shape}",
    f"   Output type: {type(test_emb)}",
    f"   Output range: [{lowest:.3f}, {highest:.3f}]",
]
print("\n".join(report_lines))

In [None]:
# Cell 7: Generate embeddings for all papers
print(" GENERATING EMBEDDINGS FOR ALL PAPERS")
print("=" * 60)
print(f"Total papers: {len(df):,}")
# NOTE(review): the previous message divided the paper count by 8-10 and
# labelled the result "minutes" (1000+ minutes for 10K papers). Assuming the
# intent was a throughput of roughly 8-10 papers/second, the value is seconds
# and is converted to minutes here. Confirm against real timings.
print(f"Estimated time: ~{len(df)/10/60:.0f}-{len(df)/8/60:.0f} minutes")
print("=" * 60)

# Batch size handed to the model; also recorded in the saved metadata
batch_size = 32
# Character cut-off, same value as generate_paper_embedding's default
max_chars = 2000

# Build "<title> <abstract>" for every paper in one vectorized pass.
# Missing values become empty text instead of the literal string "nan".
titles = df['title_clean'].fillna('').astype(str)
abstracts = df['abstract_clean'].fillna('').astype(str)
texts = (titles + ' ' + abstracts).str[:max_chars].tolist()

# Row order of `embeddings` matches this list (and df's row order)
all_paper_ids = df['id'].tolist()

# Let the model do its own batching and progress reporting instead of a
# hand-written batch loop.
print("Processing in batches...")
embeddings = model.encode(
    texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
embeddings = np.asarray(embeddings)

# Later cells map embeddings back to papers by position.
assert len(embeddings) == len(all_paper_ids), "expected one embedding per paper id"

print(f"\n" + "=" * 60)
print(f" EMBEDDINGS GENERATED SUCCESSFULLY!")
print(f"=" * 60)
print(f"   Shape: {embeddings.shape}")
print(f"   Data type: {embeddings.dtype}")
print(f"   Memory size: {embeddings.nbytes / (1024**2):.2f} MB")
print(f"   Value range: [{embeddings.min():.3f}, {embeddings.max():.3f}]")

In [None]:
# Cell 8: Save embeddings to disk

# Output locations; the directory is created if it does not exist yet
embeddings_file = 'data/embeddings/paper_embeddings_10k.npy'
ids_file = 'data/embeddings/paper_ids_10k.json'
metadata_file = 'data/embeddings/metadata_10k.json'
os.makedirs('data/embeddings', exist_ok=True)

# 1) Embedding matrix as a .npy file, then report its on-disk size
np.save(embeddings_file, embeddings)
saved_mb = os.path.getsize(embeddings_file) / (1024**2)
print(f" Saved: {embeddings_file}")
print(f"   Size: {saved_mb:.2f} MB")

# 2) Paper IDs, in the same order as the embedding rows
with open(ids_file, 'w') as ids_handle:
    json.dump(all_paper_ids, ids_handle)
print(f" Saved: {ids_file}")

# 3) Metadata describing how this embedding set was produced
metadata = dict(
    model_name=model_name,
    num_papers=len(embeddings),
    embedding_dimension=embeddings.shape[1],
    date_created=pd.Timestamp.now().isoformat(),
    batch_size=batch_size,
)
with open(metadata_file, 'w') as meta_handle:
    json.dump(metadata, meta_handle, indent=2)
print(f" Saved: {metadata_file}")

print(f"\n All files saved successfully!")

In [None]:
# Cell 9: Test similarity search
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_papers(query_idx, embeddings, df, top_k=5):
    """
    Find the papers most similar to a query paper by cosine similarity.

    Args:
        query_idx: Positional index of the query paper (row of `embeddings`
            and of `df`). Negative values count from the end; out-of-range
            values raise IndexError.
        embeddings: 2-D array of paper embeddings, one row per paper.
        df: DataFrame whose rows are in the same order as `embeddings`, with
            'id', 'title_clean', 'year' and 'categories' columns.
        top_k: Maximum number of similar papers to return.

    Returns:
        List of at most `top_k` dicts (keys: 'index', 'paper_id', 'title',
        'similarity', 'year', 'categories'), ordered from most to least
        similar. The query paper itself is never included.
    """
    # Normalise negative indices and fail fast on out-of-range ones.
    query_pos = range(len(embeddings))[query_idx]

    # Cosine similarity of the query against every paper (including itself)
    query_embedding = embeddings[query_pos].reshape(1, -1)
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Rank by descending similarity; a stable sort keeps row order among ties.
    ranked = np.argsort(-similarities, kind='stable')

    # Exclude the query by index rather than by assuming it is ranked first:
    # a duplicate paper has the same similarity and can outrank the query,
    # in which case skipping "rank 0" would drop the duplicate and return
    # the query itself.
    ranked = ranked[ranked != query_pos][:max(top_k, 0)]

    results = []
    for idx in ranked:
        row = df.iloc[idx]
        results.append({
            'index': int(idx),
            'paper_id': row['id'],
            'title': row['title_clean'],
            'similarity': float(similarities[idx]),
            'year': int(row['year']),
            'categories': row['categories']
        })

    return results

# Similarity search demo on a single query paper
print("=" * 80)
print(" SIMILARITY SEARCH TEST")
print("=" * 80)

# Pick a test paper (you can change this index!)
test_idx = 42
# Number of neighbours to show; also drives the heading printed below
top_k = 5
query_paper = df.iloc[test_idx]

# The leading glyph was stored mis-encoded (UTF-8 bytes read as cp1252);
# it is restored to the intended character here.
print(f"\n📄 QUERY PAPER #{test_idx}:")
print(f"   ID: {query_paper['id']}")
print(f"   Title: {query_paper['title_clean']}")
print(f"   Year: {query_paper['year']}")
print(f"   Categories: {query_paper['categories']}")
print(f"   Abstract: {query_paper['abstract_clean'][:250]}...")

# Find similar papers
similar_papers = find_similar_papers(test_idx, embeddings, df, top_k=top_k)

print(f"\n TOP {top_k} MOST SIMILAR PAPERS:\n")

for i, paper in enumerate(similar_papers, 1):
    print(f"{i}. [{paper['similarity']:.3f}] {paper['title']}")
    print(f"   Year: {paper['year']} | Categories: {paper['categories']}")
    print()

print("=" * 80)
print(" Similarity ranges from 0 (unrelated) to 1 (identical)")
print("   Scores > 0.7 = Very similar")
print("   Scores 0.5-0.7 = Somewhat similar")
print("   Scores < 0.5 = Different topics")

In [None]:
# Cell 10: Analyze overall similarity patterns
print(" ANALYZING SIMILARITY PATTERNS\n")

# Query papers: a seeded (reproducible) random sample, capped at the corpus
# size so sampling without replacement cannot fail on a small dataset.
sample_size = min(100, len(embeddings))
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(embeddings), size=sample_size, replace=False)

# One call computes every sampled paper against the whole corpus:
# sim_matrix[i, j] = similarity(sample_indices[i], paper j).
sim_matrix = cosine_similarity(embeddings[sample_indices], embeddings)

# Remove each query's similarity with itself by position. A value threshold
# (e.g. "< 0.999") would also discard genuine near-duplicate papers and
# bias the statistics.
keep = np.ones(sim_matrix.shape, dtype=bool)
keep[np.arange(sample_size), sample_indices] = False
all_similarities = sim_matrix[keep]

mean_similarity = all_similarities.mean()

# The leading glyph was stored mis-encoded; restored to the intended character.
print(f"\n📈 Similarity Statistics (from {sample_size} papers):")
print(f"   Mean: {mean_similarity:.3f}")
print(f"   Median: {np.median(all_similarities):.3f}")
print(f"   Std: {all_similarities.std():.3f}")
print(f"   Min: {all_similarities.min():.3f}")
print(f"   Max: {all_similarities.max():.3f}")

# Plot distribution: histogram (left) and boxplot (right)
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(all_similarities, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
ax_hist.axvline(mean_similarity, color='red', linestyle='--',
                linewidth=2, label=f'Mean: {mean_similarity:.3f}')
ax_hist.set_xlabel('Cosine Similarity', fontsize=12)
ax_hist.set_ylabel('Frequency', fontsize=12)
ax_hist.set_title('Distribution of Paper Similarities', fontsize=14, fontweight='bold')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# Vertical orientation is matplotlib's default for boxplot
ax_box.boxplot(all_similarities)
ax_box.set_ylabel('Cosine Similarity', fontsize=12)
ax_box.set_title('Similarity Boxplot', fontsize=14, fontweight='bold')
ax_box.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Interpretation (thresholds are rules of thumb chosen for this notebook)
print(f"\n Interpretation:")
if mean_similarity > 0.3:
    print(f"     High average similarity - papers might be too similar")
elif mean_similarity < 0.1:
    print(f"     Low average similarity - embeddings might need tuning")
else:
    print(f"    Good distribution - embeddings capture meaningful differences")

In [None]:
# Cell 11: Visualize embeddings in 2D space
from sklearn.manifold import TSNE

print(" Creating 2D visualization of embeddings...")

# Use a subset for visualization (up to 1000 papers). The sample is seeded so
# the figure is reproducible (t-SNE below is already seeded), and capped at
# the corpus size so sampling without replacement cannot fail.
viz_sample_size = min(1000, len(embeddings))
viz_rng = np.random.default_rng(42)
viz_indices = viz_rng.choice(len(embeddings), size=viz_sample_size, replace=False)
embeddings_sample = embeddings[viz_indices]
df_viz = df.iloc[viz_indices].reset_index(drop=True)

# Reduce dimensions with t-SNE
print("   Running t-SNE dimensionality reduction...")
# scikit-learn requires perplexity to be smaller than the number of samples
perplexity = min(30, max(1, viz_sample_size - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, max_iter=1000)
embeddings_2d = tsne.fit_transform(embeddings_sample)

print("   Creating visualization...")

# Extract the first listed category for coloring; non-string or empty
# values fall back to 'unknown' instead of raising on split()[0].
df_viz['main_category'] = df_viz['categories'].apply(
    lambda x: (x.split() or ['unknown'])[0] if isinstance(x, str) else 'unknown'
)

# Keep the 6 most frequent categories; everything else is grouped as 'Other'
top_cats = df_viz['main_category'].value_counts().head(6).index.tolist()
df_viz['plot_category'] = df_viz['main_category'].apply(
    lambda x: x if x in top_cats else 'Other'
)

# Scatter plot, one color per plotted category
fig, ax = plt.subplots(figsize=(16, 10))

categories = df_viz['plot_category'].unique()
colors = plt.cm.Set3(np.linspace(0, 1, len(categories)))

for category, color in zip(categories, colors):
    mask = (df_viz['plot_category'] == category).to_numpy()
    ax.scatter(
        embeddings_2d[mask, 0],
        embeddings_2d[mask, 1],
        c=[color],
        label=category,
        alpha=0.6,
        s=50,
        edgecolors='black',
        linewidth=0.5
    )

ax.set_title('Research Paper Embeddings Visualization (t-SNE Projection)',
             fontsize=18, fontweight='bold', pad=20)
ax.set_xlabel('t-SNE Dimension 1', fontsize=14)
ax.set_ylabel('t-SNE Dimension 2', fontsize=14)
# Legend is anchored outside the axes so it does not cover points
ax.legend(title='Category', bbox_to_anchor=(1.05, 1),
          loc='upper left', fontsize=12, title_fontsize=13)
ax.grid(True, alpha=0.2)
fig.tight_layout()
plt.show()

print("\n Visualization complete!")
print("   - Each dot = one paper")
print("   - Close dots = similar content")
print("   - Colors = paper categories")
print("   - how papers cluster by topic!")

In [None]:
# Cell 12: Analyze how well categories cluster
print(" CATEGORY CLUSTERING ANALYSIS")
print("=" * 60)

from collections import defaultdict

# Analysis parameters
neighbors_k = 10          # nearest neighbours inspected per paper
papers_per_category = 20  # papers sampled from each major category

# Primary (first listed) category of every paper. Empty or missing values
# yield NaN here instead of raising on split()[0]; NaN never equals a
# category name, so such papers are simply not counted.
primary_category = df['categories'].fillna('').astype(str).str.split().str[0]

# The 5 most frequent primary categories
major_categories = primary_category.value_counts().head(5).index

category_quality = {}

for category in tqdm(major_categories, desc="Analyzing categories"):
    # Exact match on the primary category, consistent with how
    # major_categories was derived. A prefix test would also pull in any
    # category whose name merely starts with this one
    # (e.g. 'astro-ph' would match 'astro-ph.CO').
    cat_papers = df[primary_category == category]

    if len(cat_papers) < 10:
        continue

    # Sample papers from this category (seeded for reproducibility)
    n_sampled = min(papers_per_category, len(cat_papers))
    sampled = cat_papers.sample(n_sampled, random_state=42)

    # For each paper: fraction of its nearest neighbours in the same category
    same_category_counts = []

    for idx in sampled.index:
        # find_similar_papers expects a positional index
        paper_idx = df.index.get_loc(idx)
        similar = find_similar_papers(paper_idx, embeddings, df, top_k=neighbors_k)
        if not similar:
            continue

        same_cat = sum(
            1 for s in similar
            if primary_category.iloc[s['index']] == category
        )
        # Divide by the number of neighbours actually returned, which can
        # be smaller than neighbors_k on a tiny corpus.
        same_category_counts.append(same_cat / len(similar))

    # Average precision for this category
    if same_category_counts:
        category_quality[category] = np.mean(same_category_counts)

# Display results (glyphs below were stored mis-encoded; restored here)
print(f"\n📊 Category Clustering Quality:")
print(f"   (Higher = better category separation)\n")

for cat, score in sorted(category_quality.items(), key=lambda x: x[1], reverse=True):
    bar = '█' * int(score * 20)
    print(f"   {cat:20} : {score:.2%} {bar}")

# NaN (not a warning-raising mean of an empty list) when nothing qualified
overall_quality = (
    np.mean(list(category_quality.values())) if category_quality else float('nan')
)
print(f"\n   Overall Average: {overall_quality:.2%}")

if overall_quality > 0.6:
    print(f"\n    Excellent! Embeddings group similar papers well!")
elif overall_quality > 0.4:
    print(f"\n    Good! Embeddings are working correctly!")
else:
    print(f"\n     Moderate - but okay for learning purposes!")

In [None]:
# Cell 13: Complete summary

# In-memory size of the embedding matrix, reused in two report lines
embeddings_mb = embeddings.nbytes / (1024**2)

summary_lines = [
    "\n Final Statistics:",
    f"   Papers processed: {len(embeddings):,}",
    f"   Embedding dimension: {embeddings.shape[1]}",
    f"   Model used: {model_name}",
    f"   Total embeddings size: {embeddings_mb:.2f} MB",
    f"   Average similarity: {all_similarities.mean():.3f}",
    f"   Category clustering: {overall_quality:.2%}",
    "\n Files Created:",
    f"    data/embeddings/paper_embeddings_10k.npy ({embeddings_mb:.2f} MB)",
    "    data/embeddings/paper_ids_10k.json",
    "    data/embeddings/metadata_10k.json",
]
for summary_line in summary_lines:
    print(summary_line)

# Quick sanity check: look up the neighbours of one randomly chosen paper
test_idx = np.random.randint(0, len(df))
test_paper = df.iloc[test_idx]
similar = find_similar_papers(test_idx, embeddings, df, top_k=3)
best_match = similar[0]

print(f"\n   Query: {test_paper['title_clean'][:60]}...")
print(f"   Top match: {best_match['title'][:60]}...")
print(f"   Similarity: {best_match['similarity']:.3f}")

# 0.5 is the notebook's rule-of-thumb cut-off for a "similar" paper
verdict = (
    "\n    Embeddings are working correctly!"
    if best_match['similarity'] > 0.5
    else "\n     Lower similarity - but still usable!"
)
print(verdict)