# Resume Processing with Sentence Transformers (BERT)
This notebook processes resume data from UpdatedResumeDataSet.csv using sentence-transformers to generate embeddings.

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from datetime import datetime
import pickle
import os

## Load Raw Data

In [None]:
# Read the raw resume CSV, then report its row count, column names, and
# how many resumes fall under each Category label.
raw_data = pd.read_csv('UpdatedResumeDataSet.csv')
print(
    f"Total resumes: {len(raw_data)}",
    f"\nColumns: {raw_data.columns.tolist()}",
    f"\nCategory distribution:",
    raw_data['Category'].value_counts(),
    sep="\n",
)

In [None]:
# Preview the first five rows of the loaded dataset
raw_data.head(n=5)

## Initialize Sentence Transformer Model

In [None]:
# Instantiate the sentence-transformers encoder (noted by the author as the
# same model the backend uses) and report its embedding dimension.
print("Loading sentence-transformers model: all-MiniLM-L6-v2")
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
print(f"Model loaded. Embedding dimension: {model.get_sentence_embedding_dimension()}")

## Text Preprocessing

In [None]:
def clean_text(text):
    """Collapse resume text into one whitespace-normalized line.

    Non-string input (for example a missing value) yields an empty string.
    Escaped backslash-n markers and every run of whitespace, real newlines
    included, become a single space; the result has no leading or trailing
    whitespace.
    """
    if isinstance(text, str):
        # Turn escaped "\n" markers into spaces; real newlines need no special
        # handling because str.split() already treats them as whitespace.
        unescaped = text.replace('\\n', ' ')
        return ' '.join(unescaped.split())
    return ""

# Store a cleaned copy of every resume in a new 'Resume_Cleaned' column
raw_data['Resume_Cleaned'] = raw_data['Resume'].apply(clean_text)

# Rows whose cleaned text is the empty string
is_empty_mask = raw_data['Resume_Cleaned'].eq('')
empty_resumes = raw_data.loc[is_empty_mask]

# Report the empty count and show the start of the first cleaned resume
print(
    f"Empty resumes after cleaning: {len(empty_resumes)}",
    f"\nSample cleaned text (first 500 chars):\n{raw_data['Resume_Cleaned'][0][:500]}",
    sep="\n",
)

## Generate Embeddings

In [None]:
# Encode the cleaned resumes in fixed-size batches and stack the per-batch
# results into one array.
print(f"Starting embedding generation at {datetime.now()}")
print("This may take a few minutes...\n")

batch_size = 32
embeddings_list = []
cleaned_texts = raw_data['Resume_Cleaned']

# batch_number counts batches from 1; i is the offset of the batch's first row
for batch_number, i in enumerate(range(0, len(raw_data), batch_size), start=1):
    batch_texts = cleaned_texts[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch_texts, show_progress_bar=False)
    embeddings_list.append(batch_embeddings)

    # Progress line after every tenth batch
    if batch_number % 10 == 0:
        print(f"Processed {i + len(batch_texts)}/{len(raw_data)} resumes")

# Stack the per-batch arrays row-wise
embeddings = np.vstack(embeddings_list)

print(
    f"\nCompleted embedding generation at {datetime.now()}",
    f"Embeddings shape: {embeddings.shape}",
    f"Embedding dimension: {embeddings.shape[1]}",
    sep="\n",
)

## Create Processed DataFrame

In [None]:
# Build one wide table: three metadata columns followed by one column per
# embedding dimension (embedding_0, embedding_1, ...).
embedding_dim = embeddings.shape[1]
embedding_columns = [f'embedding_{i}' for i in range(embedding_dim)]

# Metadata: sequential id, category label, and cleaned resume text
metadata_part = pd.DataFrame({
    'resume_id': range(len(raw_data)),
    'category': raw_data['Category'],
    'resume_text': raw_data['Resume_Cleaned'],
})

# Embedding matrix as a frame, then place it beside the metadata column-wise
embedding_df = pd.DataFrame(embeddings, columns=embedding_columns)
processed_data = pd.concat([metadata_part, embedding_df], axis=1)

print(
    f"Processed data shape: {processed_data.shape}",
    f"Columns: {len(processed_data.columns)}",
    f"  - Metadata columns: 3 (resume_id, category, resume_text)",
    f"  - Embedding columns: {embedding_dim}",
    sep="\n",
)

processed_data.head()

## Data Statistics

In [None]:
# Character-length statistics of the cleaned resume text
text_lengths = processed_data['resume_text'].str.len()
print(
    "Resume text length statistics:",
    f"  Mean: {text_lengths.mean():.0f} characters",
    f"  Median: {text_lengths.median():.0f} characters",
    f"  Min: {text_lengths.min()} characters",
    f"  Max: {text_lengths.max()} characters",
    sep="\n",
)

# Aggregate statistics over every value in the embedding_* columns
embedding_cols = [
    column_name
    for column_name in processed_data.columns
    if column_name.startswith('embedding_')
]
embedding_values = processed_data[embedding_cols].to_numpy()
print(
    f"\nEmbedding statistics:",
    f"  Mean: {embedding_values.mean():.4f}",
    f"  Std: {embedding_values.std():.4f}",
    f"  Min: {embedding_values.min():.4f}",
    f"  Max: {embedding_values.max():.4f}",
    sep="\n",
)

## Save Processed Data

In [None]:
# Write the full table (metadata + embedding columns) to CSV without the
# index, then report the path and on-disk size in MB.
output_csv = 'processed_resumes_with_embeddings.csv'
processed_data.to_csv(output_csv, index=False)
print(
    f"Saved processed data to: {output_csv}",
    f"File size: {os.path.getsize(output_csv) / (1024*1024):.2f} MB",
    sep="\n",
)

In [None]:
# Persist the raw embedding array in NumPy's .npy format (the author's
# stated goal is faster loading), then report the path and size in MB.
output_npy = 'resume_embeddings.npy'
np.save(output_npy, embeddings)
print(
    f"Saved embeddings to: {output_npy}",
    f"File size: {os.path.getsize(output_npy) / (1024*1024):.2f} MB",
    sep="\n",
)

In [None]:
# Write only the three metadata columns (no embeddings) to their own CSV
metadata_columns = ['resume_id', 'category', 'resume_text']
metadata_df = processed_data[metadata_columns].copy()
output_metadata = 'resume_metadata.csv'
metadata_df.to_csv(output_metadata, index=False)
print(
    f"Saved metadata to: {output_metadata}",
    f"File size: {os.path.getsize(output_metadata) / (1024*1024):.2f} MB",
    sep="\n",
)

In [None]:
# Bundle metadata, embeddings, and provenance (embedding size, model name,
# timestamp) into one dict and pickle it to a single file.
output_pickle = 'processed_resumes_complete.pkl'
pickle_payload = {
    'metadata': metadata_df,
    'embeddings': embeddings,
    'embedding_dim': embedding_dim,
    'model_name': 'sentence-transformers/all-MiniLM-L6-v2',
    'processed_date': datetime.now().isoformat(),
}
with open(output_pickle, 'wb') as f:
    pickle.dump(pickle_payload, f)
print(
    f"Saved complete data to: {output_pickle}",
    f"File size: {os.path.getsize(output_pickle) / (1024*1024):.2f} MB",
    sep="\n",
)

## Verify Saved Data

In [None]:
# Reload each saved artifact, print what was loaded, and compare it with the
# in-memory data so the success message is only shown when the files match.
print("Testing data loading...\n")

# Load CSV
test_csv = pd.read_csv(output_csv)
print(f"CSV loaded: {test_csv.shape}")

# Load numpy embeddings
test_npy = np.load(output_npy)
print(f"NumPy embeddings loaded: {test_npy.shape}")

# Load pickle. NOTE: pickle.load executes arbitrary code from the file; it is
# acceptable here only because this notebook wrote the file itself.
with open(output_pickle, 'rb') as f:
    test_pickle = pickle.load(f)
print(f"Pickle loaded:")
print(f"  - Metadata: {test_pickle['metadata'].shape}")
print(f"  - Embeddings: {test_pickle['embeddings'].shape}")
print(f"  - Model: {test_pickle['model_name']}")
print(f"  - Processed: {test_pickle['processed_date']}")

# Compare the reloaded artifacts with the objects they were written from
verification_checks = {
    "CSV shape matches processed data":
        test_csv.shape == processed_data.shape,
    "NumPy embeddings match in-memory embeddings":
        np.array_equal(test_npy, embeddings),
    "Pickle embeddings match in-memory embeddings":
        np.array_equal(test_pickle['embeddings'], embeddings),
    "Pickle metadata shape matches metadata":
        test_pickle['metadata'].shape == metadata_df.shape,
}
failed_checks = [
    description
    for description, passed in verification_checks.items()
    if not passed
]

if failed_checks:
    # Report instead of raising so the remaining notebook cells still run
    print("\n✗ Verification failed:")
    for failed_check in failed_checks:
        print(f"  - {failed_check}")
else:
    print("\n✓ All files saved and verified successfully!")

## Summary

In [None]:
# Final recap: counts, model, embedding size, and the four output files,
# framed by a 60-character rule.
summary_rule = "=" * 60
print(
    summary_rule,
    "PROCESSING SUMMARY",
    summary_rule,
    f"Total resumes processed: {len(processed_data)}",
    f"Embedding model: sentence-transformers/all-MiniLM-L6-v2",
    f"Embedding dimension: {embedding_dim}",
    f"Categories: {processed_data['category'].nunique()}",
    f"\nOutput files:",
    f"  1. {output_csv} - Full data with embeddings",
    f"  2. {output_npy} - Embeddings only (NumPy array)",
    f"  3. {output_metadata} - Metadata only (no embeddings)",
    f"  4. {output_pickle} - Complete data (Python pickle)",
    summary_rule,
    sep="\n",
)