In [None]:
import json
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import warnings

warnings.filterwarnings('ignore')
print("‚úì Libraries imported successfully!")

In [None]:
# Read the training corpus from disk and parse it as one JSON document.
# The path is relative to the notebook's working directory.
with open('data/train.json', mode='r', encoding='utf-8') as train_file:
    resumes_data = json.loads(train_file.read())

resume_count = len(resumes_data)
print(f"‚úì Loaded {resume_count} resumes")
print("‚úì Ready for training!")

In [None]:
print("[1/5] Extracting skills from dataset...")

# Collect the text of every span labelled 'SKILL' across all resumes.
# Each annotation is read as a sequence whose first three items are
# (start offset, end offset, label); the offsets slice into the resume text.
extracted_skills = []

for resume in resumes_data:
    # dict.get(key, default) only uses the default when the key is ABSENT.
    # A record with an explicit JSON null ("text": null / "annotations": null)
    # would yield None and crash the slicing / iteration below, so fall back
    # with `or` instead.
    text = resume.get('text') or ''
    annotations = resume.get('annotations') or []

    for ann in annotations:
        # Skip entries too short to carry a label, and any non-skill label.
        if len(ann) >= 3 and ann[2] == 'SKILL':
            start, end = ann[0], ann[1]
            # Normalise so that "Python " and "python" count as one skill.
            skill_text = text[start:end].strip().lower()
            # Drop empty and single-character spans.
            if len(skill_text) > 1:
                extracted_skills.append(skill_text)

# De-duplicate and sort for a stable, reproducible ordering.
unique_extracted_skills = sorted(set(extracted_skills))

print(f"‚úì Extracted {len(extracted_skills)} total skill mentions")
print(f"‚úì Found {len(unique_extracted_skills)} unique skills")
print(f"\nSample skills: {unique_extracted_skills[:20]}")

In [None]:
print("[2/5] Building comprehensive skill database...")

# Hand-curated tech skills, grouped by category. Dict insertion order is
# preserved, so flattening the groups yields one flat list in category order.
manual_skill_groups = {
    'Programming Languages': [
        'python', 'java', 'javascript', 'c++', 'c#', 'ruby', 'php', 'swift',
        'kotlin', 'go', 'rust', 'typescript', 'scala', 'r', 'matlab', 'perl',
    ],
    'Web Technologies': [
        'html', 'css', 'react', 'angular', 'vue', 'nodejs', 'django', 'flask',
        'spring', 'express', 'jquery', 'bootstrap', 'tailwind',
    ],
    'Databases': [
        'sql', 'mysql', 'postgresql', 'mongodb', 'oracle', 'redis', 'cassandra',
        'sqlite', 'mariadb', 'dynamodb', 'elasticsearch',
    ],
    'Cloud & DevOps': [
        'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'terraform',
        'ansible', 'ci/cd', 'travis ci', 'circleci', 'gitlab',
    ],
    'Data Science & ML': [
        'machine learning', 'deep learning', 'data analysis', 'pandas', 'numpy',
        'tensorflow', 'pytorch', 'scikit-learn', 'keras', 'opencv',
        'tableau', 'power bi', 'spark', 'hadoop',
    ],
    'Tools & Methodologies': [
        'git', 'github', 'jira', 'confluence', 'linux', 'unix', 'agile',
        'scrum', 'rest api', 'graphql', 'microservices', 'testing',
    ],
}
manual_tech_skills = [
    skill for group in manual_skill_groups.values() for skill in group
]

# Union of the dataset-derived and hand-curated skills, de-duplicated and sorted.
merged_skills = set(unique_extracted_skills)
merged_skills.update(manual_tech_skills)
complete_skill_database = sorted(merged_skills)

manual_count = len(manual_tech_skills)
database_size = len(complete_skill_database)
print(f"‚úì Manual skills added: {manual_count}")
print(f"‚úì Total unique skills in database: {database_size}")

In [None]:
print("[3/5] Training TF-IDF vectorizer...")

# One document per resume. `or ''` also covers an explicit JSON null, which
# dict.get's default would not replace and which the vectorizer cannot tokenise.
resume_texts = [resume.get('text') or '' for resume in resumes_data]

# Configure the TF-IDF vectorizer (parameter meanings per scikit-learn docs).
vectorizer = TfidfVectorizer(
    stop_words='english',   # drop scikit-learn's built-in English stop words
    ngram_range=(1, 2),     # use unigrams and bigrams
    max_features=1000,      # cap the vocabulary at 1000 terms
    min_df=2,               # ignore terms found in fewer than 2 documents
    max_df=0.8              # ignore terms found in more than 80% of documents
)

# fit_transform leaves the vectorizer in the same fitted state as fit(), and
# additionally returns the document-term matrix used to rank terms below.
tfidf_matrix = vectorizer.fit_transform(resume_texts)

print(f"‚úì TF-IDF vectorizer trained on {len(resume_texts)} resumes")
print(f"‚úì Vocabulary size: {len(vectorizer.vocabulary_)}")

# Rank terms by their summed TF-IDF weight over the whole corpus.
# The previous code printed the first 20 keys of `vocabulary_`, which is just
# dict order and not a ranking, so the "Top 20" label was misleading.
term_weights = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
index_to_term = {int(index): term for term, index in vectorizer.vocabulary_.items()}
top_indices = np.argsort(term_weights)[::-1][:20]
top_terms = [index_to_term[int(index)] for index in top_indices]
print(f"\nTop 20 TF-IDF terms: {top_terms}")

In [None]:
print("[4/5] Saving trained models...")

# Make sure the output folder exists before anything is written into it.
os.makedirs('models', exist_ok=True)
model_path = 'models/resume_scanner_model.pkl'

# Summary counts stored alongside the fitted objects.
training_stats = {
    'total_resumes': len(resumes_data),
    'total_skills': len(complete_skill_database),
    'extracted_skills': len(unique_extracted_skills),
    'manual_skills': len(manual_tech_skills),
    'vocabulary_size': len(vectorizer.vocabulary_),
}

# Single bundle: fitted vectorizer + skill list + stats.
model_package = dict(
    vectorizer=vectorizer,
    skill_database=complete_skill_database,
    training_stats=training_stats,
)

# Serialise the bundle with pickle.
with open(model_path, 'wb') as model_file:
    pickle.dump(model_package, model_file)

model_size_kb = os.path.getsize(model_path) / 1024
print("‚úì Model saved to: models/resume_scanner_model.pkl")
print(f"‚úì File size: {model_size_kb:.2f} KB")

In [None]:
print("[5/5] Saving skill database...")

# Human-readable export: a title, a 50-character rule, then one numbered
# skill per line (1-based, number right-aligned to width 4).
report_header = "COMPLETE SKILL DATABASE\n" + "=" * 50 + "\n\n"
numbered_lines = (
    f"{position:4d}. {skill_name}\n"
    for position, skill_name in enumerate(complete_skill_database, start=1)
)
with open('models/skill_database.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(report_header)
    text_file.writelines(numbered_lines)

print("‚úì Skill database saved to: models/skill_database.txt")

# Machine-readable export of the same list; non-ASCII characters are written
# as-is rather than escaped.
with open('models/skill_database.json', 'w', encoding='utf-8') as json_file:
    json.dump(complete_skill_database, json_file, indent=2, ensure_ascii=False)

print("‚úì Skill database saved to: models/skill_database.json")

In [None]:
print("Verifying model loading...")

# Read the pickled bundle back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('models/resume_scanner_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

print("‚úì Model loaded successfully!")
print("\nModel contents:")

# Report each component of the bundle in turn.
vectorizer_name = type(loaded_model['vectorizer']).__name__
print(f"  - Vectorizer: {vectorizer_name}")

skill_count = len(loaded_model['skill_database'])
print(f"  - Skills in database: {skill_count}")

saved_stats = loaded_model['training_stats']
print(f"  - Training stats: {saved_stats}")

# Smoke test: the restored vectorizer must be able to transform new text.
test_text = "Python developer with 5 years experience in machine learning and AWS"
restored_vectorizer = loaded_model['vectorizer']
test_vector = restored_vectorizer.transform([test_text])
print(f"\n‚úì Vectorizer test passed! Shape: {test_vector.shape}")

In [None]:
# Final report: banner, saved statistics, output files and capabilities.
banner = "=" * 70

print(banner)
print("TRAINING COMPLETE!")
print(banner)

stats = loaded_model['training_stats']

# (label, key into `stats`) pairs, printed in this order with thousands separators.
print("\nüìä TRAINING STATISTICS")
stat_rows = (
    ("Total Resumes Processed", 'total_resumes'),
    ("Skills Extracted from Data", 'extracted_skills'),
    ("Manual Skills Added", 'manual_skills'),
    ("Total Skills in Database", 'total_skills'),
    ("TF-IDF Vocabulary Size", 'vocabulary_size'),
)
for label, key in stat_rows:
    print(f"  {label}: {stats[key]:,}")

print("\nüìÅ OUTPUT FILES")
for output_path in (
    "models/resume_scanner_model.pkl",
    "models/skill_database.txt",
    "models/skill_database.json",
):
    print(f"  ‚úì {output_path}")

print("\nüéØ WHAT THIS MODEL CAN DO")
for capability in (
    "Extract skills from any resume",
    "Calculate text similarity with job descriptions",
    "Match resume skills with job requirements",
    "Generate ATS compatibility scores",
):
    print(f"  ‚úì {capability}")

print("\n" + banner)
print("Next: Run notebook '03_Resume_Analyzer.ipynb' to use the model!")
print(banner)