# Notebook 6: MAYINI Job Classifier
## ML-based job relevance scoring and filtering

**Purpose**: Classify jobs by relevance using a neural network.

**Can run independently?** ✅ YES (5 min)


## Installation & Setup

In [None]:
!pip install torch numpy pandas scikit-learn -q
import json
import pickle
import re
import zlib
from collections import Counter
from datetime import datetime
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"✓ Setup complete. Using device: {DEVICE}")

## Job Embedder (Feature Extraction)

In [None]:
class JobEmbedder:
    """
    Convert job postings into fixed-length feature vectors.

    The 300-dimensional embedding is the concatenation of six segments:

    ====================  =====  =========
    segment               width  positions
    ====================  =====  =========
    tech stack            50     0-49
    seniority level       10     50-59
    years of experience   10     60-69
    job type              10     70-79
    salary bucket         10     80-89
    description words     210    90-299
    ====================  =====  =========

    Missing or ``None`` job fields encode as all-zero segments.
    """

    # Segment widths; TECH_DIM + 4 * CATEGORY_DIM + TEXT_DIM == EMBEDDING_DIM.
    TECH_DIM = 50
    CATEGORY_DIM = 10
    TEXT_DIM = 210
    EMBEDDING_DIM = 300

    # Each salary bucket spans 100K; everything from 900K up shares the last one.
    SALARY_BUCKET_SIZE = 100000

    _EXPERIENCE_RE = re.compile(r'(\d+)\+?\s*(?:years?|yrs?)')
    # Amount after a dollar sign, with an optional K/M suffix directly attached
    # ("$150K", "$1.5M"). The trailing \b keeps "$150 monthly" from reading as M.
    _SALARY_RE = re.compile(r'\$\s*(\d[\d,]*(?:\.\d+)?)([kKmM]\b)?')
    _SALARY_MULTIPLIERS = {'k': 1000, 'm': 1000000}
    _STOP_WORDS = frozenset({'the', 'a', 'an', 'and', 'or', 'but', 'is', 'was'})

    def __init__(self):
        # Vocabulary for encoding
        self.tech_stack = ['Python', 'Java', 'JavaScript', 'React', 'Docker', 'AWS',
                           'Kubernetes', 'SQL', 'MongoDB', 'TensorFlow', 'PyTorch']

        # Keyword -> slot index inside the corresponding 10-wide segment.
        self.seniority_levels = {'entry': 0, 'junior': 1, 'mid': 2, 'senior': 3, 'lead': 4}
        self.job_types = {'full-time': 0, 'part-time': 1, 'contract': 2, 'remote': 3}

    def embed_job(self, job: Dict) -> np.ndarray:
        """
        Convert a job to its embedding vector.

        Args:
            job: Job dictionary. The keys read are ``skills``,
                ``seniority_level``, ``requirements``, ``job_type``,
                ``salary`` and ``description``; all are optional.

        Returns:
            float32 array of shape (300,)
        """
        embedding = []

        # 1. Tech stack encoding (50 dims)
        embedding.extend(self._encode_tech_stack(job.get('skills'))[:self.TECH_DIM])

        # 2. Seniority level encoding (10 dims)
        embedding.extend(self._encode_seniority(job.get('seniority_level')))

        # 3. Years of experience encoding (10 dims)
        embedding.extend(self._encode_experience(job.get('requirements')))

        # 4. Job type encoding (10 dims)
        embedding.extend(self._encode_job_type(job.get('job_type')))

        # 5. Salary encoding (10 dims)
        embedding.extend(self._encode_salary(job.get('salary')))

        # 6. Text features (210 dims - hashed description words)
        embedding.extend(self._encode_text(job.get('description'))[:self.TEXT_DIM])

        # Safety net: the segments already add up to 300, but truncate/pad so
        # the network always receives exactly EMBEDDING_DIM values.
        embedding = np.array(embedding[:self.EMBEDDING_DIM], dtype=np.float32)
        if len(embedding) < self.EMBEDDING_DIM:
            embedding = np.pad(embedding, (0, self.EMBEDDING_DIM - len(embedding)),
                               mode='constant')

        return embedding.astype(np.float32)

    def _encode_keywords(self, text, table: Dict[str, int]) -> List[float]:
        """Set the slot of every ``table`` keyword found as a substring of ``text``."""
        vector = [0.0] * self.CATEGORY_DIM
        lowered = str(text or '').lower()
        for keyword, index in table.items():
            if keyword in lowered:
                vector[index] = 1.0
        return vector

    def _encode_tech_stack(self, skills: List[str]) -> List[float]:
        """Multi-hot encode which known technologies appear in ``skills`` (case-insensitive)."""
        vector = [0.0] * self.TECH_DIM
        if isinstance(skills, str):
            # A bare string would otherwise be iterated character by character.
            skills = [skills]
        # Build the lookup set once instead of re-lowercasing per technology.
        wanted = {str(skill).lower() for skill in (skills or [])}
        for i, tech in enumerate(self.tech_stack):
            if tech.lower() in wanted:
                # Modulo keeps a vocabulary longer than TECH_DIM inside the segment.
                vector[i % self.TECH_DIM] = 1.0
        return vector

    def _encode_seniority(self, level: str) -> List[float]:
        """Encode seniority level by keyword match against ``self.seniority_levels``."""
        return self._encode_keywords(level, self.seniority_levels)

    def _encode_experience(self, requirements: List[str]) -> List[float]:
        """
        One-hot encode the first "N years"/"N yrs" figure found in the requirements.

        Values above 9 share the last slot.
        """
        vector = [0.0] * self.CATEGORY_DIM
        if isinstance(requirements, str):
            # Joining a bare string would interleave spaces and break the regex.
            requirements = [requirements]
        text = ' '.join(str(item) for item in (requirements or [])).lower()

        match = self._EXPERIENCE_RE.search(text)
        if match:
            years = min(int(match.group(1)), self.CATEGORY_DIM - 1)
            vector[years] = 1.0
        return vector

    def _encode_job_type(self, job_type: str) -> List[float]:
        """Encode job type by keyword match against ``self.job_types``."""
        return self._encode_keywords(job_type, self.job_types)

    def _encode_salary(self, salary: str) -> List[float]:
        """
        One-hot encode the first dollar amount in ``salary`` into 100K-wide buckets.

        "$150K", "$150,000" and "$0.15M" all land in bucket 1. A plain number
        is accepted as an amount in dollars. Unparseable, missing or negative
        values give an all-zero vector.
        """
        vector = [0.0] * self.CATEGORY_DIM
        if salary is None:
            return vector

        if isinstance(salary, (int, float)) and not isinstance(salary, bool):
            amount = float(salary)
        else:
            match = self._SALARY_RE.search(str(salary))
            if match is None:
                return vector
            amount = float(match.group(1).replace(',', ''))
            suffix = match.group(2)
            if suffix:
                amount *= self._SALARY_MULTIPLIERS[suffix.lower()]

        # Written as "not >=" so NaN is rejected together with negatives.
        if not amount >= 0:
            return vector

        bucket = int(min(amount / self.SALARY_BUCKET_SIZE, self.CATEGORY_DIM - 1))
        vector[bucket] = 1.0
        return vector

    def _encode_text(self, text: str) -> List[float]:
        """
        Hash description words into a 210-slot term-frequency vector.

        Words are lowercased and whitespace-split; stop words and words of
        three characters or fewer are dropped. Each slot holds
        ``min(count / 10, 1)`` for the most frequent word hashing into it.
        """
        vector = [0.0] * self.TEXT_DIM
        words = [
            word for word in str(text or '').lower().split()
            if word not in self._STOP_WORDS and len(word) > 3
        ]

        for word, count in Counter(words).most_common(self.TEXT_DIM):
            # crc32 is stable across interpreter runs; the built-in hash() is
            # salted per process for strings, which would make a trained model
            # see different features after every restart.
            idx = zlib.crc32(word.encode('utf-8', 'replace')) % self.TEXT_DIM
            # On a collision keep the stronger signal rather than the last write.
            vector[idx] = max(vector[idx], min(count / 10.0, 1.0))

        return vector

print("✓ JobEmbedder class created")

## MAYINI Classifier Neural Network

In [None]:
class MAYINIRelevanceClassifier(nn.Module):
    """
    Feed-forward binary classifier for job relevance.

    Three Linear -> ReLU -> Dropout stages (input_dim -> hidden_dim -> 64 -> 32)
    feed a single sigmoid unit, so every output lies in [0, 1]:
    relevant (1) or not relevant (0).
    """

    def __init__(self, input_dim: int = 300, hidden_dim: int = 128):
        super().__init__()

        # Stage 1: input_dim -> hidden_dim, dropout 0.3
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)

        # Stage 2: hidden_dim -> 64, dropout 0.2
        self.fc2 = nn.Linear(hidden_dim, 64)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)

        # Stage 3: 64 -> 32, dropout 0.2
        self.fc3 = nn.Linear(64, 32)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.2)

        # Head: 32 -> 1, squashed to a probability-like score
        self.fc4 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the network on a batch of embeddings.

        Args:
            x: Input tensor (batch_size, input_dim)

        Returns:
            Relevance score (batch_size, 1)
        """
        hidden_stages = (
            (self.fc1, self.relu1, self.dropout1),
            (self.fc2, self.relu2, self.dropout2),
            (self.fc3, self.relu3, self.dropout3),
        )
        for linear, activation, dropout in hidden_stages:
            x = dropout(activation(linear(x)))

        return self.sigmoid(self.fc4(x))

print("✓ MAYINI classifier network created")

## Job Relevance Classifier Wrapper

In [None]:
class JobRelevanceClassifier:
    """
    Main classifier for job relevance.

    Wraps a JobEmbedder (job dict -> 300-dim vector) and a
    MAYINIRelevanceClassifier (vector -> score in [0, 1]) together with the
    Adam optimizer and binary cross-entropy loss used to fine-tune the network.
    """

    def __init__(self, device=DEVICE):
        self.embedder = JobEmbedder()
        self.model = MAYINIRelevanceClassifier(input_dim=300, hidden_dim=128).to(device)
        self.device = device
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        # The network already ends in a sigmoid, hence BCELoss on probabilities.
        self.criterion = nn.BCELoss()
        self.user_preferences = []  # Track user interactions

    def _embed_to_tensor(self, job: Dict) -> torch.Tensor:
        """Embed one job and return it as a (1, 300) float tensor on ``self.device``."""
        embedding = self.embedder.embed_job(job)
        return torch.as_tensor(embedding, dtype=torch.float32).unsqueeze(0).to(self.device)

    def predict_relevance(self, job: Dict) -> float:
        """
        Predict relevance score for a job.

        Switches the model to eval mode (dropout disabled) before scoring.

        Args:
            job: Job dictionary

        Returns:
            Relevance score (0-1)
        """
        self.model.eval()

        with torch.no_grad():
            score = self.model(self._embed_to_tensor(job))
            return float(score.item())

    def batch_classify_jobs(self, jobs: List[Dict], threshold: float = 0.7) -> Dict:
        """
        Classify multiple jobs.

        Args:
            jobs: List of job dictionaries
            threshold: Relevance threshold (0-1); a score >= threshold counts
                as relevant

        Returns:
            Dict with ``relevant`` and ``irrelevant`` lists (each item is
            ``{'job', 'relevance_score'}``, sorted by descending score) and a
            ``statistics`` dict. For an empty ``jobs`` list the statistics fall
            back to pass_rate 0, avg_score 0.0, min_score 0 and max_score 1.
        """
        relevant = []
        irrelevant = []
        scores = []

        for job in jobs:
            score = self.predict_relevance(job)
            scores.append(score)

            item = {'job': job, 'relevance_score': score}
            if score >= threshold:
                relevant.append(item)
            else:
                irrelevant.append(item)

        def by_score(item):
            return item['relevance_score']

        return {
            'relevant': sorted(relevant, key=by_score, reverse=True),
            'irrelevant': sorted(irrelevant, key=by_score, reverse=True),
            'statistics': {
                'total_jobs': len(jobs),
                'relevant_count': len(relevant),
                'irrelevant_count': len(irrelevant),
                'pass_rate': len(relevant) / len(jobs) if jobs else 0,
                # np.mean([]) yields NaN plus a RuntimeWarning, so guard the
                # empty case like the neighbouring fields do.
                'avg_score': float(np.mean(scores)) if scores else 0.0,
                'min_score': min(scores) if scores else 0,
                'max_score': max(scores) if scores else 1,
            }
        }

    def train_on_user_history(self, job_applications: List[Dict], labels: List[int], epochs: int = 10):
        """
        Train classifier on user's job application history.

        Jobs and labels are paired positionally; if the two lists differ in
        length the extra entries of the longer one are ignored. One optimizer
        step is taken per (job, label) pair per epoch, and the average loss is
        printed every fifth epoch. The model is left in training mode;
        ``predict_relevance`` switches it back to eval mode on its own.

        Args:
            job_applications: List of jobs user applied to
            labels: 1 for relevant/applied, 0 for irrelevant
            epochs: Number of training epochs
        """
        self.model.train()

        # Embeddings do not change between epochs, so build the tensors once.
        samples = [
            (
                self._embed_to_tensor(job),
                torch.tensor([[float(label)]], dtype=torch.float32, device=self.device),
            )
            for job, label in zip(job_applications, labels)
        ]

        for epoch in range(epochs):
            total_loss = 0.0

            for features, target in samples:
                # Forward
                output = self.model(features)
                loss = self.criterion(output, target)

                # Backward
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            # Average over the pairs actually trained on, which can be fewer
            # than len(job_applications) when labels is the shorter list.
            avg_loss = total_loss / len(samples) if samples else 0
            if (epoch + 1) % 5 == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

print("✓ JobRelevanceClassifier wrapper created")

## Testing & Demo

In [None]:
# Load sample jobs written by the scraper notebook; fall back to one mock job.
# NOTE(review): pickle.load can execute arbitrary code while loading, so this
# file must come from a trusted run of the scraper notebook.
try:
    with open('/tmp/job_scraper_module.pkl', 'rb') as f:
        scraper_data = pickle.load(f)
    sample_jobs = scraper_data['sample_jobs']
except Exception as exc:
    # Any load failure (missing file, stale pickle, missing key) triggers the
    # fallback, but Ctrl-C / SystemExit are no longer swallowed and the reason
    # is reported instead of being hidden.
    print("Creating mock jobs...")
    print(f"  (scraper output unavailable: {type(exc).__name__}: {exc})")
    sample_jobs = [
        {
            'title': 'Python Developer',
            'company': 'Tech Corp',
            'location': 'SF',
            'skills': ['Python', 'Docker', 'AWS'],
            'seniority_level': 'Senior',
            'job_type': 'Full-time',
            'salary': '$150K - $200K',
            'requirements': ['5+ years Python', 'Docker knowledge'],
            'description': 'Looking for senior Python developer...'
        }
    ]

print(f"✓ Loaded {len(sample_jobs)} sample jobs")

# Initialize classifier
classifier = JobRelevanceClassifier()
print("✓ Classifier initialized")

# Test 1: Predict single job
print("\nTest 1: Predicting relevance...")
for i, job in enumerate(sample_jobs[:3]):
    score = classifier.predict_relevance(job)
    print(f"  Job {i+1} ({job['title']}): {score:.3f}")

# Test 2: Batch classification
print("\nTest 2: Batch classification...")
results = classifier.batch_classify_jobs(sample_jobs, threshold=0.5)
print(f"  Total jobs: {results['statistics']['total_jobs']}")
print(f"  Relevant: {results['statistics']['relevant_count']}")
print(f"  Pass rate: {results['statistics']['pass_rate']:.1%}")
print(f"  Avg score: {results['statistics']['avg_score']:.3f}")

# Test 3: Train on user history
print("\nTest 3: Training on user history...")
training_jobs = sample_jobs[:5]
# Demo labels standing in for the user's preferences; trimmed so there is
# exactly one label per job even when fewer than five jobs were loaded.
training_labels = [1, 1, 0, 1, 0][:len(training_jobs)]
classifier.train_on_user_history(training_jobs, training_labels, epochs=10)

print("\n✅ All classifier tests passed!")

## Export Classifier

In [None]:
# Bundle the classes, the live classifier and the latest batch results so a
# later notebook can pick them up from /tmp.
# NOTE: pickle stores classes by reference (module + qualified name), so the
# loading side needs the same class definitions available to unpickle this.
classifier_data = dict(
    JobRelevanceClassifier=JobRelevanceClassifier,
    MAYINIRelevanceClassifier=MAYINIRelevanceClassifier,
    JobEmbedder=JobEmbedder,
    classifier_instance=classifier,
    classification_results=results,
)

with open('/tmp/job_classifier_module.pkl', 'wb') as f:
    pickle.dump(classifier_data, f)

with open('/tmp/classification_results.json', 'w') as f:
    # The JSON summary keeps the statistics and the two list sizes only,
    # not the job dictionaries themselves.
    json_results = dict(
        statistics=results['statistics'],
        relevant_count=len(results['relevant']),
        irrelevant_count=len(results['irrelevant']),
    )
    json.dump(json_results, f, indent=2)

print("✓ Job classifier exported to /tmp/job_classifier_module.pkl")
print("✓ Results saved to /tmp/classification_results.json")

## Summary

✅ **Notebook 6 Complete**

### Features:
- Job embedder (300-dimensional vectors)
- Neural network classifier (3 hidden layers)
- Batch classification
- User history training
- Relevance scoring

**Ready for use in Notebook 7 (Application Agent)**