In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
import joblib
import os
from typing import List, Dict, Any

class JobRecommendationModel:
    """
    Final Model following Phase 2 Workflow:
    Data Cleaning -> Data Preprocessing -> Job Recommendation -> Skill Gap Analysis
    """

    def __init__(self, model_path="models/job_recommender.joblib"):
        self.model_path = model_path
        self.tfidf_vectorizer = None
        self.skill_mlb = MultiLabelBinarizer()
        self.job_embeddings = None
        self.job_metadata = None

    def _clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Step 1: Data Cleaning (Removes nulls and standardizes text)"""
        df = df.dropna(subset=['title', 'skills']).copy()
        # Convert skills to lowercase list for consistent matching
        df['skills_list'] = df['skills'].apply(
            lambda x: [s.strip().lower() for s in str(x).split(',')]
        )
        df['description'] = df['description'].fillna('')
        df['location'] = df['location'].fillna('Remote')
        return df

    def _create_features(self, df: pd.DataFrame):
        """Step 2: Data Preprocessing (TF-IDF + Skill Encoding)"""
        # Text Context (Title + Description)
        text_corpus = df['title'] + " " + df['description']
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        text_features = self.tfidf_vectorizer.fit_transform(text_corpus).toarray()

        # Skill Encoding
        skill_features = self.skill_mlb.fit_transform(df['skills_list'])

        # Combine and Normalize
        combined = np.hstack([text_features, skill_features])
        # L2 Normalization makes Cosine Similarity just a dot product
        norm = np.linalg.norm(combined, axis=1, keepdims=True) + 1e-8
        return combined / norm

    def train(self, csv_path: str):
        """Full Training Pipeline"""
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"Missing dataset at {csv_path}")

        df = pd.read_csv(csv_path)
        df = self._clean_data(df)
        self.job_embeddings = self._create_features(df)
        self.job_metadata = df.reset_index(drop=True)

        # Ensure directory exists and save
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)
        joblib.dump({
            'tfidf': self.tfidf_vectorizer,
            'mlb': self.skill_mlb,
            'embeddings': self.job_embeddings,
            'metadata': self.job_metadata
        }, self.model_path)
        print(f"âœ… Model trained and saved to {self.model_path}")

    def load(self):
        """Load the saved components"""
        if not os.path.exists(self.model_path):
            return False
        data = joblib.load(self.model_path)
        self.tfidf_vectorizer = data['tfidf']
        self.skill_mlb = data['mlb']
        self.job_embeddings = data['embeddings']
        self.job_metadata = data['metadata']
        return True

    def recommend(self, user_skills: List[str], top_k=3) -> List[Dict]:
        """Step 3: Job Recommendation (Similarity Matching)"""
        if self.job_embeddings is None and not self.load():
            return []

        # Preprocess User Input
        user_skills_clean = [s.strip().lower() for s in user_skills]
        user_text_vec = self.tfidf_vectorizer.transform([" ".join(user_skills_clean)]).toarray()
        
        # Handle unseen skills gracefully
        try:
            user_skill_vec = self.skill_mlb.transform([user_skills_clean])
        except ValueError: # If skills are completely new
            user_skill_vec = np.zeros((1, len(self.skill_mlb.classes_)))

        user_vec = np.hstack([user_text_vec, user_skill_vec])
        user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8)

        # Calculate Similarity
        scores = np.dot(self.job_embeddings, user_vec.T).flatten()
        top_idx = scores.argsort()[-top_k:][::-1]

        # Step 4: Skill Gap Analysis
        recommendations = []
        for idx in top_idx:
            job = self.job_metadata.iloc[idx]
            job_skills = set(job['skills_list'])
            user_set = set(user_skills_clean)
            
            missing = list(job_skills - user_set)
            match_pct = (len(job_skills - set(missing)) / len(job_skills)) * 100

            recommendations.append({
                "job_title": job['title'],
                "company": job['company'],
                "match_score": round(float(scores[idx]) * 100, 1),
                "skill_match_percent": round(match_pct, 1),
                "missing_skills": missing[:3], # Show top 3 missing
                "location": job['location']
            })
        return recommendations

# --- Execution ---
if __name__ == "__main__":
    # 1. Create directory structure
    os.makedirs("datasets", exist_ok=True)
    
    # 2. Sample Data for testing
    data = {
        "title": ["Data Scientist", "Web Developer", "Cloud Engineer"],
        "company": ["Google", "Meta", "Amazon"],
        "description": ["Analyze data and build ML models", "Build React apps", "Manage AWS infra"],
        "skills": ["Python, SQL, Machine Learning", "JavaScript, React, CSS", "AWS, Docker, Linux"],
        "location": ["Remote", "NY", "Seattle"]
    }
    pd.DataFrame(data).to_csv("datasets/job_postings.csv", index=False)

    # 3. Initialize and Run
    model = JobRecommendationModel()
    model.train("datasets/job_postings.csv")
    
    # 4. Get Recommendations
    results = model.recommend(["Python", "SQL"])
    
    print("\nðŸ“‹ FINAL RECOMMENDATIONS:")
    for r in results:
        print(f"ðŸ”¹ {r['job_title']} at {r['company']} ({r['match_score']}% Match)")
        if r['missing_skills']:
            print(f"   ðŸ’¡ Tip: Learn {', '.join(r['missing_skills'])} to improve fit.")

âœ… Model trained and saved to models/job_recommender.joblib

ðŸ“‹ FINAL RECOMMENDATIONS:
ðŸ”¹ Data Scientist at Google (70.7% Match)
   ðŸ’¡ Tip: Learn machine learning to improve fit.
ðŸ”¹ Cloud Engineer at Amazon (0.0% Match)
   ðŸ’¡ Tip: Learn aws, linux, docker to improve fit.
ðŸ”¹ Web Developer at Meta (0.0% Match)
   ðŸ’¡ Tip: Learn react, javascript, css to improve fit.
