In [4]:
import pandas as pd
import json
import joblib # For saving models
import os # For path manipulation
import fitz # PyMuPDF for PDF extraction
import docx2txt # For DOCX extraction (though not directly used for Mehyaar PDFs, good to have)
import re
import spacy
from spacy.tokens import DocBin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset # For loading Hugging Face datasets

# Ensure spaCy model is downloaded (if not already)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy 'en_core_web_sm' model...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# --- 0. Verify Current Working Directory ---
print(f"Current Working Directory: {os.getcwd()}")
print("-" * 50)

# --- Utility Functions for Text Extraction ---
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text += page.get_text()
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
    return text

def extract_text_from_docx(docx_path):
    text = ""
    try:
        text = docx2txt.process(docx_path)
    except Exception as e:
        print(f"Error reading DOCX {docx_path}: {e}")
    return text

# --- 1. Load and Inspect master_resumes.jsonl ---
print("--- Loading master_resumes.jsonl ---")
master_resumes_df = pd.DataFrame() # Initialize empty DataFrame
try:
    resumes_data = []
    # MODIFIED PATH: Using ../data/ to go up one level then into data/
    with open('../data/master_resumes.jsonl', 'r', encoding='utf-8') as f:
        for line in f:
            resumes_data.append(json.loads(line))
    master_resumes_df = pd.DataFrame(resumes_data)
    print("master_resumes.jsonl loaded successfully!")
    print(f"Number of samples: {len(master_resumes_df)}")
    print("\nFirst 3 rows of master_resumes_df:")
    print(master_resumes_df.head(3))
    print("-" * 50)
except FileNotFoundError:
    print("Error: '../data/master_resumes.jsonl' not found. Please check path and file existence.")
except Exception as e:
    print(f"An unexpected error occurred while loading master_resumes.jsonl: {e}")

# --- 2. Load and Inspect training_data.csv ---
print("--- Loading training_data.csv (for Job Description analysis) ---")
training_data_df = pd.DataFrame()
try:
    # MODIFIED PATH
    training_data_df = pd.read_csv('../data/training_data.csv')
    print("training_data.csv loaded successfully!")
    print(f"Number of samples: {len(training_data_df)}")
    print("\nFirst 3 rows of training_data_df:")
    print(training_data_df.head(3))
    print("-" * 50)
except FileNotFoundError:
    print("Error: '../data/training_data.csv' not found. Please check path and file existence.")
except Exception as e:
    print(f"An unexpected error occurred while loading training_data.csv: {e}")


# --- 3. Load and Inspect roles-based-on-skills.csv ---
print("--- Loading roles-based-on-skills.csv (for Job Role Classification) ---")
roles_skills_df = pd.DataFrame()
try:
    # MODIFIED PATH
    roles_skills_df = pd.read_csv('../data/roles-based-on-skills.csv')
    print("roles-based-on-skills.csv loaded successfully!")
    roles_skills_df.rename(columns={'Target': 'Job Role'}, inplace=True)
    print(f"Number of samples: {len(roles_skills_df)}")
    print("\nFirst 3 rows of roles_skills_df after renaming:")
    print(roles_skills_df.head(3))
    print("-" * 50)
except FileNotFoundError:
    print("Error: '../data/roles-based-on-skills.csv' not found. Please check path and file existence.")
except Exception as e:
    print(f"An unexpected error occurred while loading roles-based-on-skills.csv: {e}")


# --- 4. Load Mehyaar/Annotated_NER_PDF_Resumes (Downloaded PDF files) ---
print("--- Loading Mehyaar/Annotated_NER_PDF_Resumes (PDFs) ---")
# MODIFIED PATH: Relative to notebooks/
mehyaar_pdf_dir = '../data/mehyaar_resumes_pdf/'
mehyaar_extracted_data = []

if os.path.exists(mehyaar_pdf_dir) and os.listdir(mehyaar_pdf_dir):
    pdf_files = [f for f in os.listdir(mehyaar_pdf_dir) if f.lower().endswith('.pdf')]
    print(f"Found {len(pdf_files)} PDF files in {mehyaar_pdf_dir}. Extracting text...")
    for i, pdf_file in enumerate(pdf_files):
        if i >= 50: # Limit processing for initial exploration to 50 files for faster loading
            print("Limiting to first 50 PDFs for faster loading during exploration.")
            break
        pdf_path = os.path.join(mehyaar_pdf_dir, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        mehyaar_extracted_data.append({"filename": pdf_file, "text": text})

    mehyaar_pdfs_df = pd.DataFrame(mehyaar_extracted_data)
    print(f"Extracted text from {len(mehyaar_pdfs_df)} Mehyaar PDF resumes.")
    print("\nFirst row of Mehyaar PDFs extracted text (first 500 chars):")
    if not mehyaar_pdfs_df.empty:
        print(mehyaar_pdfs_df.iloc[0]['text'][:500] + "...")
    print("-" * 50)
else:
    print(f"Error: '{mehyaar_pdf_dir}' not found or empty. Please ensure Mehyaar PDF resumes are placed there.")


# --- 5. Load spacy_training_data (for custom NER training) ---
print("--- Loading spacy_training_data (train.spacy and dev.spacy) ---")
# MODIFIED PATHS
train_spacy_path = '../data/spacy_training_data/train.spacy'
dev_spacy_path = '../data/spacy_training_data/dev.spacy'

train_docs = None
dev_docs = None

if os.path.exists(train_spacy_path) and os.path.exists(dev_spacy_path):
    try:
        # Load DocBin files
        doc_bin = DocBin().from_disk(train_spacy_path)
        train_docs = list(doc_bin.get_docs(nlp.vocab)) # Use your loaded nlp vocab
        print(f"Loaded {len(train_docs)} training docs from train.spacy")

        doc_bin = DocBin().from_disk(dev_spacy_path)
        dev_docs = list(doc_bin.get_docs(nlp.vocab)) # Use your loaded nlp vocab
        print(f"Loaded {len(dev_docs)} development docs from dev.spacy")

        print("\nFirst document from train.spacy (text and entities):")
        if train_docs:
            sample_doc = train_docs[0]
            print(f"Text: {sample_doc.text[:200]}...")
            print("Entities:")
            for ent in sample_doc.ents:
                print(f"  - {ent.text} ({ent.label_})")
        print("-" * 50)

    except Exception as e:
        print(f"Error loading spaCy DocBin files: {e}")
        print("Please ensure 'train.spacy' and 'dev.spacy' are valid spaCy DocBin files.")
else:
    print(f"Error: '{train_spacy_path}' or '{dev_spacy_path}' not found. Please ensure spaCy training data is placed there.")


# --- 6. Load Job Matcher Data (cnamuangtoun/resume-job-description-fit alternative) ---
print("--- Loading Job Matcher Data (cnamuangtoun/resume-job-description-fit alternative) ---")
# MODIFIED PATHS
job_match_train_csv_path = '../data/job_match_data/train.csv' # Adjust if file names are different
job_match_test_csv_path = '../data/job_match_data/test.csv' # Adjust if file names are different

job_match_train_df = pd.DataFrame()
job_match_test_df = pd.DataFrame()

if os.path.exists(job_match_train_csv_path) and os.path.exists(job_match_test_csv_path):
    try:
        job_match_train_df = pd.read_csv(job_match_train_csv_path)
        job_match_test_df = pd.read_csv(job_match_test_csv_path)
        print("Job Matcher data (train.csv and test.csv) loaded successfully!")
        print(f"Train samples: {len(job_match_train_df)}, Test samples: {len(job_match_test_df)}")
        print("\nFirst 3 rows of Job Matcher training data:")
        print(job_match_train_df.head(3))
        print("\nColumns in Job Matcher training data:")
        print(job_match_train_df.columns.tolist())
        print("-" * 50)
    except Exception as e:
        print(f"Error loading Job Matcher CSV files: {e}")
else:
    print(f"Error: '{job_match_train_csv_path}' or '{job_match_test_csv_path}' not found.")
    print("Please ensure 'cnamuangtoun/resume-job-description-fit' (train.csv, test.csv) are downloaded and placed in 'data/job_match_data/'.")


# --- Job Role Classification Model Training (Existing Logic, remains the same) ---
if not roles_skills_df.empty:
    print("\n--- Training Job Role Classification Model ---")
    X = roles_skills_df['ALL']
    y = roles_skills_df['Job Role']

    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_tfidf = tfidf_vectorizer.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

    model = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')
    model.fit(X_train, y_train)
    print("Job Role Classification Model training complete!")

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    models_dir = '../backend/models' # MODIFIED PATH
    os.makedirs(models_dir, exist_ok=True)
    joblib.dump(tfidf_vectorizer, os.path.join(models_dir, 'tfidf_vectorizer.joblib'))
    joblib.dump(model, os.path.join(models_dir, 'job_role_classifier_model.joblib'))
    print("Job Role Classifier Model and Vectorizer saved.")
    print("-" * 50)
else:
    print("\nSkipping Job Role Classification: 'roles-based-on-skills.csv' not loaded or empty.")


# --- Placeholder for NER Model Training (Next Major Step) ---
print("\n--- Placeholder for NER Model Training (Next Major Step) ---")
if train_docs and dev_docs:
    print("Data for spaCy NER training is loaded. Next, you would train a custom NER model using spaCy's training pipeline.")
    print("This involves creating a spaCy config file, and running `python -m spacy train config.cfg --output ../backend/models/ner_model --paths.train ../data/spacy_training_data/train.spacy --paths.dev ../data/spacy_training_data/dev.spacy`") # MODIFIED PATHS in command
    print("You might also process 'mehyaar_pdfs_df' further to generate more NER training data if its raw PDFs come with corresponding annotations.")
else:
    print("Skipping NER Model Training placeholder: spaCy training data not fully loaded.")
print("-" * 50)


# --- Placeholder for Job Matcher Model Training ---
print("\n--- Placeholder for Job Matcher Model Training (Next Major Step) ---")
if not job_match_train_df.empty and not job_match_test_df.empty:
    print("Data for Job Matcher model training (cnamuangtoun) is loaded.")
    print("This typically involves:")
    print("1. Preprocessing 'resume_text' and 'job_description' columns from 'job_match_train_df' and 'job_match_test_df'.")
    print("2. Engineering features (e.g., embeddings from pre-trained models like BERT/RoBERTa using the 'transformers' library).")
    print("3. Defining a target variable for 'fit' (e.g., if there's a 'score' or 'match_label' column).")
    print("4. Training a classification or regression model to predict job fit (e.g., a Siamese network, or simple classifier on concatenated embeddings).")
else:
    print("Skipping Job Matcher Model Training placeholder: Job Matcher dataset not fully loaded.")
print("-" * 50)


print("\n--- All initial data analysis and foundational training setup complete. ---")
print("You are now ready to dive into detailed NER and Job Matcher model development!")

Current Working Directory: D:\RESUME-ANALYSER-PROJECT\notebooks
--------------------------------------------------
--- Loading master_resumes.jsonl ---
master_resumes.jsonl loaded successfully!
Number of samples: 4817

First 3 rows of master_resumes_df:
                                       personal_info  \
0  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
1  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
2  {'name': 'Not Provided', 'email': 'Not Provide...   

                                          experience  \
0  [{'company': 'Fresher', 'company_info': {'indu...   
1  [{'company': 'Delta Controls, Dubai FZCO', 'co...   
2  [{'company': 'Parkar Consulting and Labs', 'co...   

                                           education  \
0  [{'degree': {'level': 'ME', 'field': 'Computer...   
1  [{'degree': {'level': 'B.E', 'field': 'Electro...   
2  [{'degree': {'level': 'B.E.', 'field': 'Not Pr...   

                                              skills  \
0  {'technical'



Job Role Classification Model training complete!
Accuracy: 0.8908
Classification Report:
                           precision    recall  f1-score   support

         Business Analyst       0.93      0.96      0.94        98
           Cyber Security       0.97      0.91      0.94        78
            Data Engineer       0.78      0.57      0.66        54
             Data Science       0.86      0.83      0.85        84
                   DevOps       0.93      0.90      0.92        94
Machine Learning Engineer       0.80      0.82      0.81        97
     Mobile App Developer       0.96      0.95      0.95        78
         Network Engineer       0.88      0.93      0.91        98
        Quality Assurance       0.98      0.95      0.96       101
        Software Engineer       0.82      0.93      0.87       134

                 accuracy                           0.89       916
                macro avg       0.89      0.88      0.88       916
             weighted avg       0.89  

In [2]:
import os; print(os.getcwd())

D:\RESUME-ANALYSER-PROJECT\notebooks
