In [2]:
import pandas as pd
import json
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# --- CRITICAL: Change working directory to the project root ---
# This ensures all subsequent relative paths are correct.
# Assumes the notebook is in 'notebooks/' and the project root is one level up.
try:
    current_dir = os.getcwd()
    if os.path.basename(current_dir) == 'notebooks':
        os.chdir(os.path.dirname(current_dir))
        print(f"Changed working directory to: {os.getcwd()}")
    else:
        print(f"Current working directory: {current_dir}")
        print("Assuming it's already the project root or a parent directory.")
except Exception as e:
    print(f"Error changing directory: {e}")
    exit() # Exit if we can't set the correct working directory


# --- 1. Load and Inspect master_resumes.jsonl ---
print("--- Loading master_resumes.jsonl ---")
resumes_data = []
try:
    # Path is relative to the project root
    with open('data/master_resumes.jsonl', 'r', encoding='utf-8') as f:
        for line in f:
            resumes_data.append(json.loads(line))
    master_resumes_df = pd.DataFrame(resumes_data)
    print("master_resumes.jsonl loaded successfully!")
    print("\nFirst 5 rows of master_resumes_df:")
    print(master_resumes_df.head())
    print("\nInfo for master_resumes_df:")
    master_resumes_df.info()
    print("\nSample of 'skills' column from master_resumes_df (first 3 entries):")
    for i, skills in enumerate(master_resumes_df['skills'].head(3)):
        print(f"Resume {i+1} Skills: {skills}")
    print("\n")

except FileNotFoundError:
    print("Error: 'data/master_resumes.jsonl' not found. Ensure it's in the 'data' folder at the project root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading master_resumes.jsonl: {e}")
    exit()


# --- 2. Load and Inspect training_data.csv ---
print("--- Loading training_data.csv ---")
try:
    # Path is relative to the project root
    training_data_df = pd.read_csv('data/training_data.csv')
    print("training_data.csv loaded successfully!")
    print("\nFirst 5 rows of training_data_df:")
    print(training_data_df.head())
    print("\nInfo for training_data_df:")
    training_data_df.info()
    
    print("\nSample of 'model_response' column from training_data_df (first 3 entries, parsed JSON):")
    for i, json_str in enumerate(training_data_df['model_response'].head(3)):
        try:
            parsed_json = json.loads(json_str)
            print(f"JD {i+1} Model Response: {json.dumps(parsed_json, indent=2)}")
        except json.JSONDecodeError:
            print(f"JD {i+1} Model Response (Error parsing JSON): {json_str}")
    print("\n")
    
except FileNotFoundError:
    print("Error: 'data/training_data.csv' not found. Ensure it's in the 'data' folder at the project root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading training_data.csv: {e}")
    exit()


# --- 3. Load and Inspect roles-based-on-skills.csv ---
print("--- Loading roles-based-on-skills.csv ---")
try:
    # Path is relative to the project root
    roles_skills_df = pd.read_csv('data/roles-based-on-skills.csv')
    print("roles-based-on-skills.csv loaded successfully!")
    
    # Correct column name for Job Role for consistency
    roles_skills_df.rename(columns={'Target': 'Job Role'}, inplace=True)

    print("\nFirst 5 rows of roles_skills_df after renaming:")
    print(roles_skills_df.head())
    print("\nInfo for roles_skills_df:")
    roles_skills_df.info()
    print("\nUnique Job Roles and their counts:")
    print(roles_skills_df['Job Role'].value_counts())
    print("-" * 50)

except FileNotFoundError:
    print("Error: 'data/roles-based-on-skills.csv' not found. Ensure it's in the 'data' folder at the project root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading roles-based-on-skills.csv: {e}")
    exit()

# --- Prepare Data for Job Role Classification ---
X = roles_skills_df['ALL']  # The column containing all skills as a string
y = roles_skills_df['Job Role'] # The target job role

# 1. Feature Engineering: Convert text skills to numerical features using TF-IDF
print("\n--- Performing TF-IDF Vectorization ---")
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Limit features to top 5000 for efficiency
X_tfidf = tfidf_vectorizer.fit_transform(X)
print(f"Shape of TF-IDF matrix: {X_tfidf.shape}")
print("-" * 50)

# 2. Split Data into Training and Testing Sets
print("\n--- Splitting data into training and testing sets ---")
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print("-" * 50)

# 3. Model Training: Logistic Regression Classifier
print("\n--- Training Logistic Regression Model ---")
model = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear') # Increased max_iter for convergence
model.fit(X_train, y_train)
print("Model training complete!")
print("-" * 50)

# 4. Model Evaluation
print("\n--- Evaluating Model Performance ---")
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("-" * 50)

print("\n--- Job Role Classification Pipeline Complete ---")
print("You now have a trained model that can predict job roles based on skills!")


# --- Save Trained Model and TF-IDF Vectorizer ---
print("\n--- Saving Trained Model and Vectorizer ---")
# Define paths to save the model and vectorizer within the backend/models folder
# These paths are relative to the project root, which is the current working directory
# if you launched Jupyter from there.
models_dir = 'backend/models'
os.makedirs(models_dir, exist_ok=True) # This creates the directory if it doesn't exist

vectorizer_path = os.path.join(models_dir, 'tfidf_vectorizer.joblib')
model_path = os.path.join(models_dir, 'job_role_classifier_model.joblib')

# Save the TF-IDF Vectorizer
joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"TF-IDF Vectorizer saved to: {vectorizer_path}")

# Save the trained Logistic Regression Model
joblib.dump(model, model_path)
print(f"Job Role Classifier Model saved to: {model_path}")
print("-" * 50)

Changed working directory to: D:\RESUME-ANALYSER-PROJECT
--- Loading master_resumes.jsonl ---
master_resumes.jsonl loaded successfully!

First 5 rows of master_resumes_df:
                                       personal_info  \
0  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
1  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
2  {'name': 'Not Provided', 'email': 'Not Provide...   
3  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
4  {'name': '', 'email': '', 'phone': '', 'locati...   

                                          experience  \
0  [{'company': 'Fresher', 'company_info': {'indu...   
1  [{'company': 'Delta Controls, Dubai FZCO', 'co...   
2  [{'company': 'Parkar Consulting and Labs', 'co...   
3  [{'company': 'Delta Controls, Dubai FZCO', 'co...   
4  [{'company': 'Atos Syntel', 'company_info': {'...   

                                           education  \
0  [{'degree': {'level': 'ME', 'field': 'Computer...   
1  [{'degree': {'level': 'B.E', 'field': '



Model training complete!
--------------------------------------------------

--- Evaluating Model Performance ---
Accuracy: 0.8908

Classification Report:
                           precision    recall  f1-score   support

         Business Analyst       0.93      0.96      0.94        98
           Cyber Security       0.97      0.91      0.94        78
            Data Engineer       0.78      0.57      0.66        54
             Data Science       0.86      0.83      0.85        84
                   DevOps       0.93      0.90      0.92        94
Machine Learning Engineer       0.80      0.82      0.81        97
     Mobile App Developer       0.96      0.95      0.95        78
         Network Engineer       0.88      0.93      0.91        98
        Quality Assurance       0.98      0.95      0.96       101
        Software Engineer       0.82      0.93      0.87       134

                 accuracy                           0.89       916
                macro avg       0.89   

In [1]:
import pandas as pd
import json
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# --- CRITICAL: Change working directory to the project root ---
# This approach is more robust for Jupyter Notebooks as it doesn't rely on __file__.
# It assumes the notebook is in a 'notebooks/' subfolder directly under the project root.
try:
    current_working_dir = os.getcwd()
    # Check if the current working directory ends with 'notebooks'
    if current_working_dir.endswith(os.path.join('notebooks', '')) or current_working_dir.endswith('notebooks'):
        # Go up one level to the project root
        project_root_dir = os.path.abspath(os.path.join(current_working_dir, os.pardir))
        os.chdir(project_root_dir)
        print(f"Changed working directory to: {os.getcwd()}")
    else:
        print(f"Current working directory: {current_working_dir}")
        print("Assuming it's already the project root or a parent directory.")
except Exception as e:
    print(f"Error changing directory. Please ensure your notebook is run from within the 'notebooks' folder, or that the 'notebooks' folder is directly under your project root: {e}")
    exit() # Exit if we can't set the correct working directory


# --- 1. Load and Inspect master_resumes.jsonl ---
print("--- Loading master_resumes.jsonl ---")
resumes_data = []
try:
    # Path is relative to the project root
    with open(os.path.join('data', 'master_resumes.jsonl'), 'r', encoding='utf-8') as f:
        for line in f:
            resumes_data.append(json.loads(line))
    master_resumes_df = pd.DataFrame(resumes_data)
    print("master_resumes.jsonl loaded successfully!")
    print("\nFirst 5 rows of master_resumes_df:")
    print(master_resumes_df.head())
    print("\nInfo for master_resumes_df:")
    master_resumes_df.info()
    print("\nSample of 'skills' column from master_resumes_df (first 3 entries):")
    for i, skills in enumerate(master_resumes_df['skills'].head(3)):
        print(f"Resume {i+1} Skills: {skills}")
    print("\n")

except FileNotFoundError:
    print("Error: 'data/master_resumes.jsonl' not found. Ensure it's in the 'data' folder at the project root and Jupyter is launched from the root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading master_resumes.jsonl: {e}")
    exit()


# --- 2. Load and Inspect training_data.csv ---
print("--- Loading training_data.csv ---")
try:
    # Path is relative to the project root
    training_data_df = pd.read_csv(os.path.join('data', 'training_data.csv'))
    print("training_data.csv loaded successfully!")
    print("\nFirst 5 rows of training_data_df:")
    print(training_data_df.head())
    print("\nInfo for training_data_df:")
    training_data_df.info()
    
    print("\nSample of 'model_response' column from training_data_df (first 3 entries, parsed JSON):")
    for i, json_str in enumerate(training_data_df['model_response'].head(3)):
        try:
            parsed_json = json.loads(json_str)
            print(f"JD {i+1} Model Response: {json.dumps(parsed_json, indent=2)}")
        except json.JSONDecodeError:
            print(f"JD {i+1} Model Response (Error parsing JSON): {json_str}")
    print("\n")
    
except FileNotFoundError:
    print("Error: 'data/training_data.csv' not found. Ensure it's in the 'data' folder at the project root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading training_data.csv: {e}")
    exit()


# --- 3. Load and Inspect roles-based-on-skills.csv ---
print("--- Loading roles-based-on-skills.csv ---")
try:
    # Path is relative to the project root
    roles_skills_df = pd.read_csv(os.path.join('data', 'roles-based-on-skills.csv'))
    print("roles-based-on-skills.csv loaded successfully!")
    
    # Correct column name for Job Role for consistency
    roles_skills_df.rename(columns={'Target': 'Job Role'}, inplace=True)

    print("\nFirst 5 rows of roles_skills_df after renaming:")
    print(roles_skills_df.head())
    print("\nInfo for roles_skills_df:")
    roles_skills_df.info()
    print("\nUnique Job Roles and their counts:")
    print(roles_skills_df['Job Role'].value_counts())
    print("-" * 50)

except FileNotFoundError:
    print("Error: 'data/roles-based-on-skills.csv' not found. Ensure it's in the 'data' folder at the project root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading roles-based-on-skills.csv: {e}")
    exit()

# --- Prepare Data for Job Role Classification ---
# This part will only run if the above data loading was successful
X = roles_skills_df['ALL']  # The column containing all skills as a string
y = roles_skills_df['Job Role'] # The target job role

# 1. Feature Engineering: Convert text skills to numerical features using TF-IDF
print("\n--- Performing TF-IDF Vectorization ---")
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Limit features to top 5000 for efficiency
X_tfidf = tfidf_vectorizer.fit_transform(X)
print(f"Shape of TF-IDF matrix: {X_tfidf.shape}")
print("-" * 50)

# 2. Split Data into Training and Testing Sets
print("\n--- Splitting data into training and testing sets ---")
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print("-" * 50)

# 3. Model Training: Logistic Regression Classifier
print("\n--- Training Logistic Regression Model ---")
model = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear') # Increased max_iter for convergence
model.fit(X_train, y_train)
print("Model training complete!")
print("-" * 50)

# 4. Model Evaluation
print("\n--- Evaluating Model Performance ---")
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("-" * 50)

print("\n--- Job Role Classification Pipeline Complete ---")
print("You now have a trained model that can predict job roles based on skills!")


# --- Save Trained Model and Vectorizer ---
print("\n--- Saving Trained Model and Vectorizer ---")
# Define paths to save the model and vectorizer within the backend/models folder
# These paths are relative to the project root, which is now the current working directory.
models_dir = os.path.join('backend', 'models')
os.makedirs(models_dir, exist_ok=True) # This creates the directory if it doesn't exist

vectorizer_path = os.path.join(models_dir, 'tfidf_vectorizer.joblib')
model_path = os.path.join(models_dir, 'job_role_classifier_model.joblib')

# Save the TF-IDF Vectorizer
joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"TF-IDF Vectorizer saved to: {vectorizer_path}")

# Save the trained Logistic Regression Model
joblib.dump(model, model_path)
print(f"Job Role Classifier Model saved to: {model_path}")
print("-" * 50)

Changed working directory to: D:\RESUME-ANALYSER-PROJECT
--- Loading master_resumes.jsonl ---
master_resumes.jsonl loaded successfully!

First 5 rows of master_resumes_df:
                                       personal_info  \
0  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
1  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
2  {'name': 'Not Provided', 'email': 'Not Provide...   
3  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
4  {'name': '', 'email': '', 'phone': '', 'locati...   

                                          experience  \
0  [{'company': 'Fresher', 'company_info': {'indu...   
1  [{'company': 'Delta Controls, Dubai FZCO', 'co...   
2  [{'company': 'Parkar Consulting and Labs', 'co...   
3  [{'company': 'Delta Controls, Dubai FZCO', 'co...   
4  [{'company': 'Atos Syntel', 'company_info': {'...   

                                           education  \
0  [{'degree': {'level': 'ME', 'field': 'Computer...   
1  [{'degree': {'level': 'B.E', 'field': '



Model training complete!
--------------------------------------------------

--- Evaluating Model Performance ---
Accuracy: 0.8908

Classification Report:
                           precision    recall  f1-score   support

         Business Analyst       0.93      0.96      0.94        98
           Cyber Security       0.97      0.91      0.94        78
            Data Engineer       0.78      0.57      0.66        54
             Data Science       0.86      0.83      0.85        84
                   DevOps       0.93      0.90      0.92        94
Machine Learning Engineer       0.80      0.82      0.81        97
     Mobile App Developer       0.96      0.95      0.95        78
         Network Engineer       0.88      0.93      0.91        98
        Quality Assurance       0.98      0.95      0.96       101
        Software Engineer       0.82      0.93      0.87       134

                 accuracy                           0.89       916
                macro avg       0.89   

In [2]:
import pandas as pd
import json
import ast # Used for literal evaluation of string representations of Python data structures
from tqdm import tqdm # For progress bars, install if not available: pip install tqdm

print("--- Preparing Labeled Data for Job Description Parsing ---")

# Load training_data.csv
try:
    # Path is relative to the project root
    training_data_df = pd.read_csv('data/training_data.csv')
    print("training_data.csv loaded successfully.")
except FileNotFoundError:
    print("Error: 'data/training_data.csv' not found. Please ensure it's in the 'data' folder at the project root and Jupyter is launched from the root.")
    exit()
except Exception as e:
    print(f"An error occurred while loading training_data.csv: {e}")
    exit()

# Initialize lists to store extracted data
parsed_jd_data = []

# Process each row in the DataFrame to parse 'model_response'
print("Parsing 'model_response' column and extracting entities...")
for index, row in tqdm(training_data_df.iterrows(), total=training_data_df.shape[0]):
    job_description_text = row['job_description']
    model_response_str = row['model_response']
    
    # Attempt to parse the JSON string
    try:
        parsed_response = json.loads(model_response_str)
    except json.JSONDecodeError:
        # Fallback for malformed JSON-like strings
        try:
            parsed_response = ast.literal_eval(model_response_str)
        except (ValueError, SyntaxError) as e:
            print(f"Skipping row {index} due to parsing error in 'model_response': {e}")
            continue

    # Extract relevant entities. Extend this to other fields as needed.
    required_skills_text = parsed_response.get('Required Skills', '')
    experience_level_text = parsed_response.get('Experience Level', '')
    educational_requirements_text = parsed_response.get('Educational Requirements', '')

    parsed_jd_data.append({
        'job_description': job_description_text,
        'required_skills': required_skills_text,
        'experience_level': experience_level_text,
        'educational_requirements': educational_requirements_text,
        # Add more fields here from 'model_response' as needed (e.g., "Core Responsibilities")
    })

# Convert the list of dictionaries to a new DataFrame
parsed_jd_df = pd.DataFrame(parsed_jd_data)

print("\nParsed Job Description DataFrame Head:")
print(parsed_jd_df.head())

print("\nInfo for Parsed Job Description DataFrame:")
parsed_jd_df.info()

print("\nSample of Job Description and its Extracted Entities (first 3 entries):")
for i in range(3):
    if i < len(parsed_jd_df):
        print(f"\n--- JD {i+1} ---")
        print(f"Job Description (Snippet):\n{parsed_jd_df.loc[i, 'job_description'][:500]}...") # Print first 500 chars
        print(f"Required Skills:\n{parsed_jd_df.loc[i, 'required_skills']}")
        print(f"Experience Level:\n{parsed_jd_df.loc[i, 'experience_level']}")
        print(f"Educational Requirements:\n{parsed_jd_df.loc[i, 'educational_requirements']}")

print("\n--- Labeled Data Preparation Complete ---")
print("This DataFrame (parsed_jd_df) is now suitable for training an NLP model (e.g., NER) to extract these entities from raw job descriptions.")

# Optional: Save this processed DataFrame for later use
# You can uncomment the line below if you want to save this intermediate dataframe.
# parsed_jd_df.to_csv('data/processed_jds_for_ner_training.csv', index=False)
# print("\nProcessed JD data saved to data/processed_jds_for_ner_training.csv")

--- Preparing Labeled Data for Job Description Parsing ---
training_data.csv loaded successfully.
Parsing 'model_response' column and extracting entities...


100%|█████████████████████████████████████████████████████████████████████████████| 853/853 [00:00<00:00, 13243.10it/s]


Parsed Job Description DataFrame Head:
                                     job_description  \
0  minimum qualifications\nbachelors degree or eq...   
1  description\nas an asc you will be highly infl...   
2  its an amazing time to be joining netflix as w...   
3  description\n\nweb designers looking to expand...   
4  at trackfive weve got big goals were on a miss...   

                                     required_skills  \
0  Bachelor's degree or equivalent experience. Ex...   
1  a passion to help people understand how apple ...   
2  2+ years experience in preferably outbound lic...   
3  2+ years experience in web design. Proficiency...   
4  2+ years of experience with HTML and CSS/SASS,...   

                                    experience_level  \
0  Experience managing enterprise SaaS accounts a...   
1                                    years preferred   
2  2+ years experience in preferably outbound lic...   
3                                           2+ years   
4     




In [6]:
import pandas as pd
import json
import joblib
import os
import re # <-- CRITICAL: Added this import
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# --- CRITICAL: Change working directory to the project root ---
# This approach is more robust for Jupyter Notebooks as it doesn't rely on __file__.
# It assumes the notebook is in a 'notebooks/' subfolder directly under the project root.
try:
    current_working_dir = os.getcwd()
    # Check if the current working directory ends with 'notebooks'
    if current_working_dir.endswith(os.path.join('notebooks', '')) or current_working_dir.endswith('notebooks'):
        # Go up one level to the project root
        project_root_dir = os.path.abspath(os.path.join(current_working_dir, os.pardir))
        os.chdir(project_root_dir)
        print(f"Changed working directory to: {os.getcwd()}")
    else:
        print(f"Current working directory: {current_working_dir}")
        print("Assuming it's already the project root or a parent directory.")
except Exception as e:
    print(f"Error changing directory. Please ensure your notebook is run from within the 'notebooks' folder, or that the 'notebooks' folder is directly under your project root: {e}")
    exit() # Exit if we can't set the correct working directory


# --- 1. Load and Inspect master_resumes.jsonl ---
print("--- Loading master_resumes.jsonl ---")
resumes_data = []
try:
    # Path is relative to the project root
    with open(os.path.join('data', 'master_resumes.jsonl'), 'r', encoding='utf-8') as f:
        for line in f:
            resumes_data.append(json.loads(line))
    master_resumes_df = pd.DataFrame(resumes_data)
    print("master_resumes.jsonl loaded successfully!")
    print("\nFirst 5 rows of master_resumes_df:")
    print(master_resumes_df.head())
    print("\nInfo for master_resumes_df:")
    master_resumes_df.info()
    print("\nSample of 'skills' column from master_resumes_df (first 3 entries):")
    for i, skills in enumerate(master_resumes_df['skills'].head(3)):
        print(f"Resume {i+1} Skills: {skills}")
    print("\n")

except FileNotFoundError:
    print("Error: 'data/master_resumes.jsonl' not found. Ensure it's in the 'data' folder at the project root and Jupyter is launched from the root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading master_resumes.jsonl: {e}")
    exit()


# --- 2. Load and Inspect training_data.csv ---
print("--- Loading training_data.csv ---")
try:
    # Path is relative to the project root
    training_data_df = pd.read_csv(os.path.join('data', 'training_data.csv'))
    print("training_data.csv loaded successfully!")
    print("\nFirst 5 rows of training_data_df:")
    print(training_data_df.head())
    print("\nInfo for training_data_df:")
    training_data_df.info()
    
    print("\nSample of 'model_response' column from training_data_df (first 3 entries, parsed JSON):")
    for i, json_str in enumerate(training_data_df['model_response'].head(3)):
        try:
            parsed_json = json.loads(json_str)
            print(f"JD {i+1} Model Response: {json.dumps(parsed_json, indent=2)}")
        except json.JSONDecodeError:
            print(f"JD {i+1} Model Response (Error parsing JSON): {json_str}")
    print("\n")
    
except FileNotFoundError:
    print("Error: 'data/training_data.csv' not found. Ensure it's in the 'data' folder at the project root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading training_data.csv: {e}")
    exit()


# --- 3. Load and Inspect roles-based-on-skills.csv ---
print("--- Loading roles-based-on-skills.csv ---")
try:
    # Path is relative to the project root
    roles_skills_df = pd.read_csv(os.path.join('data', 'roles-based-on-skills.csv'))
    print("roles-based-on-skills.csv loaded successfully!")
    
    # Correct column name for Job Role for consistency
    roles_skills_df.rename(columns={'Target': 'Job Role'}, inplace=True)

    print("\nFirst 5 rows of roles_skills_df after renaming:")
    print(roles_skills_df.head())
    print("\nInfo for roles_skills_df:")
    roles_skills_df.info()
    print("\nUnique Job Roles and their counts:")
    print(roles_skills_df['Job Role'].value_counts())
    print("-" * 50)

except FileNotFoundError:
    print("Error: 'data/roles-based-on-skills.csv' not found. Ensure it's in the 'data' folder at the project root.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading roles-based-on-skills.csv: {e}")
    exit()

# --- Prepare Data for Job Role Classification ---
X = roles_skills_df['ALL']  # The column containing all skills as a string
y = roles_skills_df['Job Role'] # The target job role

# 1. Feature Engineering: Convert text skills to numerical features using TF-IDF
print("\n--- Performing TF-IDF Vectorization ---")
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Limit features to top 5000 for efficiency
X_tfidf = tfidf_vectorizer.fit_transform(X)
print(f"Shape of TF-IDF matrix: {X_tfidf.shape}")
print("-" * 50)

# 2. Split Data into Training and Testing Sets
print("\n--- Splitting data into training and testing sets ---")
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print("-" * 50)

# 3. Model Training: Logistic Regression Classifier
print("\n--- Training Logistic Regression Model ---")
model = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear') # Increased max_iter for convergence
model.fit(X_train, y_train)
print("Model training complete!")
print("-" * 50)

# 4. Model Evaluation
print("\n--- Evaluating Model Performance ---")
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("-" * 50)

print("\n--- Job Role Classification Pipeline Complete ---")
print("You now have a trained model that can predict job roles based on skills!")


# --- Save Trained Model and Vectorizer ---
print("\n--- Saving Trained Model and Vectorizer ---")
# Define paths to save the model and vectorizer within the backend/models folder
# These paths are relative to the project root, which is now the current working directory.
models_dir = os.path.join('backend', 'models')
os.makedirs(models_dir, exist_ok=True) # This creates the directory if it doesn't exist

vectorizer_path = os.path.join(models_dir, 'tfidf_vectorizer.joblib')
model_path = os.path.join(models_dir, 'job_role_classifier_model.joblib')

# Save the TF-IDF Vectorizer
joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"TF-IDF Vectorizer saved to: {vectorizer_path}")

# Save the trained Logistic Regression Model
joblib.dump(model, model_path)
print(f"Job Role Classifier Model saved to: {model_path}")
print("-" * 50)

Current working directory: D:\RESUME-ANALYSER-PROJECT
Assuming it's already the project root or a parent directory.
--- Loading master_resumes.jsonl ---
master_resumes.jsonl loaded successfully!

First 5 rows of master_resumes_df:
                                       personal_info  \
0  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
1  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
2  {'name': 'Not Provided', 'email': 'Not Provide...   
3  {'name': 'Unknown', 'email': 'Unknown', 'phone...   
4  {'name': '', 'email': '', 'phone': '', 'locati...   

                                          experience  \
0  [{'company': 'Fresher', 'company_info': {'indu...   
1  [{'company': 'Delta Controls, Dubai FZCO', 'co...   
2  [{'company': 'Parkar Consulting and Labs', 'co...   
3  [{'company': 'Delta Controls, Dubai FZCO', 'co...   
4  [{'company': 'Atos Syntel', 'company_info': {'...   

                                           education  \
0  [{'degree': {'level': 'ME', 'field':



Model training complete!
--------------------------------------------------

--- Evaluating Model Performance ---
Accuracy: 0.8908

Classification Report:
                           precision    recall  f1-score   support

         Business Analyst       0.93      0.96      0.94        98
           Cyber Security       0.97      0.91      0.94        78
            Data Engineer       0.78      0.57      0.66        54
             Data Science       0.86      0.83      0.85        84
                   DevOps       0.93      0.90      0.92        94
Machine Learning Engineer       0.80      0.82      0.81        97
     Mobile App Developer       0.96      0.95      0.95        78
         Network Engineer       0.88      0.93      0.91        98
        Quality Assurance       0.98      0.95      0.96       101
        Software Engineer       0.82      0.93      0.87       134

                 accuracy                           0.89       916
                macro avg       0.89   

In [11]:
import pandas as pd
import json
import ast
import spacy
from tqdm import tqdm # For progress bars, install if not available: pip install tqdm
import random # For splitting data
from spacy.tokens import DocBin # For saving data in spaCy's format
import re # IMPORTANT: Ensure this is imported for regex operations

print("\n--- Preparing Data for spaCy Custom NER Training (with Overlap Resolution) ---")

# Ensure spaCy model is loaded for tokenization (even if not using its NER initially)
try:
    nlp_sm = spacy.load("en_core_web_sm")
except OSError:
    print("SpaCy model 'en_core_web_sm' not found. Running: python -m spacy download en_core_web_sm")
    spacy.cli.download("en_core_web_sm")
    # After download, it's CRITICAL to restart the kernel for spaCy to load dependencies correctly.
    print("Downloaded 'en_core_web_sm'. Please RESTART YOUR JUPYTER KERNEL (Kernel -> Restart) and run all cells again.")
    exit() # Exit to force kernel restart

# Assuming parsed_jd_df from the previous cell is available in your notebook
if 'parsed_jd_df' not in locals():
    print("Error: 'parsed_jd_df' not found. Please run the previous cell to load and parse training_data.csv.")
    exit()

# List to store spaCy-compatible training examples
training_examples = []

# Define the entities we want to extract and map them to their corresponding columns
ENTITY_COLUMNS = { # Corrected variable name here
    'REQUIRED_SKILLS': 'required_skills',
    'EXPERIENCE_LEVEL': 'experience_level',
    'EDUCATIONAL_REQUIREMENTS': 'educational_requirements'
}

# Iterate through the parsed job description data
print("Creating spaCy training examples (text and entity spans) with overlap resolution...")
for index, row in tqdm(parsed_jd_df.iterrows(), total=parsed_jd_df.shape[0]):
    text = str(row['job_description']) # Ensure job_description is treated as string
    candidate_entities = [] # Collect all potential entities first

    # Process each entity type
    for entity_type, col_name in ENTITY_COLUMNS.items(): # Corrected variable name here
        entity_text = str(row[col_name]).strip() # Ensure it's a string and strip whitespace
        
        # Skip if entity text is empty or common placeholders
        if not entity_text or entity_text.lower() in ["n/a", "not provided", "unknown", "none"]:
            continue

        # Find all occurrences of the entity text in the job description
        # Using re.finditer to get start and end indices for all matches
        for match in re.finditer(re.escape(entity_text), text, re.IGNORECASE | re.DOTALL):
            start, end = match.span()
            # Basic validation: ensure the span makes sense
            if start < end and end <= len(text):
                matched_snippet = text[start:end]
                # Ensure the case-insensitive match is a substantial match to the entity text
                if matched_snippet.lower() == entity_text.lower():
                    candidate_entities.append({'start': start, 'end': end, 'label': entity_type, 'text': matched_snippet})

    # --- Overlap Resolution Logic ---
    # Sort candidates: prioritize longer spans, then by start position
    candidate_entities.sort(key=lambda x: (x['end'] - x['start'], -x['start']), reverse=True) # Longest first, then earliest
    
    final_entities = []
    # Keep track of covered token indices (approximate) or character ranges
    covered_ranges = []

    for cand_ent in candidate_entities:
        is_overlapping = False
        # Check if the candidate entity overlaps with any already accepted entity
        for final_ent in final_entities:
            # Check for any overlap: [start1, end1) and [start2, end2) overlap if max(start1, start2) < min(end1, end2)
            if max(cand_ent['start'], final_ent['start']) < min(cand_ent['end'], final_ent['end']):
                is_overlapping = True
                break
        
        if not is_overlapping:
            final_entities.append(cand_ent)

    # Convert final_entities to the (start, end, label) tuple format required by spaCy
    spacy_ents = [(ent['start'], ent['end'], ent['label']) for ent in final_entities]
    
    if spacy_ents: # Only add if there are valid, non-overlapping entities
        training_examples.append((text, {"entities": spacy_ents}))
    else:
        # It's okay to have documents without entities (negative examples),
        # but for this specific dataset and simplified pipeline,
        # we focus on documents with at least one found entity.
        pass


print(f"\nGenerated {len(training_examples)} spaCy training examples after overlap resolution.")

# --- Split Data into Training and Validation Sets ---
random.seed(42)
random.shuffle(training_examples)

split_point = int(len(training_examples) * 0.8)
train_data = training_examples[:split_point]
dev_data = training_examples[split_point:]

print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(dev_data)}")

# --- Save Data in spaCy's Binary Format (.spacy) ---
output_dir = 'data/spacy_training_data'
os.makedirs(output_dir, exist_ok=True)

train_docbin = DocBin()
for text, annot in tqdm(train_data, desc="Processing train data"):
    doc = nlp_sm.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is not None: # char_span returns None if character indices don't align perfectly with tokens
            ents.append(span)
    try:
        doc.ents = ents # Assign entities
        train_docbin.add(doc)
    except ValueError as e:
        print(f"Skipping document due to ValueError after span creation: {e} in text: '{text[:200]}...'")


dev_docbin = DocBin()
for text, annot in tqdm(dev_data, desc="Processing dev data"):
    doc = nlp_sm.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    try:
        doc.ents = ents
        dev_docbin.add(doc)
    except ValueError as e:
        print(f"Skipping dev document due to ValueError after span creation: {e} in text: '{text[:200]}...'")


train_docbin.to_disk(os.path.join(output_dir, "train.spacy"))
print(f"Train data saved to: {os.path.join(output_dir, 'train.spacy')}")


dev_docbin.to_disk(os.path.join(output_dir, "dev.spacy"))
print(f"Validation data saved to: {os.path.join(output_dir, 'dev.spacy')}")

print("\n--- spaCy Data Preparation Complete. Ready for Training! ---")


--- Preparing Data for spaCy Custom NER Training (with Overlap Resolution) ---
Creating spaCy training examples (text and entity spans) with overlap resolution...


100%|██████████████████████████████████████████████████████████████████████████████| 853/853 [00:00<00:00, 1217.37it/s]



Generated 89 spaCy training examples after overlap resolution.
Training examples: 71
Validation examples: 18


Processing train data: 100%|██████████████████████████████████████████████████████████| 71/71 [00:00<00:00, 161.78it/s]
Processing dev data: 100%|████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 264.72it/s]

Train data saved to: data/spacy_training_data\train.spacy
Validation data saved to: data/spacy_training_data\dev.spacy

--- spaCy Data Preparation Complete. Ready for Training! ---





In [6]:
import os
import spacy
# from spacy.util import to_disk # No longer explicitly importing, will use method directly

# Define the configuration as a Python dictionary
config_dict = {
    "paths": {
        "train": "data/spacy_training_data/train.spacy",
        "dev": "data/spacy_training_data/dev.spacy",
        "vectors": None,
        "init_tok2vec": None
    },
    "system": {
        "gpu_allocator": None,
        "seed": 0
    },
    "nlp": {
        "lang": "en",
        "pipeline": ["transformer", "ner"],
        "batch_size": 128,
        "disabled": [],
        "before_creation": None,
        "after_creation": None,
        "after_pipeline_creation": None,
        "tokenizer": {"@tokenizers":"spacy.Tokenizer.v1"},
        "vectors": {"@vectors":"spacy.Vectors.v1"}
    },
    "components": {
        "transformer": {
            "factory": "transformer",
            "max_batch_items": 4096,
            "set_extra_annotations": {"from_tokenizer": True},
            "model": {
                "@architectures": "spacy-transformers.TransformerModel.v3",
                "name": "bert-base-uncased", # Set your desired transformer model here
                "tokenizer_config": {"use_fast": True},
                "grad_factor": 1.0
            }
        },
        "ner": {
            "factory": "ner",
            "incorrect_spans_key": None,
            "moves": None,
            "scorer": {"@scorers":"spacy.ner_scorer.v1"},
            "update_with_oracle_cut_size": 100,
            "model": {
                "@architectures": "spacy.TransitionBasedParser.v2",
                "state_type": "ner",
                "extra_state_tokens": False,
                "hidden_width": 64,
                "maxout_pieces": 2,
                "use_upper": True,
                "nO": None,
                "tok2vec": {
                    "@architectures": "spacy-transformers.TransformerListener.v1",
                    "upstream": "*"
                }
            }
        }
    },
    "corpora": {
        "dev": {
            "@readers": "spacy.Corpus.v1",
            "path": {"@variables":"${paths.dev}"},
            "max_length": 0,
            "gold_preproc": False,
            "limit": 0,
            "augmenter": None
        },
        "train": {
            "@readers": "spacy.Corpus.v1",
            "path": {"@variables":"${paths.train}"},
            "max_length": 0,
            "gold_preproc": False,
            "limit": 0,
            "augmenter": None
        }
    },
    "training": {
        "dev_corpus": "corpora.dev",
        "train_corpus": "corpora.train",
        "seed": {"@variables":"${system.seed}"},
        "gpu_allocator": {"@variables":"${system.gpu_allocator}"},
        "dropout": 0.1,
        "accumulate_gradient": 1,
        "patience": 1600,
        "max_epochs": 0,
        "max_steps": 20000,
        "eval_frequency": 200,
        "frozen_components": [],
        "annotating_components": [],
        "before_to_disk": None,
        "before_update": None,
        "batcher": {
            "@batchers": "spacy.batch_by_words.v1",
            "discard_oversize": False,
            "tolerance": 0.2,
            "get_length": None,
            "size": {
                "@schedules": "compounding.v1",
                "start": 100,
                "stop": 1000,
                "compound": 1.001,
                "t": 0.0
            }
        },
        "logger": {
            "@loggers": "spacy.ConsoleLogger.v1",
            "progress_bar": False
        },
        "optimizer": {
            "@optimizers": "Adam.v1",
            "beta1": 0.9,
            "beta2": 0.999,
            "L2_is_weight_decay": True,
            "L2": 0.01,
            "grad_clip": 1.0,
            "use_averages": False,
            "eps": 0.00000001,
            "learn_rate": 0.001
        },
        "score_weights": {
            "ents_f": 1.0,
            "ents_p": 0.0,
            "ents_r": 0.0,
            "ents_per_type": None
        }
    },
    "pretraining": {},
    "initialize": {
        "vectors": {"@variables":"${paths.vectors}"},
        "init_tok2vec": {"@variables":"${paths.init_tok2vec}"},
        "vocab_data": None,
        "lookups": None,
        "before_init": None,
        "after_init": None,
        "components": {},
        "tokenizer": {}
    }
}

# Ensure the 'configs' directory exists
configs_dir = 'configs'
os.makedirs(configs_dir, exist_ok=True)

# Define the path for the config file
config_file_path = os.path.join(configs_dir, 'ner_transformer_config.cfg')

# Convert the dictionary to a spaCy Config object and save it
# We are now using the .to_disk() method directly on the Config object,
# which does not take an 'exclude' argument for a Config object.
spacy_config = spacy.util.Config(config_dict)
spacy_config.to_disk(config_file_path) # Removed exclude=None

print(f"Generated and saved clean config to: {config_file_path}")
print("Now proceed to train the model using the command in your terminal.")

Generated and saved clean config to: configs\ner_transformer_config.cfg
Now proceed to train the model using the command in your terminal.
