In [None]:
import os
import re
import pandas as pd
import numpy as np
import kagglehub
from datasets import load_dataset

# --- CONFIGURATION ---ssdsa
EXPERIENCE_HEADERS = [
    r'professional experience', r'work experience', r'employment history',
    r'work history', r'experience', r'career history', r'professional background'
]

NEXT_SECTION_HEADERS = [
    r'education', r'academic background', r'skills', r'technical skills',
    r'projects', r'personal projects', r'certifications', r'achievements',
    r'references', r'languages', r'volunteer', r'interests'
]

# --- UPDATED CLASSIFICATION REGEX ---
# 1. Freelance
FREELANCE_REGEX = r'\b(freelance|freelancer|self-employed|upwork|fiverr|contractor|consultant)\b'

# 2. Intern (Added co-op)
INTERN_REGEX    = r'\b(intern|internship|trainee|summer analyst|student|apprentice|co-op)\b'

# 3. Professional / Senior (UPDATED TO FIX "DATA SCIENTIST" ISSUE)
# Added: Scientist, Engineer, Developer, Analyst, Associate, etc.
PROFESSIONAL_REGEX = r'\b(senior|manager|lead|principal|head of|chief|executive|years experience|scientist|engineer|developer|analyst|associate|specialist|administrator|officer|architect)\b'

# Contexts where "Intern" should be IGNORED
IGNORE_INTERN_CONTEXT = r'\b(managed|mentored|supervised|trained|hired|led|oversaw)\s+(an\s+|the\s+)?'

# ==========================================
# 1. LOAD DATA
# ==========================================
def load_data():
    print(">>> Step 1: Loading Raw Data...")
    dfs = []
    
    # Kaggle
    try:
        path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith(".csv"):
                    df = pd.read_csv(os.path.join(root, file))
                    if 'Resume_str' in df.columns:
                        dfs.append(df[['Resume_str']].rename(columns={'Resume_str': 'text'}))
                    break
    except: pass

    # Hugging Face
    try:
        dataset = load_dataset("InferencePrince555/Resume-Dataset")
        df = dataset['train'].to_pandas()
        col = 'Resume_test' if 'Resume_test' in df.columns else df.columns[0]
        dfs.append(df[[col]].rename(columns={col: 'text'}))
    except: pass

    if not dfs: return pd.DataFrame(columns=['text'])
    
    df_combined = pd.concat(dfs, ignore_index=True).dropna().drop_duplicates(subset=['text'])
    print(f"   - Total Raw Resumes: {len(df_combined)}")
    return df_combined

# ==========================================
# 2. EXTRACT EXPERIENCE SECTION
# ==========================================
def extract_experience(text):
    text_lower = str(text).lower()
    start_idx = -1
    for header in EXPERIENCE_HEADERS:
        match = re.search(rf'\b{header}\b', text_lower)
        if match:
            if start_idx == -1 or match.start() < start_idx:
                start_idx = match.start()
    
    if start_idx == -1: return None 

    search_text = text_lower[start_idx:]
    end_idx = len(text_lower)
    for header in NEXT_SECTION_HEADERS:
        match = re.search(rf'\b{header}\b', search_text)
        if match:
            real_match_idx = start_idx + match.start()
            if real_match_idx < end_idx:
                end_idx = real_match_idx

    return text[start_idx:end_idx].strip()

# ==========================================
# 3. ROBUST LABELING LOGIC (UPDATED)
# ==========================================
def get_label(text):
    if not text: return None
    t = text.lower()
    
    # 1. Count Keywords
    n_freelance = len(re.findall(FREELANCE_REGEX, t))
    n_professional = len(re.findall(PROFESSIONAL_REGEX, t)) 
    
    # 2. Count Intern keywords INTELLIGENTLY
    intern_matches = list(re.finditer(INTERN_REGEX, t))
    
    valid_intern_count = 0
    for match in intern_matches:
        start = match.start()
        # Look at the 50 characters BEFORE the word "intern"
        preceding_text = t[max(0, start-50):start]
        
        # Check if it was preceded by "managed", "mentored", etc.
        if re.search(IGNORE_INTERN_CONTEXT, preceding_text):
            continue 
        valid_intern_count += 1
    
    # --- CLASSIFICATION RULES ---
    
    # Rule A: Professional Override (Fixes Data Scientist / Engineer misclassification)
    if n_professional > 0:
        if n_freelance > n_professional: return 'Freelance'
        return 'Full-time'
        
    # Rule B: Freelance
    if n_freelance > 0 and n_freelance >= valid_intern_count: 
        return 'Freelance'
    
    # Rule C: Internship
    if valid_intern_count > 0: 
        return 'Internship'
    
    # Rule D: Default
    return 'Full-time'

# ==========================================
# 4. PROCESSING (NO BALANCING)
# ==========================================
if __name__ == "__main__":
    df = load_data()
    
    print("\n>>> Step 2: Extracting Experience Sections...")
    df['extracted_text'] = df['text'].apply(extract_experience)
    df_clean = df.dropna(subset=['extracted_text'])
    df_clean = df_clean[df_clean['extracted_text'].str.len() > 50]
    
    print("\n>>> Step 3: Applying ROBUST Labels...")
    df_clean['label'] = df_clean['extracted_text'].apply(get_label)
    
    print("   - Final Natural Distribution:")
    print(df_clean['label'].value_counts())
    
    # Preparing final dataframe (Renaming extracted_text to text)
    df_training = df_clean[['extracted_text', 'label']].rename(columns={'extracted_text': 'text'})
    
    # Save
    filename = "robust_experience_training_data_unbalanced.csv"
    df_training.to_csv(filename, index=False)
    print(f"\n>>> DONE: Saved {len(df_training)} samples to '{filename}'")

>>> Step 1: Loading Raw Data...
   - Total Raw Resumes: 34144

>>> Step 2: Extracting Experience Sections...

>>> Step 3: Applying ROBUST Labels...
   - Final Natural Distribution:
label
Full-time     32615
Freelance       415
Internship      258
Name: count, dtype: int64

>>> DONE: Saved 33288 samples to 'robust_experience_training_data_unbalanced.csv'


In [2]:
import pandas as pd
import os

# Check if the file from the previous step exists
filename = "robust_experience_training_data.csv"

if os.path.exists(filename):
    print(f"Loading '{filename}'...")
    df = pd.read_csv(filename)
    
    # --- VISUALIZATION FUNCTION ---
    def view_samples(category):
        print("\n" + "="*80)
        print(f"  CATEGORY: {category.upper()}")
        print("="*80)
        
        # Get random samples for this category
        subset = df[df['label'] == category]
        
        if subset.empty:
            print("  (No samples found)")
            return

        samples = subset.sample(n=min(3, len(subset)), random_state=42)
        
        for i, row in enumerate(samples.itertuples()):
            print(f"\nSample #{i+1}:")
            print("-" * 20)
            
            # Show the first 400 characters of the extracted text
            text_preview = str(row.text)[:400].replace('\n', ' ')
            print(f"\"{text_preview}...\"")
            print("-" * 20)

    # --- RUN FOR EACH CLASS ---
    view_samples('Freelance')
    view_samples('Internship')
    view_samples('Full-time')

else:
    print(f"Error: '{filename}' not found.")
    print("Please run the 'Final Data Preparation' script (previous step) first to generate the file.")

Loading 'robust_experience_training_data.csv'...

  CATEGORY: FREELANCE

Sample #1:
--------------------
"experience in IT industry and 3 years of experience in ServiceNow Platform Over 6 years of experience as a QA consultant and was responsible for testing efforts for implementation of all RMS change requests and supported multiple..."
--------------------

Sample #2:
--------------------
"Experience 01 1996 to Current Consultant Company Name City State Expanded new business opportunities in Texas for Program Management firm Developed contacts with Owners and Architects to develop relationships and solicit project possibilities Provided Project Management and Cost Consulting Services to Owners and Architects on major Higher..."
--------------------

Sample #3:
--------------------
"experience in IT industry and 3 years of experience in ServiceNow Platform Over 6 years of experience as a QA consultant and was responsible for testing efforts for implementation of all RMS change request

suppose for data balancing

In [42]:
import pandas as pd
from sklearn.utils import resample
import nlpaug.augmenter.word as naw
import os
from tqdm import tqdm  # For a progress bar

# --- CONFIGURATION ---
INPUT_FILE = "robust_experience_training_data_unbalanced.csv"
OUTPUT_FILE = "robust_experience_training_data_bert_augmented.csv"
TARGET_PER_CLASS = 3334

# --- LOAD BERT AUGMENTER ---
# This downloads a small BERT model specifically for inserting contextual words
print("Loading BERT model for augmentation (this happens once)...")
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', 
    action="substitute",
    device='cpu' # Change to 'cuda' if you have a GPU
)

# --- MAIN PROCESS ---
if not os.path.exists(INPUT_FILE):
    print(f"Error: {INPUT_FILE} not found.")
else:
    print("Loading data...")
    df = pd.read_csv(INPUT_FILE)
    
    # 1. Downsample Majority (Full-time) - No augmentation needed here
    print("Downsampling Full-time data...")
    df_fulltime = df[df['label'] == 'Full-time']
    df_fulltime_bal = resample(df_fulltime, replace=False, n_samples=TARGET_PER_CLASS, random_state=42)

    # 2. Augment Minorities (Freelance & Internship)
    augmented_dfs = [df_fulltime_bal]
    
    for category in ['Freelance', 'Internship']:
        subset = df[df['label'] == category]
        current_count = len(subset)
        needed = TARGET_PER_CLASS - current_count
        
        print(f"\n--- Augmenting {category} ---")
        print(f"Original: {current_count} | Needed: {needed}")
        
        new_samples = []
        
        # We use a progress bar because BERT is slower than NLTK
        with tqdm(total=needed) as pbar:
            while len(new_samples) < needed:
                # Iterate through original samples
                for text in subset['text']:
                    if len(new_samples) >= needed: break
                    
                    # Generate Augmented Text
                    # aug.augment returns a list, we take the first item
                    augmented_text = aug.augment(str(text))
                    
                    # Sometimes it returns a list, handle that:
                    if isinstance(augmented_text, list):
                        augmented_text = augmented_text[0]
                    
                    # Add to list if it's not identical (BERT rarely produces identicals)
                    if augmented_text != text:
                        new_samples.append({'label': category, 'text': augmented_text})
                        pbar.update(1)

        df_synthetic = pd.DataFrame(new_samples)
        df_combined = pd.concat([subset, df_synthetic])
        augmented_dfs.append(df_combined)

    # 3. Final Combine & Save
    df_final = pd.concat(augmented_dfs)
    df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\n" + "="*30)
    print("FINAL DATASET STATS")
    print("="*30)
    print(df_final['label'].value_counts())
    
    df_final.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSaved HIGH-QUALITY dataset to '{OUTPUT_FILE}'")

Loading BERT model for augmentation (this happens once)...
Loading data...
Downsampling Full-time data...

--- Augmenting Freelance ---
Original: 415 | Needed: 2919


 10%|▉         | 283/2919 [09:10<1:25:31,  1.95s/it]


KeyboardInterrupt: 

full training mode

In [41]:
import pandas as pd
import numpy as np
import torch
import nlpaug.augmenter.word as naw
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import os
from tqdm import tqdm

# --- CONFIGURATION ---
INPUT_FILE = "robust_experience_training_data_unbalanced.csv" # USE THE ORIGINAL UNBALANCED FILE
SAVE_PATH = "./saved_bert_model_v2"
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 4
TARGET_PER_CLASS = 3000 # Augment up to this number
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 1. LOAD & SPLIT FIRST (CRITICAL STEP) ---
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError("Please provide the original 'unbalanced' csv file.")

print("1. Loading Data & Splitting...")
df = pd.read_csv(INPUT_FILE)

# Encode Labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
classes = le.classes_
print(f"Classes: {classes}")

# SPLIT FIRST! Keep 20% pure for testing.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    df['text'].values, 
    df['label_encoded'].values, 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label_encoded']
)

# Reassemble Training Set for Augmentation
train_df = pd.DataFrame({'text': X_train_raw, 'label_encoded': y_train_raw})
train_df['label'] = le.inverse_transform(train_df['label_encoded'])

print(f"Training Samples (Raw): {len(train_df)}")
print(f"Test Samples (Pure): {len(X_test)}")

# --- 2. AUGMENT ONLY TRAINING DATA ---
print("\n2. Augmenting Training Data (BERT)...")
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", device='cuda' if torch.cuda.is_available() else 'cpu')

augmented_dfs = []

for class_name in classes:
    # Get all samples for this class
    subset = train_df[train_df['label'] == class_name]
    
    # If Majority Class (Full-time), Downsample
    if len(subset) > TARGET_PER_CLASS:
        subset = resample(subset, replace=False, n_samples=TARGET_PER_CLASS, random_state=42)
        augmented_dfs.append(subset)
    
    # If Minority Class, Augment
    else:
        # Add original samples first
        augmented_dfs.append(subset)
        
        needed = TARGET_PER_CLASS - len(subset)
        print(f"   Augmenting {class_name}: Creating {needed} new samples...")
        
        new_texts = []
        original_texts = subset['text'].tolist()
        
        # Cycle through originals to create new ones
        while len(new_texts) < needed:
            for text in original_texts:
                if len(new_texts) >= needed: break
                try:
                    # Augment
                    aug_text = aug.augment(str(text))
                    if isinstance(aug_text, list): aug_text = aug_text[0]
                    if aug_text != text:
                        new_texts.append(aug_text)
                except:
                    continue
        
        # Add new synthetic samples
        temp_df = pd.DataFrame({'text': new_texts})
        temp_df['label'] = class_name
        temp_df['label_encoded'] = le.transform([class_name])[0]
        augmented_dfs.append(temp_df)

# Combine
df_train_final = pd.concat(augmented_dfs).sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Final Training Set Size: {len(df_train_final)}")
print(df_train_final['label'].value_counts())

# --- 3. PREPARE DATASETS ---
class ResumeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_len,
            return_token_type_ids=False, padding='max_length',
            truncation=True, return_attention_mask=True, return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = ResumeDataset(df_train_final['text'].values, df_train_final['label_encoded'].values, tokenizer, MAX_LEN)
test_dataset = ResumeDataset(X_test, y_test, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# --- 4. TRAIN ---
print("\n3. Training BERT...")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(classes))
model = model.to(DEVICE)
optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    print(f"Epoch {epoch+1}/{EPOCHS}")
    
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    print(f"   Avg Loss: {total_loss / len(train_loader)}")

# --- 5. EVALUATE ---
print("\n4. Final Evaluation (On Pure Unseen Data)...")
model.eval()
preds, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        outputs = model(input_ids, attention_mask=mask)
        _, prediction = torch.max(outputs.logits, dim=1)
        preds.extend(prediction.cpu().numpy())
        true_labels.extend(batch['labels'].numpy())

print(classification_report(true_labels, preds, target_names=classes))

# Save
if not os.path.exists(SAVE_PATH): os.makedirs(SAVE_PATH)
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
np.save(os.path.join(SAVE_PATH, 'classes.npy'), classes)
print("Model saved.")

1. Loading Data & Splitting...
Classes: ['Freelance' 'Full-time' 'Internship']
Training Samples (Raw): 26630
Test Samples (Pure): 6658

2. Augmenting Training Data (BERT)...
   Augmenting Freelance: Creating 2668 new samples...
   Augmenting Internship: Creating 2794 new samples...
Final Training Set Size: 9000
label
Internship    3000
Freelance     3000
Full-time     3000
Name: count, dtype: int64

3. Training BERT...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4


100%|██████████| 563/563 [04:06<00:00,  2.28it/s]


   Avg Loss: 0.35194173762591335
Epoch 2/4


100%|██████████| 563/563 [03:58<00:00,  2.36it/s]


   Avg Loss: 0.10765275600172751
Epoch 3/4


100%|██████████| 563/563 [04:01<00:00,  2.33it/s]


   Avg Loss: 0.0458391964432269
Epoch 4/4


100%|██████████| 563/563 [04:00<00:00,  2.34it/s]


   Avg Loss: 0.025387587201315082

4. Final Evaluation (On Pure Unseen Data)...
              precision    recall  f1-score   support

   Freelance       0.30      0.78      0.43        83
   Full-time       1.00      0.96      0.98      6523
  Internship       0.29      0.88      0.44        52

    accuracy                           0.96      6658
   macro avg       0.53      0.88      0.62      6658
weighted avg       0.98      0.96      0.97      6658

Model saved.


this part it will prepare for testing

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
import os

# --- CONFIGURATION ---
MODEL_PATH = "./saved_bert_model_v2"
MAX_LEN = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- LOAD MODEL & TOKENIZER ---
if not os.path.exists(MODEL_PATH):
    print(f"Error: Model not found at {MODEL_PATH}. Did you run the training cell?")
else:
    print(f"Loading BERT model from {MODEL_PATH}...")
    
    # Load architecture and weights
    model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
    model = model.to(device)
    model.eval() # Freeze for inference

    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

    # Load class names
    classes = np.load(os.path.join(MODEL_PATH, 'classes.npy'), allow_pickle=True)
    
    print(f"✅ Model Loaded Successfully!")
    print(f"Using Device: {device}")
    print(f"Classes: {classes}")

Loading BERT model from ./saved_bert_model...
✅ Model Loaded Successfully!
Using Device: cuda
Classes: ['Freelance' 'Full-time' 'Internship']


In [None]:
def predict_resume(text):
    """
    Accepts a resume string and returns the predicted label and confidence score.
    """
    if not text: return "Empty", 0.0

    # 1. Tokenize
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # 2. Move to GPU/CPU
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # 3. Predict
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=1)
    
    # 4. Decode Result
    confidence, prediction_idx = torch.max(probs, dim=1)
    predicted_label = classes[prediction_idx.item()]
    confidence_score = confidence.item()

    return predicted_label, confidence_score

In [40]:
# --- LIST OF TEST CASES ---
test_cases = [
    # --- INTERNSHIP EXAMPLES ---
    "I spent my summer assisting the backend team with API documentation and minor bug fixes.",
    "Shadowed the Senior UX Designer and helped conduct user research surveys for 3 months.",
    "Part of the university co-op program, working on data entry and basic SQL queries.",
    
    # --- FREELANCE EXAMPLES ---
    "I take on various graphic design projects on Upwork, managing my own schedule and clients.",

    
    # --- FULL-TIME EXAMPLES ---
    "Served as the Lead Developer for 4 years, managing a team of 10 engineers.",
    "Responsible for the end-to-end deployment of the company's main payment gateway.",
    "Employed since 2018 as a Senior Analyst, handling daily operations and quarterly reporting.",
    
    # --- TRICKY / AMBIGUOUS EXAMPLES (The real test!) ---
    "I worked for 6 months covering a maternity leave, handling full server access.", 
    # ^ (Could be Full-time or Contract/Freelance)
    
    "I built the entire mobile app by myself during the weekends while studying.", 
    # ^ (Likely Freelance or Personal Project, shouldn't be Full-time)
    
    "Junior developer responsible for updating the UI, reporting to the CTO." 
    # ^ (Context implies Full-time, but description is simple)
]

# --- RUN BATCH PREDICTION ---
print(f"{'PREDICTION':<15} | {'CONF.':<8} | {'RESUME TEXT'}")
print("="*90)

for text in test_cases:
    label, score = predict_resume(text)
    
    # Color coding for easier reading (optional)
    # 90%+ confidence = High, <70% = Low
    indicator = "✅" if score > 0.9 else "⚠️"
    
    print(f"{label.upper():<15} | {score*100:.1f}% {indicator} | \"{text[:60]}...\"")

PREDICTION      | CONF.    | RESUME TEXT
FULL-TIME       | 100.0% ✅ | "I spent my summer assisting the backend team with API docume..."
FULL-TIME       | 80.5% ⚠️ | "Shadowed the Senior UX Designer and helped conduct user rese..."
INTERNSHIP      | 100.0% ✅ | "Part of the university co-op program, working on data entry ..."
FULL-TIME       | 100.0% ✅ | "I take on various graphic design projects on Upwork, managin..."
FULL-TIME       | 100.0% ✅ | "Served as the Lead Developer for 4 years, managing a team of..."
FULL-TIME       | 100.0% ✅ | "Responsible for the end-to-end deployment of the company's m..."
FULL-TIME       | 99.9% ✅ | "Employed since 2018 as a Senior Analyst, handling daily oper..."
FULL-TIME       | 90.2% ✅ | "I worked for 6 months covering a maternity leave, handling f..."
FULL-TIME       | 93.5% ✅ | "I built the entire mobile app by myself during the weekends ..."
FULL-TIME       | 100.0% ✅ | "Junior developer responsible for updating the UI, reporting ..."
