In [1]:
import os
import re
import pandas as pd
import kagglehub
import torch
from datasets import load_dataset
from transformers import pipeline
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset
# --- CONFIGURATION ---
# Regex fragments for headings that open a work-experience section.
# extract_experience matches them case-insensitively between word boundaries.
EXPERIENCE_HEADERS = [
    r'professional experience',
    r'work experience',
    r'employment history',
    r'work history',
    r'experience',
    r'career history',
    r'professional background',
]

# Regex fragments for headings that mark the start of the following section,
# i.e. where the experience section is cut off.
NEXT_SECTION_HEADERS = [
    r'education',
    r'academic background',
    r'skills',
    r'technical skills',
    r'projects',
    r'personal projects',
    r'certifications',
    r'achievements',
    r'references',
    r'languages',
    r'volunteer',
    r'interests',
]

# ==========================================
# 1. INITIALIZE ZERO-SHOT AI
# ==========================================
print(">>> Step 0: Loading AI Model...")

# Hugging Face pipelines take a CUDA device ordinal for GPU execution and -1 for CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Zero-shot classifier used to assign an employment-type label to each resume section.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# Descriptive label shown to the zero-shot model -> short category name stored
# in the output CSV. Longer, descriptive labels give the model more context.
LABEL_MAP = {
    "Full-time employee with professional experience": "Full-time",
    "Freelance contractor or self-employed consultant": "Freelance",
    "Student intern or trainee": "Internship",
}

# Candidate labels passed to the classifier, in LABEL_MAP insertion order.
CANDIDATE_LABELS = list(LABEL_MAP)

# ==========================================
# 2. LOAD DATA
# ==========================================
def load_data():
    """Download the Kaggle resume dataset and return its resume texts.

    Walks the downloaded dataset directory and collects the ``Resume_str``
    column (renamed to ``text``) of the CSV files found there.  Loading is
    best-effort: if the download or a read fails, the error is reported and
    the function carries on with whatever was collected so far.

    Returns:
        pandas.DataFrame with a single ``text`` column, with NaN rows and
        duplicate texts removed.  When nothing could be loaded the frame is
        empty but still has the ``text`` column.
    """
    print(">>> Step 1: Loading Raw Data...")
    dfs = []

    # Kaggle
    try:
        path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")
        for root, _dirs, files in os.walk(path):
            for file in files:
                if not file.endswith(".csv"):
                    continue
                df = pd.read_csv(os.path.join(root, file))
                if 'Resume_str' in df.columns:
                    dfs.append(df[['Resume_str']].rename(columns={'Resume_str': 'text'}))
                    # One matching CSV per directory is enough; a CSV without
                    # the column must not stop the search for one that has it.
                    break
    except Exception as exc:
        # Best-effort source: report the reason instead of hiding it, and let
        # KeyboardInterrupt/SystemExit propagate.
        print(f"   ! Kaggle dataset could not be loaded: {exc}")

    # # Hugging Face (disabled)
    # try:
    #     dataset = load_dataset("InferencePrince555/Resume-Dataset")
    #     df = dataset['train'].to_pandas()
    #     col = 'Resume_test' if 'Resume_test' in df.columns else df.columns[0]
    #     dfs.append(df[[col]].rename(columns={col: 'text'}))
    # except: pass

    # pd.concat raises ValueError on an empty list, so return an empty frame
    # with the expected column instead.
    if not dfs:
        print("   - Total Raw Resumes: 0")
        return pd.DataFrame(columns=['text'])

    df_combined = pd.concat(dfs, ignore_index=True).dropna().drop_duplicates(subset=['text'])
    print(f"   - Total Raw Resumes: {len(df_combined)}")
    return df_combined

# ==========================================
# 3. EXTRACT EXPERIENCE SECTION
# ==========================================
def extract_experience(text):
    """Return the work-experience section of a resume, or None if absent.

    The section starts at the earliest match of any EXPERIENCE_HEADERS pattern
    and ends just before the earliest NEXT_SECTION_HEADERS pattern found after
    that start (or at the end of the text when none follows).

    Args:
        text: Resume text; non-string values are converted with ``str``.

    Returns:
        The stripped section (heading included) in its original casing, or
        ``None`` when no experience heading is found.
    """
    # Work on a single string throughout.  Matching case-insensitively on the
    # original text (rather than on a lower-cased copy) keeps the match offsets
    # valid for slicing: lower() can change the length of some Unicode text.
    text = str(text)

    start_idx = -1
    for header in EXPERIENCE_HEADERS:
        match = re.search(rf'\b{header}\b', text, flags=re.IGNORECASE)
        if match and (start_idx == -1 or match.start() < start_idx):
            start_idx = match.start()

    if start_idx == -1:
        return None

    # Only headings that appear after the section start can close it.
    search_text = text[start_idx:]
    end_idx = len(text)
    for header in NEXT_SECTION_HEADERS:
        match = re.search(rf'\b{header}\b', search_text, flags=re.IGNORECASE)
        if match:
            end_idx = min(end_idx, start_idx + match.start())

    return text[start_idx:end_idx].strip()

# ==========================================
# 4. AI LABELING FUNCTION
# ==========================================
def get_ai_label_batch(texts):
    """Classify a batch of texts and return their short category names.

    Args:
        texts: A list of strings (a single string is also accepted).

    Returns:
        A list with one LABEL_MAP value per input text, taken from the
        classifier's top-ranked label.  An empty list/tuple yields ``[]``.
    """
    # The zero-shot pipeline rejects an empty sequence list, so answer that
    # case directly instead of calling the model.
    if isinstance(texts, (list, tuple)) and not texts:
        return []

    results = classifier(texts, CANDIDATE_LABELS, multi_label=False)

    # A single input produces one result dict rather than a list of dicts.
    if isinstance(results, dict):
        results = [results]

    # The pipeline returns 'labels' ordered by likelihood, so index 0 is the winner.
    return [LABEL_MAP[res['labels'][0]] for res in results]

# ==========================================
# 5. MAIN EXECUTION (OPTIMIZED)
# ==========================================
# Labeling script: load resumes, cut out their experience sections, label each
# section with the zero-shot classifier and save the result as a CSV.
if __name__ == "__main__":
    df = load_data()

    print("\n>>> Step 2: Extracting Experience Sections...")
    df['extracted_text'] = df['text'].apply(extract_experience)

    # Drop resumes where no experience heading was found (extract_experience
    # returned None) and sections of 50 characters or fewer.
    df_clean = df.dropna(subset=['extracted_text']).copy()
    df_clean = df_clean[df_clean['extracted_text'].str.len() > 50]

    # Reset the index so the rows line up positionally with the label list
    # built below and with the Dataset conversion.
    df_clean = df_clean.reset_index(drop=True)

    print(f"\n>>> Step 3: AI Labeling ({len(df_clean)} samples) using Optimized Dataset Pipeline...")

    # 1. Convert Pandas DF to Hugging Face Dataset
    hf_dataset = Dataset.from_pandas(df_clean[['extracted_text']])

    # 2. Run the pipeline over the dataset
    # KeyDataset tells the pipeline to look at the "extracted_text" column
    # batch_size=16 works well for 16GB VRAM. Reduce to 8 if you get OOM errors.
    results = classifier(
        KeyDataset(hf_dataset, "extracted_text"),
        candidate_labels=CANDIDATE_LABELS,
        multi_label=False,
        batch_size=16, 
        truncation=True  # Cut over-long inputs (presumably at the model's 1024-token limit — confirm)
    )

    # 3. Collect results: one short category name per row, in dataset order.
    ai_labels = []
    # 'results' is consumed lazily here; tqdm shows progress over the known
    # number of rows.
    for res in tqdm(results, total=len(hf_dataset)):
        top_label = res['labels'][0]
        ai_labels.append(LABEL_MAP[top_label])

    # 4. Save
    df_clean['label'] = ai_labels

    print("\n   - Final AI Distribution:")
    print(df_clean['label'].value_counts())

    # The training file has exactly two columns: 'text' and 'label'.
    df_training = df_clean[['extracted_text', 'label']].rename(columns={'extracted_text': 'text'})
    filename = "ai_labeled_experience_data.csv"
    df_training.to_csv(filename, index=False)
    print(f"\n>>> DONE: Saved to '{filename}'")

>>> Step 0: Loading AI Model...



Device set to use cuda:0


>>> Step 1: Loading Raw Data...
   - Total Raw Resumes: 2482

>>> Step 2: Extracting Experience Sections...

>>> Step 3: AI Labeling (2380 samples) using Optimized Dataset Pipeline...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2380/2380 [30:50<00:00,  1.29it/s]



   - Final AI Distribution:
label
Full-time     2114
Freelance      152
Internship     114
Name: count, dtype: int64

>>> DONE: Saved to 'ai_labeled_experience_data.csv'


In [5]:
import pandas as pd
import os

# Labeled CSV previewed by this cell; nothing is shown when it is missing.
filename = "robust_experience_training_data.csv"

if not os.path.exists(filename):
    print(f"Error: '{filename}' not found.")
    print("Please run the 'Final Data Preparation' script (previous step) first to generate the file.")

else:
    print(f"Loading '{filename}'...")
    df = pd.read_csv(filename)

    # --- VISUALIZATION FUNCTION ---
    def view_samples(category):
        """Print up to three fixed-seed example texts labeled `category`."""
        banner = "=" * 80
        print("\n" + banner)
        print(f"  CATEGORY: {category.upper()}")
        print(banner)

        # Rows carrying the requested label
        matching = df[df['label'] == category]
        if matching.empty:
            print("  (No samples found)")
            return

        # random_state pins the selection so reruns show the same rows
        picked = matching.sample(n=min(3, len(matching)), random_state=42)

        for number, row in enumerate(picked.itertuples(), start=1):
            divider = "-" * 20
            print(f"\nSample #{number}:")
            print(divider)

            # First 400 characters, flattened onto a single line
            text_preview = str(row.text)[:400].replace('\n', ' ')
            print(f"\"{text_preview}...\"")
            print(divider)

    # --- RUN FOR EACH CLASS ---
    view_samples('Freelance')
    view_samples('Internship')
    view_samples('Full-time')

Loading 'robust_experience_training_data.csv'...

  CATEGORY: FREELANCE

Sample #1:
--------------------
"experience in IT industry and 3 years of experience in ServiceNow Platform Over 6 years of experience as a QA consultant and was responsible for testing efforts for implementation of all RMS change requests and supported multiple..."
--------------------

Sample #2:
--------------------
"Experience 01 1996 to Current Consultant Company Name City State Expanded new business opportunities in Texas for Program Management firm Developed contacts with Owners and Architects to develop relationships and solicit project possibilities Provided Project Management and Cost Consulting Services to Owners and Architects on major Higher..."
--------------------

Sample #3:
--------------------
"experience in IT industry and 3 years of experience in ServiceNow Platform Over 6 years of experience as a QA consultant and was responsible for testing efforts for implementation of all RMS change request

The next cell is supposed to balance the classes (data balancing).

In [6]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import numpy as np

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Labeled data produced by the zero-shot labeling step, and where the
# balanced result is written.
INPUT_FILE = "ai_labeled_experience_data.csv"
OUTPUT_FILE = "generative_balanced_data.csv"

# Number of rows every class should end up with.
TARGET_COUNT = 2114

# T5 checkpoint fine-tuned for paraphrasing: it rewrites an existing passage
# rather than generating free-form text.
MODEL_NAME = "Vamsi/T5_Paraphrase_Paws"

# Run on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ==========================================
# 2. INITIALIZE GENERATIVE MODEL
# ==========================================
print(f"üöÄ Loading T5 Generative Model on {device.upper()}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the seq2seq weights, then move them to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)

def generate_paraphrase(text, num_return_sequences=1):
    """Sample paraphrases of `text` from the T5 paraphrasing model.

    Args:
        text: Passage to rewrite.
        num_return_sequences: Number of sampled paraphrases to return.

    Returns:
        A list of decoded paraphrase strings, one per returned sequence.
    """
    # The checkpoint is prompted with the "paraphrase: " task prefix.
    prompt = "paraphrase: " + text + " </s>"

    # Truncate at tokenization so an over-long passage cannot produce an
    # unbounded encoder input; 512 matches the generation limit used below.
    encoding = tokenizer(
        prompt,
        padding="longest",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    # Sampling (top-k / top-p) rather than greedy decoding, so repeated calls
    # on the same passage can return different rewrites.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=512,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=num_return_sequences,
    )

    return [
        tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for output in outputs
    ]

# ==========================================
# 3. AUGMENTATION LOOP
# ==========================================
def augment_class_generative(df, label, target_count, max_attempts=None):
    """Bring one class to `target_count` rows, paraphrasing to fill the gap.

    Args:
        df: DataFrame with 'text' and 'label' columns.
        label: Class to process.
        target_count: Desired number of rows for the class.
        max_attempts: Upper bound on paraphrase attempts; defaults to
            20 attempts per missing row.

    Returns:
        DataFrame for this class.  Classes with at least `target_count` rows
        are down-sampled (fixed seed).  Smaller classes keep their original
        rows followed by generated ones.  The result can be shorter than
        `target_count` when the class has no usable text or the attempt limit
        is reached.
    """
    existing_data = df[df['label'] == label]
    current_count = len(existing_data)

    if current_count >= target_count:
        print(f"‚úÖ {label} is sufficient. Trimming.")
        return existing_data.sample(target_count, random_state=42)

    needed = target_count - current_count
    print(f"‚ö° Generative Augmentation for {label}: Generating {needed} new samples...")

    # Missing or blank texts cannot be paraphrased; keep them out of the pool.
    texts = [t for t in existing_data['text'].dropna().astype(str).tolist() if t.strip()]
    if not texts:
        print(f"   ! No source texts available for {label}; skipping augmentation.")
        return existing_data

    if max_attempts is None:
        max_attempts = 20 * needed

    new_rows = []
    seen = set()      # lower-cased paraphrases already accepted
    attempts = 0
    failures = 0

    pbar = tqdm(total=needed)
    try:
        # Bounded loop: stops at the attempt limit instead of spinning forever
        # when generation keeps failing or keeps repeating itself.
        while len(new_rows) < needed and attempts < max_attempts:
            attempts += 1

            # T5 works best on sentences/paragraphs, so paraphrase only the
            # first ~500 characters of a randomly chosen original.
            original = np.random.choice(texts)[:500]

            try:
                new_text = generate_paraphrase(original, num_return_sequences=1)[0]
            except Exception:
                # Skip a sample that fails to generate; unlike a bare
                # `except`, this lets KeyboardInterrupt stop the loop.
                failures += 1
                continue

            key = new_text.strip().lower()
            # Reject empty output, verbatim copies and repeats of earlier output.
            if not key or key == original.lower() or key in seen:
                continue

            seen.add(key)
            new_rows.append({'text': new_text, 'label': label})
            pbar.update(1)
    finally:
        pbar.close()

    if len(new_rows) < needed:
        print(
            f"   ! Only {len(new_rows)} of {needed} samples generated for {label} "
            f"after {attempts} attempts ({failures} generation errors)."
        )

    synthetic_df = pd.DataFrame(new_rows, columns=['text', 'label'])
    combined_df = pd.concat([existing_data, synthetic_df], ignore_index=True)

    return combined_df.iloc[:target_count]

# ==========================================
# 4. MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    df = pd.read_csv(INPUT_FILE)

    # The same call handles every class: classes at or above TARGET_COUNT are
    # down-sampled, smaller ones are topped up with paraphrases.
    dfs = [
        augment_class_generative(df, class_label, TARGET_COUNT)
        for class_label in ("Full-time", "Freelance", "Internship")
    ]

    # Merge the per-class frames and shuffle them with a fixed seed.
    final_df = pd.concat(dfs)
    final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\n‚úÖ Final Generative Distribution:")
    print(final_df['label'].value_counts())

    final_df.to_csv(OUTPUT_FILE, index=False)
    print(f"üíæ Saved high-quality data to {OUTPUT_FILE}")

üöÄ Loading T5 Generative Model on CUDA...


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Exception in thread Thread-4:
Traceback (most recent call last):
  File "c:\Python311\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "c:\Python311\Lib\site-packages\tqdm\_monitor.py", line 84, in run
    instance.refresh(nolock=True)
  File "c:\Python311\Lib\site-packages\tqdm\std.py", line 1347, in refresh
    self.display()
  File "c:\Python311\Lib\site-packages\tqdm\notebook.py", line 171, in display
    rtext.value = right
    ^^^^^^^^^^^
  File "C:\Users\charls\AppData\Roaming\Python\Python311\site-packages\traitlets\traitlets.py", line 716, in __set__
    self.set(obj, value)
  File "C:\Users\charls\AppData\Roaming\Python\Python311\site-packages\traitlets\traitlets.py", line 706, in set
    obj._notify_trait(self.name, old_value, new_value)
  File "C:\Users\charls\AppData\Roaming\Python\Python311\site-packages\traitlets\traitlets.py", line 1513, in _notify_trait
    self.notify_change(
  File "C:\Users\charls\AppData\Roaming\Python\Python311\site-packages

‚ö° Generative Augmentation for Full-time: Generating 1 new samples...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:05<00:00,  5.32s/it]


‚ö° Generative Augmentation for Freelance: Generating 1962 new samples...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1962/1962 [1:44:23<00:00,  3.19s/it]  


‚ö° Generative Augmentation for Internship: Generating 2000 new samples...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [35:00<00:00,  1.05s/it]



‚úÖ Final Generative Distribution:
label
Full-time     2114
Internship    2114
Freelance     2114
Name: count, dtype: int64
üíæ Saved high-quality data to generative_balanced_data.csv


In [7]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding
)
from datasets import Dataset

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Paths
INPUT_FILE = "generative_balanced_data.csv"
OUTPUT_DIR = "./saved_bert_model_final"

# Base checkpoint. "bert-base-uncased" is the larger alternative.
MODEL_CHECKPOINT = "distilbert-base-uncased"

# Training hyper-parameters
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_LEN = 128  # maximum number of tokens kept per example

# Class index <-> class name. The names must match the 'label' column exactly.
id2label = {0: "Freelance", 1: "Full-time", 2: "Internship"}
label2id = {name: index for index, name in id2label.items()}

# ==========================================
# 2. DATA PREPARATION
# ==========================================
def load_and_prepare_data():
    """Load the balanced CSV and return stratified train/validation datasets.

    Reads INPUT_FILE, drops rows without text or label, shuffles with a fixed
    seed, maps labels to integer ids through `label2id` and splits 80/20,
    stratified by class.

    Returns:
        (train_ds, val_ds): Hugging Face Datasets with 'text' and 'label_id'
        columns.  The split frames keep their shuffled row index, which
        ``Dataset.from_pandas`` stores as an extra '__index_level_0__' column.

    Raises:
        ValueError: If the file contains a label that is not in `label2id`.
    """
    print("‚è≥ Loading balanced dataset...")
    df = pd.read_csv(INPUT_FILE)

    # An empty CSV cell is read back as NaN; such rows can be neither
    # tokenized nor assigned a class.
    df = df.dropna(subset=['text', 'label'])

    # Shuffle
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Map text labels to integers. An unknown label maps to NaN, which would
    # otherwise turn the id column into floats and fail later.
    df['label_id'] = df['label'].map(label2id)
    unknown = df.loc[df['label_id'].isna(), 'label'].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unknown labels in {INPUT_FILE}: {list(unknown)}")
    df['label_id'] = df['label_id'].astype(int)

    # Split Train/Test (80/20)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_id'])

    print(f"‚úÖ Training Samples: {len(train_df)}")
    print(f"‚úÖ Validation Samples: {len(val_df)}")

    # Convert to Hugging Face Datasets
    train_ds = Dataset.from_pandas(train_df[['text', 'label_id']])
    val_ds = Dataset.from_pandas(val_df[['text', 'label_id']])

    return train_ds, val_ds

# ==========================================
# 3. TOKENIZATION
# ==========================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def preprocess_function(examples):
    """Tokenize the 'text' field of a batch, truncated to MAX_LEN tokens.

    No padding is applied here; padding is left to the data collator.
    """
    batch_texts = examples["text"]
    return tokenizer(
        batch_texts,
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )

# ==========================================
# 4. METRICS FUNCTION
# ==========================================
def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted_ids = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_ids)}

# ==========================================
# 5. MAIN TRAINING PIPELINE
# ==========================================
# Fine-tuning script: tokenize the balanced data, train the sequence
# classifier, evaluate it and save the model together with its class list.
if __name__ == "__main__":
    # A. Load Data
    train_dataset, val_dataset = load_and_prepare_data()

    # B. Tokenize
    print("‚è≥ Tokenizing data...")
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_val = val_dataset.map(preprocess_function, batched=True)

    # Rename 'label_id' to 'labels' (Required by HF Trainer)
    tokenized_train = tokenized_train.rename_column("label_id", "labels")
    tokenized_val = tokenized_val.rename_column("label_id", "labels")

    # Remove columns the model does not consume. "__index_level_0__" exists
    # only when the pandas index was serialized, and remove_columns raises on
    # a missing name, so drop just the ones that are present.
    tokenized_train = tokenized_train.remove_columns(
        [c for c in ("text", "__index_level_0__") if c in tokenized_train.column_names]
    )
    tokenized_val = tokenized_val.remove_columns(
        [c for c in ("text", "__index_level_0__") if c in tokenized_val.column_names]
    )

    # C. Data Collator (Dynamic Padding): pads each batch to its longest member
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # D. Initialize Model
    print(f"üöÄ Initializing Model: {MODEL_CHECKPOINT}")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=3,
        id2label=id2label,
        label2id=label2id
    )

    # E. Training Arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none"  # Disable WandB logging
    )

    # F. Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # G. TRAIN!
    print("\nüî• STARTING TRAINING...")
    trainer.train()

    # H. Evaluate
    print("\nüìä FINAL EVALUATION:")
    eval_results = trainer.evaluate()
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")

    # I. Save Final Model
    print(f"\nüíæ Saving model to {OUTPUT_DIR}...")
    trainer.save_model(OUTPUT_DIR)

    # Also save classes.npy for the inference script. The order is derived
    # from id2label so the saved names cannot drift from the model's indices.
    np.save(f"{OUTPUT_DIR}/classes.npy", np.array([id2label[i] for i in sorted(id2label)]))
    print("‚úÖ Training Complete! You can now use this model in your ranking pipeline.")

‚è≥ Loading balanced dataset...
‚úÖ Training Samples: 5073
‚úÖ Validation Samples: 1269
‚è≥ Tokenizing data...


Map:   0%|          | 0/5073 [00:00<?, ? examples/s]

Map:   0%|          | 0/1269 [00:00<?, ? examples/s]

üöÄ Initializing Model: distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



üî• STARTING TRAINING...


  0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

{'eval_loss': 0.2584383487701416, 'eval_accuracy': 0.91725768321513, 'eval_runtime': 4.8488, 'eval_samples_per_second': 261.713, 'eval_steps_per_second': 16.499, 'epoch': 1.0}
{'loss': 0.4141, 'grad_norm': 1.7056277990341187, 'learning_rate': 9.517819706498952e-06, 'epoch': 1.57}


  0%|          | 0/80 [00:00<?, ?it/s]

{'eval_loss': 0.12816651165485382, 'eval_accuracy': 0.9613869188337274, 'eval_runtime': 5.4047, 'eval_samples_per_second': 234.797, 'eval_steps_per_second': 14.802, 'epoch': 2.0}


  0%|          | 0/80 [00:00<?, ?it/s]

{'eval_loss': 0.11577248573303223, 'eval_accuracy': 0.9676910953506698, 'eval_runtime': 5.3835, 'eval_samples_per_second': 235.722, 'eval_steps_per_second': 14.86, 'epoch': 3.0}
{'train_runtime': 221.2839, 'train_samples_per_second': 68.776, 'train_steps_per_second': 4.311, 'train_loss': 0.27179700023723097, 'epoch': 3.0}

üìä FINAL EVALUATION:


  0%|          | 0/80 [00:00<?, ?it/s]

Accuracy: 0.9677

üíæ Saving model to ./saved_bert_model_final...
‚úÖ Training Complete! You can now use this model in your ranking pipeline.


full training mode

In [14]:
import torch
import numpy as np
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Directory written by the fine-tuning step.
MODEL_PATH = "./saved_bert_model_final"

# Run inference on the GPU when one is available.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

class ExperienceClassifier:
    """Wraps the fine-tuned sequence classifier for single-text prediction.

    Loading is best-effort: if anything fails, the error is printed,
    `self.model` stays ``None`` and `predict` returns ``("Error", 0.0)``.
    """

    def __init__(self, model_dir):
        """Load tokenizer, model and class names from `model_dir`."""
        print(f"‚è≥ Loading model from {model_dir} on {DEVICE}...")

        # Define every attribute up front so a failed load still leaves a
        # consistent object.
        self.tokenizer = None
        self.model = None
        self.labels = ["Freelance", "Full-time", "Internship"]

        # Load Model & Tokenizer
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
            model = AutoModelForSequenceClassification.from_pretrained(model_dir)
            model.to(DEVICE)
            model.eval()  # Set to evaluation mode

            # Load Class Labels (Saved during training)
            classes_file = os.path.join(model_dir, "classes.npy")
            if os.path.exists(classes_file):
                # NOTE(security): allow_pickle=True can execute code embedded
                # in a crafted file; only load classes.npy from a trusted
                # model directory. A plain string array also loads with
                # allow_pickle=False.
                loaded = np.load(classes_file, allow_pickle=True)
                # Plain str, so the label type is the same as the default list.
                self.labels = [str(name) for name in loaded]
            else:
                print("‚ö†Ô∏è Warning: classes.npy not found. Using default labels.")

            # Publish the model only once everything else has loaded.
            self.model = model
            print("‚úÖ Model loaded successfully!")

        except Exception as e:
            print(f"‚ùå Error loading model: {e}")
            self.model = None

    def predict(self, text):
        """Classify one text.

        Returns:
            (label, confidence): the top class name and its softmax
            probability, or ``("Error", 0.0)`` when no model is loaded.
        """
        if self.model is None:
            return "Error", 0.0

        # Tokenize
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        ).to(DEVICE)

        # Inference
        with torch.no_grad():
            logits = self.model(**inputs).logits
            probs = F.softmax(logits, dim=1)

        # Get Top Prediction
        conf, idx = torch.max(probs, dim=1)
        return self.labels[idx.item()], conf.item()

# ==========================================
# 2. RUN TESTS
# ==========================================
# Smoke test: run the fine-tuned classifier on a few hand-written snippets
# and print prediction and confidence for each.
if __name__ == "__main__":
    # Initialize Classifier
    # NOTE: this rebinds the module-level name `classifier`, which the
    # labeling cell uses for the zero-shot pipeline.
    classifier = ExperienceClassifier(MODEL_PATH)

    # Test Cases (Mix of clear and tricky examples)
    test_cases = [
        # Full-time examples
        "fulltime Software Engineer at Google leading a team of 10 developers.",
        "Staff Accountant managing month-end close and general ledger.",

        # Freelance examples
        "Built a custom WordPress site for a local bakery as a one-off project.",
        "Self-employed graphic designer working with various clients on Upwork.",

        # Internship examples
        "Summer Intern assisting the marketing team with social media campaigns.",
        "Engineering Student Trainee shadowing senior developers.",

        # Tricky / Ambiguous examples
        "Contract role to fix specific bugs in the payment gateway (3 months).",
        "Founder of a small startup building an iOS app."
    ]

    print(f"\n{'TEXT SAMPLE':<60} | {'PREDICTION':<12} | {'CONFIDENCE'}")
    print("-" * 90)

    for text in test_cases:
        label, score = classifier.predict(text)

        # Visual confidence bar
        bar = "‚ñà" * int(score * 10)

        # Add the "..." marker only when the text was actually cut to fit the
        # 60-character column.
        preview = text if len(text) <= 60 else text[:57] + '...'

        print(f"{preview:<60} | {label:<12} | {score:.1%} {bar}")

‚è≥ Loading model from ./saved_bert_model_final on cuda...
‚úÖ Model loaded successfully!

TEXT SAMPLE                                                  | PREDICTION   | CONFIDENCE
------------------------------------------------------------------------------------------
fulltime Software Engineer at Google leading a team of 10... | Freelance    | 99.4% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Staff Accountant managing month-end close and general led... | Freelance    | 99.4% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Built a custom WordPress site for a local bakery as a one... | Freelance    | 99.5% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Self-employed graphic designer working with various clien... | Freelance    | 99.4% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Summer Intern assisting the marketing team with social me... | Internship   | 94.8% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Engineering Student Trainee shadowing senior developers....  | Internship   | 99.4% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Contract role to fix specific bugs in the payment gateway

In [41]:
import pandas as pd
import numpy as np
import torch
import nlpaug.augmenter.word as naw
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import os
from tqdm import tqdm

# --- CONFIGURATION ---
# Original, unbalanced labeled data: balancing happens after the split below.
INPUT_FILE = "robust_experience_training_data_unbalanced.csv"
SAVE_PATH = "./saved_bert_model_v2"

# Tokenization / training hyper-parameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 4

# Each class is down-sampled or augmented to this many training rows.
TARGET_PER_CLASS = 3000

# Train on the GPU when one is available.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# --- 1. LOAD & SPLIT FIRST (CRITICAL STEP) ---
# The test split is taken before any augmentation, so augmented text is only
# ever added to the training portion.
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError("Please provide the original 'unbalanced' csv file.")

print("1. Loading Data & Splitting...")
df = pd.read_csv(INPUT_FILE)

# Encode Labels: string class names -> integer ids. `classes` keeps the names
# in the encoder's order.
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
classes = le.classes_
print(f"Classes: {classes}")

# SPLIT FIRST! Keep 20% pure for testing (stratified by class, fixed seed).
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    df['text'].values, 
    df['label_encoded'].values, 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label_encoded']
)

# Reassemble Training Set for Augmentation, restoring the string label so the
# rows can be selected per class name.
train_df = pd.DataFrame({'text': X_train_raw, 'label_encoded': y_train_raw})
train_df['label'] = le.inverse_transform(train_df['label_encoded'])

print(f"Training Samples (Raw): {len(train_df)}")
print(f"Test Samples (Pure): {len(X_test)}")

# --- 2. AUGMENT ONLY TRAINING DATA ---
# Every class is brought to TARGET_PER_CLASS rows: larger classes are
# down-sampled, smaller ones are topped up with contextual word substitutions.
print("\n2. Augmenting Training Data (BERT)...")
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", device='cuda' if torch.cuda.is_available() else 'cpu')

augmented_dfs = []

for class_name in classes:
    # Get all samples for this class
    subset = train_df[train_df['label'] == class_name]

    # If Majority Class (Full-time), Downsample
    if len(subset) > TARGET_PER_CLASS:
        subset = resample(subset, replace=False, n_samples=TARGET_PER_CLASS, random_state=42)
        augmented_dfs.append(subset)

    # If Minority Class, Augment
    else:
        # Add original samples first
        augmented_dfs.append(subset)

        needed = TARGET_PER_CLASS - len(subset)
        print(f"   Augmenting {class_name}: Creating {needed} new samples...")

        new_texts = []
        original_texts = subset['text'].tolist()
        failures = 0

        # Cycle through originals to create new ones
        while len(new_texts) < needed:
            added_this_pass = 0
            for text in original_texts:
                if len(new_texts) >= needed:
                    break
                try:
                    aug_text = aug.augment(str(text))
                except Exception:
                    # Skip a sample the augmenter cannot handle; unlike a bare
                    # `except`, this lets KeyboardInterrupt stop the run.
                    failures += 1
                    continue
                # The augmenter may return a list of variants; use the first.
                if isinstance(aug_text, list):
                    aug_text = aug_text[0] if aug_text else None
                if aug_text and aug_text != text:
                    new_texts.append(aug_text)
                    added_this_pass += 1

            # A full pass that produced nothing will not succeed by repeating
            # it, so stop instead of looping forever.
            if added_this_pass == 0:
                print(f"   ! Stopping early for {class_name}: {len(new_texts)}/{needed} created ({failures} augmentation errors).")
                break

        # Add new synthetic samples
        temp_df = pd.DataFrame({'text': new_texts})
        temp_df['label'] = class_name
        temp_df['label_encoded'] = le.transform([class_name])[0]
        augmented_dfs.append(temp_df)

# Combine and shuffle with a fixed seed
df_train_final = pd.concat(augmented_dfs).sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Final Training Set Size: {len(df_train_final)}")
print(df_train_final['label'].value_counts())

# --- 3. PREPARE DATASETS ---
class ResumeDataset(Dataset):
    """Dataset that tokenizes one stored text per lookup.

    Keeps the texts, labels, tokenizer and maximum length it is given and
    performs tokenization lazily inside ``__getitem__``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the flattened token ids, attention mask and label tensor for ``item``."""
        raw_text = str(self.texts[item])
        target = self.labels[item]

        # Pad/truncate every sample to max_len so samples share one length.
        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Tokenizer shared by both splits.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Training data is the balanced/augmented frame; test data is the held-out split.
train_dataset = ResumeDataset(
    texts=df_train_final['text'].values,
    labels=df_train_final['label_encoded'].values,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
test_dataset = ResumeDataset(
    texts=X_test,
    labels=y_test,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

# Only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

# --- 4. TRAIN ---
# Fine-tune a sequence-classification BERT with one output per class.
print("\n3. Training BERT...")
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(classes),
)
model = model.to(DEVICE)
optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    print(f"Epoch {epoch+1}/{EPOCHS}")

    for batch in tqdm(train_loader):
        # Move the three batch tensors onto the training device.
        input_ids, mask, labels = (
            batch[field].to(DEVICE)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear old gradients, run forward/backward, then update the weights.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"   Avg Loss: {total_loss / len(train_loader)}")

# --- 5. EVALUATE ---
# Score the fine-tuned model on the test loader and print a per-class report.
print("\n4. Final Evaluation (On Pure Unseen Data)...")
model.eval()
preds, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        outputs = model(input_ids, attention_mask=mask)
        # argmax avoids the throwaway `_` binding, which in a notebook would
        # shadow the interactive "last result" variable.
        prediction = torch.argmax(outputs.logits, dim=1)
        preds.extend(prediction.cpu().numpy())
        true_labels.extend(batch['labels'].numpy())

# Pass `labels` explicitly: without it, classification_report raises a
# ValueError whenever a class appears in neither the true labels nor the
# predictions, because the label count no longer matches `target_names`.
# Assumes encoded labels are 0..len(classes)-1 in the order of `classes`
# (the original target_names=classes call relied on the same ordering).
print(classification_report(
    true_labels,
    preds,
    labels=list(range(len(classes))),
    target_names=classes,
    zero_division=0,
))

# Save the model, tokenizer and class names side by side in SAVE_PATH.
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(SAVE_PATH, exist_ok=True)
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
# Store the class names as a plain string array. An object-dtype array would
# be written as a pickle and force readers to use allow_pickle=True.
np.save(os.path.join(SAVE_PATH, 'classes.npy'), np.asarray(classes, dtype=str))
print("Model saved.")

1. Loading Data & Splitting...
Classes: ['Freelance' 'Full-time' 'Internship']
Training Samples (Raw): 26630
Test Samples (Pure): 6658

2. Augmenting Training Data (BERT)...
   Augmenting Freelance: Creating 2668 new samples...
   Augmenting Internship: Creating 2794 new samples...
Final Training Set Size: 9000
label
Internship    3000
Freelance     3000
Full-time     3000
Name: count, dtype: int64

3. Training BERT...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4


100%|██████████| 563/563 [04:06<00:00,  2.28it/s]


   Avg Loss: 0.35194173762591335
Epoch 2/4


100%|██████████| 563/563 [03:58<00:00,  2.36it/s]


   Avg Loss: 0.10765275600172751
Epoch 3/4


100%|██████████| 563/563 [04:01<00:00,  2.33it/s]


   Avg Loss: 0.0458391964432269
Epoch 4/4


100%|██████████| 563/563 [04:00<00:00,  2.34it/s]


   Avg Loss: 0.025387587201315082

4. Final Evaluation (On Pure Unseen Data)...
              precision    recall  f1-score   support

   Freelance       0.30      0.78      0.43        83
   Full-time       1.00      0.96      0.98      6523
  Internship       0.29      0.88      0.44        52

    accuracy                           0.96      6658
   macro avg       0.53      0.88      0.62      6658
weighted avg       0.98      0.96      0.97      6658

Model saved.


This part loads the saved model and prepares it for testing.

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
import os

# --- CONFIGURATION ---
MODEL_PATH = "./saved_bert_model_v2"
MAX_LEN = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- LOAD MODEL & TOKENIZER ---
# A missing directory is reported rather than raised, so any `model`,
# `tokenizer` and `classes` already defined in the session stay untouched.
if not os.path.exists(MODEL_PATH):
    print(f"Error: Model not found at {MODEL_PATH}. Did you run the training cell?")
else:
    print(f"Loading BERT model from {MODEL_PATH}...")

    # Load architecture and weights
    model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
    model = model.to(device)
    # Switch to evaluation mode (e.g. disables dropout); this does not freeze weights.
    model.eval()

    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

    # Load class names
    # NOTE(review): allow_pickle=True will unpickle whatever is stored in
    # classes.npy, so only load files this notebook wrote itself. It is kept
    # so previously saved object-dtype arrays still load.
    classes = np.load(os.path.join(MODEL_PATH, 'classes.npy'), allow_pickle=True)

    print("✅ Model Loaded Successfully!")
    print(f"Using Device: {device}")
    print(f"Classes: {classes}")

Loading BERT model from ./saved_bert_model...
✅ Model Loaded Successfully!
Using Device: cuda
Classes: ['Freelance' 'Full-time' 'Internship']


In [None]:
def predict_resume(text):
    """
    Accepts a resume string and returns the predicted label and confidence score.

    Args:
        text: Resume text to classify.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is the entry of the global
        ``classes`` at the highest-probability index and ``confidence`` is that
        softmax probability as a float. Input that is not a string, or that is
        empty or whitespace-only, returns ``("Empty", 0.0)`` without calling
        the model.
    """
    # None, NaN and other non-strings, plus blank strings, carry no text to
    # classify; return the sentinel instead of passing them to the tokenizer.
    if not isinstance(text, str) or not text.strip():
        return "Empty", 0.0

    # 1. Tokenize (padded/truncated to MAX_LEN)
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # 2. Move to GPU/CPU
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # 3. Predict (no gradients needed for inference)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=1)

    # 4. Decode Result: highest-probability class and its probability
    confidence, prediction_idx = torch.max(probs, dim=1)
    predicted_label = classes[prediction_idx.item()]
    confidence_score = confidence.item()

    return predicted_label, confidence_score

In [40]:
# --- LIST OF TEST CASES ---
# Short hand-written snippets grouped by the label each is expected to receive.
test_cases = [
    # --- INTERNSHIP EXAMPLES ---
    "I spent my summer assisting the backend team with API documentation and minor bug fixes.",
    "Shadowed the Senior UX Designer and helped conduct user research surveys for 3 months.",
    "Part of the university co-op program, working on data entry and basic SQL queries.",
    
    # --- FREELANCE EXAMPLES ---
    "I take on various graphic design projects on Upwork, managing my own schedule and clients.",

    
    # --- FULL-TIME EXAMPLES ---
    "Served as the Lead Developer for 4 years, managing a team of 10 engineers.",
    "Responsible for the end-to-end deployment of the company's main payment gateway.",
    "Employed since 2018 as a Senior Analyst, handling daily operations and quarterly reporting.",
    
    # --- TRICKY / AMBIGUOUS EXAMPLES (The real test!) ---
    "I worked for 6 months covering a maternity leave, handling full server access.", 
    # ^ (Could be Full-time or Contract/Freelance)
    
    "I built the entire mobile app by myself during the weekends while studying.", 
    # ^ (Likely Freelance or Personal Project, shouldn't be Full-time)
    
    "Junior developer responsible for updating the UI, reporting to the CTO." 
    # ^ (Context implies Full-time, but description is simple)
]

# --- RUN BATCH PREDICTION ---
print(f"{'PREDICTION':<15} | {'CONF.':<8} | {'RESUME TEXT'}")
print("="*90)

for text in test_cases:
    label, score = predict_resume(text)
    
    # Visual marker: check mark above 90% confidence, warning sign otherwise.
    indicator = "✅" if score > 0.9 else "⚠️"

    # Show at most 60 characters; add the ellipsis only when text was cut.
    preview = text[:60] + ("..." if len(text) > 60 else "")

    print(f"{label.upper():<15} | {score*100:.1f}% {indicator} | \"{preview}\"")

PREDICTION      | CONF.    | RESUME TEXT
FULL-TIME       | 100.0% ✅ | "I spent my summer assisting the backend team with API docume..."
FULL-TIME       | 80.5% ⚠️ | "Shadowed the Senior UX Designer and helped conduct user rese..."
INTERNSHIP      | 100.0% ✅ | "Part of the university co-op program, working on data entry ..."
FULL-TIME       | 100.0% ✅ | "I take on various graphic design projects on Upwork, managin..."
FULL-TIME       | 100.0% ✅ | "Served as the Lead Developer for 4 years, managing a team of..."
FULL-TIME       | 100.0% ✅ | "Responsible for the end-to-end deployment of the company's m..."
FULL-TIME       | 99.9% ✅ | "Employed since 2018 as a Senior Analyst, handling daily oper..."
FULL-TIME       | 90.2% ✅ | "I worked for 6 months covering a maternity leave, handling f..."
FULL-TIME       | 93.5% ✅ | "I built the entire mobile app by myself during the weekends ..."
FULL-TIME       | 100.0% ‚úÖ | "Junior developer responsible for updating the UI, repor