# Purpose

Job titles come in many different forms, but they often represent the same basic types of jobs. We need the ability to reduce the noise and associate various job titles with discrete job categories. For instance, a Product Manager job could be written as:
- Product Manager
- Sr Product Manager
- AI Product Manager
- Product Manager - Platform
- etc....

We will fine-tune a DistilBERT model so that it can predict the job category from the text of a job posting (the training code below feeds it the cleaned job title). We will save the fine-tuned model and then call it from other Python modules in the future.

## Information about DistilBert


### References:
- https://www.youtube.com/watch?v=ZvsH09XGuZ0



-----


### Step One: Ensure all necessary libraries have been installed.

In [None]:
!pip install scikit-learn==1.5.0
!pip install accelerate transformers[torch]
!pip install datasets boto3 python-dotenv joblib requests
!pip install psycopg2-binary
!pip install pandas apify-client nomic openai pinecone-client
!pip install --no-deps OperationBattleshipCommonUtilities

### Step 2: Import all necessary libraries

In [None]:
import logging
import os
import sys
import requests
import time
import nltk
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from joblib import dump, load

from operation_battleship_common_utilities.JobPostingDao import JobPostingDao
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, Dataset
import boto3
import json
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

### Step 4: Load the environment variables. 

- We can load the .env file if running locally.
- When running on Paperspace, I hit barriers with importing the .env file so I elected to manually set them

In [None]:
# Every value below is the placeholder 'update_value'; set the real values by
# hand before running the later cells.
os.environ.update({
    # DigitalOcean configuration
    'digitalOcean': 'update_value',
    'password': 'update_value',
    'host': 'update_value',
    'port': 'update_value',
    'database': 'update_value',
    'sslmode': 'update_value',

    # Logging configuration
    'LOG_LEVEL': 'update_value',
    'LOG_FILE': 'update_value',

    # DigitalOcean Spaces credentials
    'DO_ACCESS_KEY': 'update_value',
    'DO_SECRET_KEY': 'update_value',
    'DO_REGION': 'update_value',
    'DO_BUCKET_NAME': 'update_value',

    'WANDB_DISABLED': 'update_value',
})

### Step 5: Define Helper Functions

In [None]:
# Download NLTK's stop-word corpus and keep the English list.
# NOTE(review): `stopw` is not referenced by any of the cells visible here —
# confirm whether stop-word removal is still intended.
nltk.download('stopwords')
stopw = stopwords.words('english')

# Configure the root logger: INFO and above, "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def get_jobs_with_required_columns(random_state=None):
    """Fetch every job posting and keep only the columns used for training.

    Args:
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            shuffle non-deterministic.

    Returns:
        A shuffled DataFrame holding ``job_posting_id``, ``job_category``,
        ``job_title`` and ``full_posting_description`` (in that order) with a
        fresh 0..n-1 index. If the DAO returns ``None`` or an empty frame,
        that value is returned untouched so the caller's "no data" check can
        run instead of an indexing error being raised here.
    """
    print("Fetching jobs with required columns")

    job_posting_dao = JobPostingDao()
    all_jobs_df = job_posting_dao.getAllJobs()

    # Nothing to select from: hand the result back for the caller to report.
    if all_jobs_df is None or all_jobs_df.empty:
        return all_jobs_df

    # A single selection both drops the unused columns and fixes the order.
    ordered_columns = [
        'job_posting_id',
        'job_category',
        'job_title',
        'full_posting_description',
    ]
    all_jobs_df = all_jobs_df[ordered_columns]

    # Randomly shuffle the records so later splits are not order-dependent.
    return all_jobs_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

def remove_empty_and_short_rows(df, min_words=80):
    """Keep only labelled postings whose description is long enough.

    Rows are dropped when ``job_category`` is missing or equal to
    ``'Unknown'``, or when ``full_posting_description`` is missing, blank,
    or shorter than ``min_words`` whitespace-separated words.

    Args:
        df: DataFrame with ``job_category`` and ``full_posting_description``
            columns. It is not modified.
        min_words: Minimum number of words a description must contain.
            Defaults to 80.

    Returns:
        A new DataFrame with the surviving rows, an added integer
        ``word_count`` column, and a fresh 0..n-1 index.
    """
    print("Removing data for inference")

    kept = df.dropna(subset=['job_category', 'full_posting_description'])
    kept = kept[kept['job_category'] != 'Unknown']

    # Count words on the raw values (stringified) rather than via the .str
    # accessor, so a non-text column does not raise.
    word_counts = (
        kept['full_posting_description']
        .map(lambda text: len(str(text).split()))
        .astype('int64')
    )
    # assign() builds a new frame, avoiding a write into a filtered slice.
    kept = kept.assign(word_count=word_counts)

    # A blank description has zero words, so a floor of 1 also removes
    # whitespace-only descriptions when min_words is 0.
    threshold = max(min_words, 1)
    kept = kept[kept['word_count'] >= threshold]

    return kept.reset_index(drop=True)

def encodeJobCategories(df, le): ##
    """Attach the encoder's output for ``job_category`` to the frame.

    ``le.transform`` is applied to the ``job_category`` column and the result
    is stored in a new ``encoded_categories`` column. The frame is modified
    in place and the same object is returned.
    """
    category_ids = le.transform(df['job_category'])
    df['encoded_categories'] = category_ids
    return df


def clean_text(text):
    """Lowercase ``text`` and drop every character that is not alphanumeric
    or whitespace.

    Args:
        text: The value to clean. ``None`` and NaN are treated as missing;
            other non-string values are converted with ``str()`` first.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # Missing values arrive as None or float NaN (NaN is the only value that
    # is not equal to itself); return an empty string instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    return ''.join(ch for ch in text.lower() if ch.isalnum() or ch.isspace())

def upload_to_s3(file_path, bucket_name, object_name):
    """Upload a local file to a DigitalOcean Spaces bucket through the S3 API.

    Credentials and region are read from the ``DO_ACCESS_KEY``,
    ``DO_SECRET_KEY`` and ``DO_REGION`` environment variables. The endpoint
    defaults to ``https://<DO_REGION>.digitaloceanspaces.com`` and can be
    overridden with the optional ``DO_ENDPOINT_URL`` environment variable.

    Args:
        file_path: Path of the local file to upload.
        bucket_name: Bucket to upload into. When falsy, the
            ``DO_BUCKET_NAME`` environment variable is used instead.
        object_name: Key to store the file under inside the bucket.

    Returns:
        ``True`` if the upload call completed, ``False`` if it raised. The
        failure is printed rather than propagated (best-effort upload).

    Raises:
        KeyError: If a required environment variable is not set.
    """
    region = os.environ['DO_REGION']
    # Spaces speaks the S3 protocol but is not hosted by AWS; without an
    # explicit endpoint boto3 would send the request to AWS S3.
    endpoint_url = os.environ.get('DO_ENDPOINT_URL') or f"https://{region}.digitaloceanspaces.com"
    # Upload to the bucket the caller asked for, so the log line below
    # matches what actually happened.
    target_bucket = bucket_name or os.environ['DO_BUCKET_NAME']

    s3_client = boto3.client('s3',
                             region_name=region,
                             endpoint_url=endpoint_url,
                             aws_access_key_id=os.environ['DO_ACCESS_KEY'],
                             aws_secret_access_key=os.environ['DO_SECRET_KEY'])
    try:
        s3_client.upload_file(file_path, target_bucket, object_name)
        print(f"File {file_path} uploaded to {target_bucket}/{object_name}")
        return True
    except Exception as e:
        print(f"Failed to upload {file_path} to S3: {e}")
        return False

def download_from_s3(url, destination, timeout=60):
    """Download ``url`` over HTTP and write the response body to ``destination``.

    Args:
        url: Address of the file to fetch.
        destination: Local path the bytes are written to.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 60.

    Returns:
        ``True`` if the file was written, ``False`` otherwise. Failures are
        printed rather than raised (best-effort download).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Without this check a 403/404 error page would be saved as if it
        # were the requested file and only fail later when it is loaded.
        response.raise_for_status()
        with open(destination, 'wb') as f:
            f.write(response.content)
        print(f"File downloaded from {url} to {destination}")
        return True
    except Exception as e:
        print(f"Failed to download file from {url}: {e}")
        return False

### Step 6: Define the functions for measuring and benchmarking the trained model. 

In [None]:
def compute_metrics(p):
    """Turn a prediction object into accuracy and weighted precision/recall/F1.

    ``p.predictions`` is reduced with an argmax over axis 1 and compared
    against ``p.label_ids``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1``.
    """
    gold_ids = p.label_ids
    predicted_ids = np.argmax(p.predictions, axis=1)

    # The fourth element (support) is not reported; zip() stops after three.
    weighted_scores = precision_recall_fscore_support(gold_ids, predicted_ids, average='weighted')
    named_scores = dict(zip(('precision', 'recall', 'f1'), weighted_scores))

    return {'accuracy': accuracy_score(gold_ids, predicted_ids), **named_scores}

def evaluate_model(trainer, test_dataset, label_encoder):
    """Print evaluation metrics for ``test_dataset`` and plot diagnostics.

    Runs ``trainer.predict`` on the dataset, prints accuracy, weighted
    precision/recall/F1 and a per-class classification report, then shows a
    confusion-matrix heatmap and the logged training/validation loss curves.

    Args:
        trainer: Object exposing ``predict(dataset)`` and
            ``state.log_history`` (a list of dicts).
        test_dataset: Dataset passed to ``trainer.predict``.
        label_encoder: Object whose ``classes_`` lists the class names;
            position ``i`` is used as the name of label id ``i``.

    Returns:
        None.
    """
    # Get predictions
    preds_output = trainer.predict(test_dataset)
    predictions = np.argmax(preds_output.predictions, axis=1)
    true_labels = preds_output.label_ids

    # Pin the label set to every known class. Otherwise a class absent from
    # this split shrinks the matrix/report and the names no longer line up.
    class_names = list(label_encoder.classes_)
    class_ids = np.arange(len(class_names))

    # Accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # Precision, Recall, F1-Score (classes with no predictions score 0).
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='weighted', zero_division=0
    )

    # Confusion Matrix
    conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)

    # Classification Report
    class_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )

    # Print metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print("\nClassification Report:")
    print(class_report)

    # Plot Confusion Matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # Training and Validation Loss Curves
    training_logs = trainer.state.log_history

    # Training and evaluation losses may be logged at different intervals, so
    # each point is placed at its own recorded step; the list position is
    # only a fallback when an entry carries no 'step'.
    # NOTE(review): every 'eval_loss' entry is drawn — presumably including
    # evaluations run after training finished; confirm if that matters.
    train_points = [
        (entry.get('step', position), entry['loss'])
        for position, entry in enumerate(training_logs)
        if 'loss' in entry
    ]
    eval_points = [
        (entry.get('step', position), entry['eval_loss'])
        for position, entry in enumerate(training_logs)
        if 'eval_loss' in entry
    ]

    plt.figure(figsize=(10, 8))
    plt.plot([step for step, _ in train_points], [loss for _, loss in train_points], label='Training Loss')
    plt.plot([step for step, _ in eval_points], [loss for _, loss in eval_points], label='Validation Loss')
    plt.xlabel('Training step')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss Curves')
    plt.legend()
    plt.show()

### Step 7: Train the model. 

1. Get the list of jobs with labels
2. Encode the job_category
3. Clean the Job Title (lowercase it and strip non-alphanumeric characters; the stop-word list is downloaded but stop words are not yet removed)
4. Define the model
5. Train the model
6. Benchmark the model
7. Persist the model to a DigitalOcean Spaces (S3-compatible) bucket

In [None]:
def _supported_kwarg(target, preferred, legacy):
    """Return ``preferred`` if ``target`` accepts that keyword, else ``legacy``."""
    import inspect

    try:
        parameters = inspect.signature(target).parameters
    except (TypeError, ValueError):
        return legacy
    return preferred if preferred in parameters else legacy


def _upload_directory(local_dir, bucket_name):
    """Upload every file under ``local_dir``; return the paths reported as failed."""
    failed = []
    for root, _, files in os.walk(local_dir):
        for file_name in files:
            local_path = os.path.join(root, file_name)
            object_name = os.path.relpath(local_path, local_dir)
            # upload_to_s3 prints its own errors instead of raising; only an
            # explicit False return is counted as a failure here.
            if upload_to_s3(local_path, bucket_name, object_name) is False:
                failed.append(local_path)
    return failed


def main():
    """Run the full pipeline: load data, fine-tune DistilBERT, upload, evaluate.

    Steps:
        1. Load job postings and filter them with
           ``remove_empty_and_short_rows``.
        2. Download the label encoder and encode ``job_category``.
        3. Clean and tokenize ``job_title``; split 80/10/10 into
           train/validation/test.
        4. Fine-tune ``distilbert-base-uncased`` with the Hugging Face
           ``Trainer``.
        5. Save the model and tokenizer to ``distilbert_model`` and upload
           the files.
        6. Evaluate on the test split, write and upload
           ``results/distilbert_metrics.json``, and show the plots.

    Any exception is caught, printed and logged with its traceback; the
    function always returns ``None``.
    """
    try:
        # torch is installed through transformers[torch]; it is imported here
        # because the notebook's import cell does not import it.
        import torch

        start_time = time.time()  # Start timer

        # Get the jobs from the DB and drop all unnecessary columns
        jobsDf = get_jobs_with_required_columns()
        if jobsDf is None or jobsDf.empty:
            print("No job data retrieved or DataFrame is empty")
            return

        jobsDf = remove_empty_and_short_rows(jobsDf)
        if jobsDf is None or jobsDf.empty:
            print("No job data")
            return

        # Download and load the label encoder
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code, so this URL must stay under our control.
        label_encoder_path = 'label_encoder.joblib'
        download_from_s3("https://operationbattleship-resumes.nyc3.cdn.digitaloceanspaces.com/label_encoder.joblib", label_encoder_path)
        label_encoder = load(label_encoder_path)
        print("Label encoder loaded.")

        jobsDf = encodeJobCategories(jobsDf, label_encoder)
        if jobsDf is None or jobsDf.empty:
            print("No job data after category encoding")
            return

        print("Data Prep Complete")
        print(f"Number of rows in jobsDf: {jobsDf.shape[0]}")

        # Titles are not filtered earlier; a missing one would crash the
        # cleaning step, so drop those rows before cleaning.
        jobsDf = jobsDf.dropna(subset=['job_title']).reset_index(drop=True)
        if jobsDf.empty:
            print("No job data")
            return

        jobsDf['job_title'] = jobsDf['job_title'].astype(str).apply(clean_text)

        data_texts = jobsDf["job_title"].tolist()
        data_labels = jobsDf["encoded_categories"].tolist()

        print(f"Type of object for data_texts: {type(data_texts)}")
        print(f"Type of object for data_labels: {type(data_labels)}")

        # Tokenize data using DistilBERT tokenizer
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        encodings = tokenizer(data_texts, truncation=True, padding=True, max_length=512)

        # Create a Dataset from the tokenized inputs and labels
        dataset = Dataset.from_dict({
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],
            'labels': data_labels
        })

        # 80% train, then the remaining 20% is halved into validation/test.
        train_testvalid = dataset.train_test_split(test_size=0.2)
        train_testvalid['validation'] = train_testvalid.pop('test').train_test_split(test_size=0.5)

        train_dataset = train_testvalid['train']
        val_dataset = train_testvalid['validation']['test']
        test_dataset = train_testvalid['validation']['train']

        # Define model
        # Size the classification head from the encoder, not from the labels
        # that happen to be present: label ids index into the encoder's
        # classes, so a category missing from this data would otherwise leave
        # the head too small for the highest id.
        num_labels = len(label_encoder.classes_)
        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

        training_kwargs = dict(
            output_dir='./results',
            num_train_epochs=5,  # Increased number of epochs for potentially better performance
            per_device_train_batch_size=8,  # Adjust based on your GPU memory
            per_device_eval_batch_size=8,  # Adjust based on your GPU memory
            warmup_steps=100,  # Adjusted for a shorter warm-up period
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=50,  # Adjust logging steps to reduce overhead
            eval_steps=500,  # Evaluate every 500 steps
            save_strategy="steps",  # Save model more frequently
            save_steps=500,  # Save every 500 steps
            save_total_limit=2,  # Keep only the last 2 models to save disk space
            load_best_model_at_end=True,
            report_to="none",
            fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is present
            gradient_accumulation_steps=2,  # Accumulate gradients to simulate larger batch size
            learning_rate=5e-5,  # Explicitly set learning rate
            logging_first_step=True,  # Log the first step to monitor initial progress
            metric_for_best_model="accuracy",  # Assuming accuracy is your key metric
        )
        # Newer transformers releases call this argument `eval_strategy`;
        # use whichever name the installed version accepts.
        eval_strategy_key = _supported_kwarg(TrainingArguments, 'eval_strategy', 'evaluation_strategy')
        training_kwargs[eval_strategy_key] = "steps"
        training_args = TrainingArguments(**training_kwargs)

        data_collator = DataCollatorWithPadding(tokenizer)

        trainer_kwargs = dict(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            compute_metrics=compute_metrics,  # Pass the compute_metrics function
        )
        # Same compatibility handling for `tokenizer` -> `processing_class`.
        tokenizer_key = _supported_kwarg(Trainer, 'processing_class', 'tokenizer')
        trainer_kwargs[tokenizer_key] = tokenizer
        trainer = Trainer(**trainer_kwargs)

        # Train the model
        trainer.train()

        # Save the model locally
        model_path = 'distilbert_model'
        trainer.save_model(model_path)
        tokenizer.save_pretrained(model_path)

        # Upload the model to S3
        failed_uploads = _upload_directory(model_path, os.environ['DO_BUCKET_NAME'])
        if failed_uploads:
            print(f"Model upload incomplete; failed files: {failed_uploads}")
        else:
            print("Model uploaded to S3 successfully.")

        # Evaluate the model
        metrics = trainer.evaluate(eval_dataset=test_dataset)
        os.makedirs('results', exist_ok=True)  # do not rely on the Trainer having created it
        with open('results/distilbert_metrics.json', 'w') as f:
            json.dump(metrics, f)
        upload_to_s3('results/distilbert_metrics.json', os.environ['DO_BUCKET_NAME'], 'results/distilbert_metrics.json')

        evaluate_model(trainer, test_dataset, label_encoder)

        end_time = time.time()  # End timer
        duration = (end_time - start_time) / 60  # Duration in minutes

        print(f"Script complete. Time taken to complete: {duration} minutes")

    except Exception as e:
        print(f"An error occurred: {e}")
        # Keep the traceback; the message alone rarely identifies the step.
        logging.exception("Training pipeline failed")

    return