<a href="https://colab.research.google.com/github/Andrew7101/Glassdoor_Review_Rating_Prediction/blob/main/Bert_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Job Rating Predictor using BERT Regressor (runs on the GPU when available, otherwise falls back to CPU)
"""

# Data manipulation
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Text preprocessing
import re
import contractions
from textblob import TextBlob

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Transformers for BERT
from transformers import BertTokenizer, BertModel, logging

# Keep the notebook output quiet: ignore every Python warning ...
import warnings
warnings.filterwarnings('ignore')

# ... and restrict the transformers logger to errors only.
logging.set_verbosity_error()

# Pick the compute device once; the dataset, model and loops below all read this global.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

def advanced_text_preprocessing(text):
    """Clean one review string and derive simple numeric descriptors from it.

    The input is coerced to ``str`` and lower-cased, contractions are
    expanded, every character other than letters, whitespace and ``. , ! ?``
    is replaced by a space, and runs of whitespace are collapsed.

    Returns:
        tuple: ``(cleaned_text, features)`` where ``features`` is a dict with
        the keys ``text_length``, ``word_count``, ``sentiment_polarity``,
        ``sentiment_subjectivity``, ``exclamation_count``, ``question_count``
        and ``avg_word_length`` (0 when the cleaned text is empty).
    """
    cleaned = contractions.fix(str(text).lower())
    cleaned = re.sub(r'[^a-zA-Z\s\.,!?]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    words = cleaned.split()
    sentiment = TextBlob(cleaned).sentiment

    # Averaging over an empty word list would yield NaN, so empty text maps to 0.
    if cleaned:
        mean_word_length = np.mean([len(word) for word in words])
    else:
        mean_word_length = 0

    features = {
        'text_length': len(cleaned),
        'word_count': len(words),
        'sentiment_polarity': sentiment.polarity,
        'sentiment_subjectivity': sentiment.subjectivity,
        'exclamation_count': cleaned.count('!'),
        'question_count': cleaned.count('?'),
        'avg_word_length': mean_word_length,
    }
    return cleaned, features

class JobDataset(Dataset):
    """Dataset pairing each review text with its row of numeric features.

    Every item is a dict with ``input_ids`` and ``attention_mask`` (each of
    length ``max_length``), ``numerical_features`` (row ``idx`` of the given
    feature container) and, when targets were supplied, ``targets`` as a
    float32 scalar tensor. Token tensors and targets are placed on the
    module-level ``device``.

    NOTE(review): creating tensors on a CUDA device inside ``__getitem__``
    likely rules out ``DataLoader(num_workers > 0)``; the training and
    prediction loops in this notebook rely on batches already being on
    ``device``, so the placement is kept here.

    Args:
        texts: Indexable collection of review texts.
        numerical_features: Indexable collection of feature rows (the callers
            in this notebook pass a tensor already on ``device``).
        targets: Optional indexable collection of regression targets.
        tokenizer: Callable Hugging Face style tokenizer. Required.
        max_length: Fixed token length each text is padded/truncated to.

    Raises:
        ValueError: If ``tokenizer`` is ``None``.
    """

    def __init__(self, texts, numerical_features, targets=None, tokenizer=None, max_length=128):
        # Fail at construction rather than with an obscure AttributeError on
        # the first batch, possibly minutes into a run.
        if tokenizer is None:
            raise ValueError("JobDataset requires a tokenizer")
        self.texts = texts
        self.numerical_features = numerical_features
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])

        # Calling the tokenizer directly is the supported entry point for
        # encoding a single text (replaces the legacy ``encode_plus``).
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dimension of size 1; drop it
        # so the DataLoader can stack items into (batch, max_length).
        item = {
            'input_ids': inputs['input_ids'].squeeze(0).to(device),
            'attention_mask': inputs['attention_mask'].squeeze(0).to(device),
            'numerical_features': self.numerical_features[idx],
        }

        if self.targets is not None:
            # as_tensor accepts both tensor elements and plain numbers without
            # the copy-construct path that torch.tensor(tensor) goes through.
            item['targets'] = torch.as_tensor(
                self.targets[idx], dtype=torch.float32, device=device
            )

        return item

class BertRegressor(nn.Module):
    """Regression head on top of ``bert-base-uncased`` plus numeric features.

    The pooled BERT output is concatenated with a 128-unit ReLU projection of
    the numeric features, passed through dropout (p=0.3) and mapped to a
    single value per example.

    Args:
        n_numerical_features: Width of the numeric feature vector.
    """

    def __init__(self, n_numerical_features):
        super().__init__()
        numeric_width = 128
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.numerical_layer = nn.Linear(n_numerical_features, numeric_width)
        self.out = nn.Linear(self.bert.config.hidden_size + numeric_width, 1)

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return one prediction per row, shaped ``[batch_size, 1]``."""
        # Pooled sentence representation: [batch_size, hidden_size]
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output

        numeric_hidden = torch.relu(self.numerical_layer(numerical_features))

        fused = self.drop(torch.cat((pooled, numeric_hidden), dim=1))
        return self.out(fused)

def save_model_and_data(model, tokenizer, scaler, label_encoder_job_title, file_path):
    """Write the model weights and the preprocessing state next to each other.

    Two files are produced with ``torch.save``:

    * ``<file_path>_model.pth``: the model's ``state_dict``.
    * ``<file_path>_data.pt``: a dict holding the tokenizer object, the
      scaler's ``mean_`` and ``scale_``, and the label encoder's ``classes_``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        tokenizer: Tokenizer object stored as-is in the data file.
        scaler: Fitted scaler exposing ``mean_`` and ``scale_``.
        label_encoder_job_title: Fitted encoder exposing ``classes_``.
        file_path: Path prefix shared by both output files.
    """
    model_save_path = f"{file_path}_model.pth"
    data_save_path = f"{file_path}_data.pt"

    # Weights first, then the preprocessing bundle.
    torch.save(model.state_dict(), model_save_path)

    preprocessing_state = {
        'tokenizer': tokenizer,
        'scaler_mean': scaler.mean_,
        'scaler_scale': scaler.scale_,
        'label_encoder_classes': label_encoder_job_title.classes_,
    }
    torch.save(preprocessing_state, data_save_path)

    print(f"Model saved to {model_save_path}")
    print(f"Data saved to {data_save_path}")

def load_model_and_data(file_path, model_class, n_numerical_features, device):
    """Restore a model and its preprocessing state from a common path prefix.

    Reads ``<file_path>_model.pth`` (a ``state_dict``) and
    ``<file_path>_data.pt`` (a dict with ``scaler_mean``, ``scaler_scale`` and
    ``label_encoder_classes``). The tokenizer is rebuilt from
    ``bert-base-uncased`` rather than taken from the data file.

    Args:
        file_path: Path prefix shared by both files.
        model_class: Class instantiated as
            ``model_class(n_numerical_features=...)``.
        n_numerical_features: Forwarded to ``model_class``.
        device: Device the model and the scaler tensors are placed on.

    Returns:
        tuple: ``(model, tokenizer, scaler_mean, scaler_scale,
        label_encoder_job_title)``. The model is in eval mode, the two scaler
        values are float32 tensors on ``device``, and the encoder is a fresh
        ``LabelEncoder`` with ``classes_`` restored.
    """
    # Load the PyTorch model
    model = model_class(n_numerical_features=n_numerical_features)
    model_load_path = f"{file_path}_model.pth"
    # A state_dict contains only tensors, so the restricted loader is enough.
    state_dict = torch.load(model_load_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()

    # Load the scaler statistics and label-encoder classes.
    data_load_path = f"{file_path}_data.pt"
    # SECURITY: this bundle holds pickled Python objects (tokenizer, NumPy
    # arrays), which the weights_only=True default of recent PyTorch rejects.
    # Full unpickling can execute arbitrary code, so only load files produced
    # by this project.
    data = torch.load(data_load_path, map_location='cpu', weights_only=False)

    # Reconstruct tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Reconstruct scaler parameters
    scaler_mean = torch.tensor(data['scaler_mean'], device=device, dtype=torch.float32)
    scaler_scale = torch.tensor(data['scaler_scale'], device=device, dtype=torch.float32)

    # Reconstruct label encoder
    label_encoder_job_title = LabelEncoder()
    label_encoder_job_title.classes_ = data['label_encoder_classes']

    print(f"Model loaded from {model_load_path}")
    print(f"Data loaded from {data_load_path}")
    return model, tokenizer, scaler_mean, scaler_scale, label_encoder_job_title

def train_model(epochs=3, batch_size=16, learning_rate=2e-5):
    """Train the BERT regressor on the combined small + large training CSVs.

    Steps: load and concatenate the two CSVs, fill missing text fields, build
    one cleaned text per review plus numeric features from
    ``advanced_text_preprocessing``, label-encode ``job_title``,
    standard-scale the numeric features, hold out 20% for validation
    (``random_state=42``), then optimise ``BertRegressor`` with MSE loss and
    AdamW. After each epoch the validation MSE and R^2 are printed; whenever
    the MSE improves, the weights are written to ``best_bert_regressor.pth``
    and ``save_model_and_data`` is called with the Drive path prefix.

    Args:
        epochs: Number of passes over the training split.
        batch_size: Batch size for the training and validation loaders.
        learning_rate: Learning rate passed to AdamW.

    Returns:
        tuple: ``(model, tokenizer, scaler, label_encoder_job_title)``. The
        model carries the weights of the last epoch, which are not
        necessarily the best ones; those are in ``best_bert_regressor.pth``.
    """
    # Load and preprocess data
    print("Loading data...")
    small = pd.read_csv('/content/drive/MyDrive/dataset/424_F2024_Final_PC_small_train_v1.csv')
    large = pd.read_csv('/content/drive/MyDrive/dataset/424_F2024_Final_PC_large_train_v1.csv')
    data = pd.concat([small, large], ignore_index=True)

    # Fill missing values
    print("Filling missing values...")
    # Assign the filled column back instead of `data[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary Series and leaves the frame
    # unchanged once pandas Copy-on-Write is active.
    for column in ('headline', 'pros', 'cons'):
        data[column] = data[column].fillna('')
    data['job_title'] = data['job_title'].fillna('Unknown')

    # Combine text columns
    print("Combining text columns...")
    data['combined_text'] = data['headline'] + ' ' + data['pros'] + ' ' + data['cons']

    # Process text and extract features
    print("Extracting advanced features...")
    processed_texts = []
    feature_dicts = []

    for text in tqdm(data['combined_text']):
        processed_text, features = advanced_text_preprocessing(text)
        processed_texts.append(processed_text)
        feature_dicts.append(features)

    data['processed_text'] = processed_texts

    # Create feature columns
    for feature_name in feature_dicts[0].keys():
        data[f'{feature_name}'] = [d[feature_name] for d in feature_dicts]

    # Encode job titles
    print("Encoding job titles...")
    label_encoder_job_title = LabelEncoder()
    data['job_title_encoded'] = label_encoder_job_title.fit_transform(data['job_title'])

    # Prepare numerical features
    print("Preparing numerical features...")
    numerical_columns = [
        'text_length', 'word_count', 'sentiment_polarity', 'sentiment_subjectivity',
        'exclamation_count', 'question_count', 'avg_word_length', 'job_title_encoded'
    ]

    X_numerical = data[numerical_columns].values

    # Scale numerical features using StandardScaler
    print("Scaling numerical features...")
    scaler = StandardScaler()
    X_numerical = scaler.fit_transform(X_numerical)

    # Convert to PyTorch tensor on the active device
    X_numerical = torch.tensor(X_numerical, device=device, dtype=torch.float32)

    # Targets
    y = data['rating'].values
    y = torch.tensor(y, device=device, dtype=torch.float32)

    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Split data
    print("Splitting data...")
    X_train_texts, X_val_texts, X_train_numerical, X_val_numerical, y_train, y_val = train_test_split(
        data['processed_text'], X_numerical, y, test_size=0.2, random_state=42
    )

    # Create datasets
    train_dataset = JobDataset(
        texts=X_train_texts.values,
        numerical_features=X_train_numerical,
        targets=y_train,
        tokenizer=tokenizer
    )

    val_dataset = JobDataset(
        texts=X_val_texts.values,
        numerical_features=X_val_numerical,
        targets=y_val,
        tokenizer=tokenizer
    )

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Model
    model = BertRegressor(n_numerical_features=X_numerical.shape[1])
    model = model.to(device)

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    best_mse = float('inf')

    print("Starting training...")
    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for batch in tqdm(train_loader):
            optimizer.zero_grad()

            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_features = batch['numerical_features']
            # The model emits [batch, 1]; give the targets the same shape so
            # MSELoss does not broadcast them against each other.
            targets = batch['targets'].unsqueeze(1)

            outputs = model(input_ids, attention_mask, batch_features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        preds = []
        actuals = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                batch_features = batch['numerical_features']
                targets = batch['targets'].unsqueeze(1)

                outputs = model(input_ids, attention_mask, batch_features)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

                preds.append(outputs)
                actuals.append(targets)

        avg_val_loss = val_loss / len(val_loader)
        preds = torch.cat(preds, dim=0).cpu()
        actuals = torch.cat(actuals, dim=0).cpu()

        mse = nn.functional.mse_loss(preds, actuals).item()
        # R^2 = 1 - SS_res / SS_tot. With MSE = SS_res / N the denominator has
        # to be the population variance (divide by N), not the default
        # unbiased estimate (divide by N - 1).
        target_variance = torch.var(actuals, unbiased=False).item()
        r2 = 1 - mse / target_variance if target_variance > 0 else float('nan')

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Training Loss: {avg_train_loss:.4f}")
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(f"Validation MSE: {mse:.4f}")
        print(f"Validation R^2 Score: {r2:.4f}")

        # Save the best model
        if mse < best_mse:
            best_mse = mse
            torch.save(model.state_dict(), 'best_bert_regressor.pth')
            save_model_and_data(model, tokenizer, scaler, label_encoder_job_title, '/content/drive/MyDrive/dataset/bert_regressor')

    print("Training complete.")
    return model, tokenizer, scaler, label_encoder_job_title

def predict_new_data(model, tokenizer, scaler_mean, scaler_scale, label_encoder_job_title):
    """Score the unlabeled test CSV and write the predictions to Drive.

    The test rows go through the same text cleaning and feature extraction as
    training. Job titles are mapped through the fitted encoder's ``classes_``;
    titles not present there get the index ``len(classes_)``. Numeric features
    are standardised as ``(x - scaler_mean) / scaler_scale`` before batched
    inference.

    Args:
        model: Trained regressor called as
            ``model(input_ids, attention_mask, numerical_features)``.
        tokenizer: Tokenizer handed to ``JobDataset``.
        scaler_mean: Tensor of per-feature means (moved to ``device``).
        scaler_scale: Tensor of per-feature scales (moved to ``device``).
        label_encoder_job_title: Fitted encoder exposing ``classes_``.

    Returns:
        None. Predictions are written as a single ``prediction`` column to
        ``predicted_ratings_bert_regressor.csv`` in the Drive dataset folder.
    """
    print("Processing new test data...")
    new_test_data = pd.read_csv('/content/drive/MyDrive/dataset/424_F2024_Final_PC_test_without_response_v1.csv')

    # Apply same preprocessing to test data.
    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary Series and leaves the frame
    # unchanged once pandas Copy-on-Write is active.
    for column in ('headline', 'pros', 'cons'):
        new_test_data[column] = new_test_data[column].fillna('')
    new_test_data['job_title'] = new_test_data['job_title'].fillna('Unknown')

    # Combine text columns
    new_test_data['combined_text'] = (
        new_test_data['headline'] + ' ' +
        new_test_data['pros'] + ' ' +
        new_test_data['cons']
    )

    # Process text and extract features
    print("Extracting advanced features from test data...")
    processed_texts = []
    feature_dicts = []

    for text in tqdm(new_test_data['combined_text']):
        processed_text, features = advanced_text_preprocessing(text)
        processed_texts.append(processed_text)
        feature_dicts.append(features)

    new_test_data['processed_text'] = processed_texts

    # Create feature columns
    for feature_name in feature_dicts[0].keys():
        new_test_data[f'{feature_name}'] = [d[feature_name] for d in feature_dicts]

    # Encode job titles against the classes seen during training
    print("Encoding job titles...")
    job_title_mapping = {title: idx for idx, title in enumerate(label_encoder_job_title.classes_)}
    # Titles that never occurred in training share one index past the known range.
    unknown_label = len(job_title_mapping)
    encoded_job_titles = [
        job_title_mapping.get(job_title, unknown_label)
        for job_title in new_test_data['job_title']
    ]
    encoded_job_titles = torch.tensor(encoded_job_titles, device=device, dtype=torch.float32).unsqueeze(1)

    # Text-derived features, in the column order used when the scaler was fit;
    # the encoded job title is appended as the last column.
    feature_columns = [
        'text_length', 'word_count', 'sentiment_polarity', 'sentiment_subjectivity',
        'exclamation_count', 'question_count', 'avg_word_length'
    ]
    text_features = torch.tensor(
        new_test_data[feature_columns].values, device=device, dtype=torch.float32
    )
    X_numerical = torch.cat([text_features, encoded_job_titles], dim=1)

    # Manually scale numerical features using PyTorch
    scaler_mean = scaler_mean.to(device)
    scaler_scale = scaler_scale.to(device)
    X_numerical = (X_numerical - scaler_mean) / scaler_scale

    # Create dataset
    test_dataset = JobDataset(
        texts=new_test_data['processed_text'].values,
        numerical_features=X_numerical,
        tokenizer=tokenizer
    )

    # Data loader
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Prediction
    print("Predicting on new test data...")
    model.eval()
    preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            outputs = model(
                batch['input_ids'],
                batch['attention_mask'],
                batch['numerical_features'],
            )
            preds.append(outputs)

    # Concatenate all predictions
    preds = torch.cat(preds, dim=0)

    # Move predictions to CPU and convert to NumPy for saving
    preds_np = preds.cpu().numpy()

    # Save predictions
    output = pd.DataFrame(preds_np, columns=['prediction'])
    output.to_csv('/content/drive/MyDrive/dataset/predicted_ratings_bert_regressor.csv', index=False)

    print("Predictions saved to 'predicted_ratings_bert_regressor.csv'")



Using device: cuda


In [None]:

# train_model() returns exactly four objects (model, tokenizer, scaler, label_encoder_job_title);
# unpacking five names would raise ValueError only after the whole training run has finished.
model, tokenizer, scaler, label_encoder_job_title = train_model()




Loading data...
Filling missing values...
Combining text columns...
Extracting advanced features...


100%|██████████| 600000/600000 [04:24<00:00, 2271.93it/s]


Encoding job titles...
Preparing numerical features...
Scaling numerical features...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Splitting data...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Starting training...


100%|██████████| 30000/30000 [49:39<00:00, 10.07it/s]


Epoch 1/3
Training Loss: 0.5633
Validation Loss: 0.5013
Validation MSE: 0.5013
Validation R^2 Score: 0.5920


100%|██████████| 30000/30000 [49:48<00:00, 10.04it/s]


Epoch 2/3
Training Loss: 0.4847
Validation Loss: 0.5043
Validation MSE: 0.5043
Validation R^2 Score: 0.5895


100%|██████████| 30000/30000 [49:49<00:00, 10.04it/s]


Epoch 3/3
Training Loss: 0.4311
Validation Loss: 0.5048
Validation MSE: 0.5048
Validation R^2 Score: 0.5891
Training complete.


In [None]:
# Load the best model (lowest validation MSE) written during training.
# map_location keeps the reload working when the checkpoint was saved on a
# different device than the one this runtime provides.
model.load_state_dict(torch.load('best_bert_regressor.pth', map_location=device))

<All keys matched successfully>

In [None]:
# Evaluate model on full data
# NOTE(review): `evaluate_model` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. Presumably the
# definition lived in a since-deleted cell; restore it (and confirm its
# expected arguments) so Restart & Run All works.
evaluate_model(model, tokenizer, numerical_features, scaler, label_encoder_job_title)

Loading data...
Filling missing values...
Combining text columns...
Extracting advanced features...


100%|██████████| 600000/600000 [04:25<00:00, 2260.93it/s]


Encoding job titles...
Preparing numerical features...
Scaling numerical features...


100%|██████████| 37500/37500 [27:30<00:00, 22.72it/s]


BERT Regressor Model Performance on Full Data:
Mean Squared Error: 0.38092825
R^2 Score: 0.6905105305235987


In [None]:
# Export the fitted scaler's statistics as float32 tensors on the active device.
scaler_mean, scaler_scale = (
    torch.tensor(statistic, device=device, dtype=torch.float32)
    for statistic in (scaler.mean_, scaler.scale_)
)

# Score the held-out test file with the trained model.
predict_new_data(
    model=model,
    tokenizer=tokenizer,
    scaler_mean=scaler_mean,
    scaler_scale=scaler_scale,
    label_encoder_job_title=label_encoder_job_title,
)

Processing new test data...
Extracting advanced features from test data...


100%|██████████| 100000/100000 [01:07<00:00, 1471.47it/s]


Encoding job titles...
Predicting on new test data...


100%|██████████| 6250/6250 [05:43<00:00, 18.20it/s]


Predictions saved to 'predicted_ratings_bert_regressor.csv'


In [None]:
import torch
from joblib import dump

def save_model_and_data(model, tokenizer, scaler, label_encoder_job_title, file_path):
    """Save the model weights and the preprocessing objects under one prefix.

    This definition replaces the earlier ``save_model_and_data`` in the
    notebook. Three files are written:

    * ``<file_path>_model.pth``: the model's ``state_dict`` (``torch.save``).
    * ``<file_path>_data.joblib``: the tokenizer, scaler and label encoder
      objects (``joblib.dump``).
    * ``<file_path>_data.pt``: the dict layout ``load_model_and_data`` reads
      (``scaler_mean``, ``scaler_scale``, ``label_encoder_classes``, plus the
      tokenizer). Without it, a checkpoint saved by this function could not
      be restored by the loader defined in this notebook.

    Args:
        model: Module whose ``state_dict()`` is saved.
        tokenizer: Tokenizer object stored as-is.
        scaler: Fitted scaler exposing ``mean_`` and ``scale_``.
        label_encoder_job_title: Fitted encoder exposing ``classes_``.
        file_path: Path prefix shared by all output files.
    """
    # Save the PyTorch model
    model_save_path = f"{file_path}_model.pth"
    torch.save(model.state_dict(), model_save_path)

    # Save the tokenizer, scaler, and label encoder using joblib
    data_save_path = f"{file_path}_data.joblib"
    dump({
        'tokenizer': tokenizer,
        'scaler': scaler,
        'label_encoder_job_title': label_encoder_job_title
    }, data_save_path)

    # Also write the bundle that load_model_and_data() expects, so the save
    # and load helpers stay compatible.
    bundle_save_path = f"{file_path}_data.pt"
    torch.save({
        'tokenizer': tokenizer,
        'scaler_mean': scaler.mean_,
        'scaler_scale': scaler.scale_,
        'label_encoder_classes': label_encoder_job_title.classes_
    }, bundle_save_path)

    print(f"Model saved to {model_save_path}")
    print(f"Data saved to {data_save_path}")
    print(f"Data saved to {bundle_save_path}")



# Persist the final model and preprocessing objects to Drive.
save_model_and_data(
    model=model,
    tokenizer=tokenizer,
    scaler=scaler,
    label_encoder_job_title=label_encoder_job_title,
    file_path="/content/drive/MyDrive/dataset/bert_regressor",
)


Model saved to /content/drive/MyDrive/dataset/bert_regressor_model.pth
Data saved to /content/drive/MyDrive/dataset/bert_regressor_data.joblib


In [None]:
# Final cell of the run. NOTE(review): presumably this disconnects and releases
# the Colab runtime so it stops consuming compute once the artifacts are on
# Drive; confirm against the google.colab documentation. Anything not saved
# before this point would be lost with the runtime.
from google.colab import runtime
runtime.unassign()
