### MODEL BUILDING

In [3]:
!pip install transformers xgboost



In [7]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch

# Load and sample dataset
df = pd.read_csv('/content/sample_file.csv')
df_sample = df.sample(n=5000, random_state=42).reset_index(drop=True)

# Combine columns and define target
df_sample['Combined_Text'] = df_sample['Job Description '] + " " + df_sample['Resume']
y = df_sample['Resume Score']

In [9]:
# Initialize DistilBERT model and tokenizer
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
import torch

# Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Set device to CPU
device = torch.device('cpu')
model = model.to(device)

# Define function to get DistilBERT embeddings for a batch of texts
def get_distilbert_embeddings_batch(texts, max_length=256):
    inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=max_length).to(device)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()  # Mean pooling for embeddings

# Process texts in batches (batch size of 8)
batch_size = 8
bert_embeddings = np.vstack([get_distilbert_embeddings_batch(df_sample['Combined_Text'][i:i+batch_size].tolist())
                             for i in range(0, len(df_sample), batch_size)])

print("Generated embeddings shape:", bert_embeddings.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Generated embeddings shape: (5000, 768)


In [10]:
# Import necessary libraries for modeling
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Define the target variable
y = df_sample['Resume Score']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, y, test_size=0.2, random_state=42)

# Initialize Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Make predictions and evaluate Gradient Boosting model
gb_pred = gb_model.predict(X_test)
gb_rmse = mean_squared_error(y_test, gb_pred, squared=False)
gb_r2 = r2_score(y_test, gb_pred)
print("Gradient Boosting Model Performance:")
print(f"RMSE: {gb_rmse:.2f}")
print(f"R^2 Score: {gb_r2:.2f}")

# Initialize XGBoost Regressor and define parameter grid for tuning
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_iter=10,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)
best_xgb_model = random_search.best_estimator_

# Make predictions and evaluate the best XGBoost model
xgb_pred = best_xgb_model.predict(X_test)
xgb_rmse = mean_squared_error(y_test, xgb_pred, squared=False)
xgb_r2 = r2_score(y_test, xgb_pred)
print("\nXGBoost Model Performance (with tuning):")
print(f"RMSE: {xgb_rmse:.2f}")
print(f"R^2 Score: {xgb_r2:.2f}")

Gradient Boosting Model Performance:
RMSE: 10.15
R^2 Score: 0.42





XGBoost Model Performance (with tuning):
RMSE: 10.06
R^2 Score: 0.43


