**MODEL BUILDING**

In [None]:
!pip install transformers xgboost



In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch

# Load the dataset and draw a reproducible sample of at most 5000 rows.
df = pd.read_csv('/content/Final_dataset.csv')
# Cap n at len(df): DataFrame.sample raises ValueError when asked for more
# rows than exist (sampling without replacement).
df_sample = df.sample(n=min(5000, len(df)), random_state=42).reset_index(drop=True)

# Combine columns and define target.
# Missing text is replaced with "" first; otherwise a NaN in either column
# makes the whole concatenation NaN, which the tokenizer cannot process.
df_sample['Combined_Text'] = (
    df_sample['Job Description'].fillna('').astype(str)
    + " "
    + df_sample['Resume'].fillna('').astype(str)
)
y = df_sample['Resume Score']

In [None]:
# Load the pretrained tokenizer first, then the encoder, from the same checkpoint.
tokenizer, model = (
    loader.from_pretrained('bert-base-uncased')
    for loader in (BertTokenizer, BertModel)
)

def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), run through the module-level ``model``, and the last
    hidden state is averaged over the token dimension.

    Args:
        text: Input handed to ``tokenizer``.

    Returns:
        A flattened NumPy array holding the mean of the last hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disabling autograd avoids building (and holding memory
    # for) a computation graph on every call.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# Embed every combined text; each sample becomes one row of the resulting array.
bert_embeddings = np.array(list(map(get_bert_embeddings, df_sample['Combined_Text'])))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    bert_embeddings,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Gradient Boosting: build and train in one step (fit returns the estimator itself).
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# XGBoost: build, then train on the same training split.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')
xgb_model.fit(X_train, y_train)

In [None]:
# Score the held-out set with each fitted model.
gb_pred, xgb_pred = (
    estimator.predict(X_test) for estimator in (gb_model, xgb_model)
)

# Ensemble: equal-weight average of the two models' predictions.
ensemble_pred = 0.5 * (gb_pred + xgb_pred)

# Evaluate the models.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises TypeError. Taking the square root works on every version.
rmse_gb = np.sqrt(mean_squared_error(y_test, gb_pred))
r2_gb = r2_score(y_test, gb_pred)

rmse_xgb = np.sqrt(mean_squared_error(y_test, xgb_pred))
r2_xgb = r2_score(y_test, xgb_pred)

rmse_ensemble = np.sqrt(mean_squared_error(y_test, ensemble_pred))
r2_ensemble = r2_score(y_test, ensemble_pred)

# Report RMSE and R^2 for each model, in the order GB, XGBoost, ensemble.
for metric_label, metric_value in (
    ("Gradient Boosting RMSE:", rmse_gb),
    ("Gradient Boosting R^2:", r2_gb),
    ("XGBoost RMSE:", rmse_xgb),
    ("XGBoost R^2:", r2_xgb),
    ("Ensemble Model RMSE:", rmse_ensemble),
    ("Ensemble Model R^2:", r2_ensemble),
):
    print(metric_label, metric_value)

Gradient Boosting RMSE: 10.160649251811193
Gradient Boosting R^2: 0.4205999681229088
XGBoost RMSE: 10.767834822615253
XGBoost R^2: 0.3492826819419861
Ensemble Model RMSE: 10.237495817089096
Ensemble Model R^2: 0.4118026411739677


