<a href="https://colab.research.google.com/github/BriniMohamedAyechi/Resumes_Classification_BERT/blob/main/Resumes_Classification_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install -U scikit-learn
!pip install PyPDF2


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.2 MB/s[0m eta [36m0:00:0

In [None]:
import torch

# Ask PyTorch to clear its GPU cache before the model is loaded and trained
# (presumably to start from a clean slate when the notebook is re-run in a
# session that already used the GPU).
torch.cuda.empty_cache()


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch.nn.functional as F

# Data Preprocessing
# Load the labelled PDF dataset into a DataFrame. The rest of this cell reads
# the label from the 'is_valid' column; adjust the column names here if your
# CSV uses different ones.
df = pd.read_csv('/content/pdf_full_DataSet.csv')

# Map label values to integers (1 = 'Valid', 0 = 'Invalid')
label_map = {'Valid': 1, 'Invalid': 0}
raw_labels = df['is_valid']
df['is_valid'] = raw_labels.map(label_map)

# `Series.map` turns every value that is not a key of `label_map` into NaN.
# Report those rows instead of letting them pass unnoticed; the data itself is
# left untouched so downstream cells see exactly what they saw before.
unmapped_mask = df['is_valid'].isna()
if unmapped_mask.any():
    print(
        f"Warning: {int(unmapped_mask.sum())} of {len(df)} rows have a label outside "
        f"{sorted(label_map)}; examples: {raw_labels[unmapped_mask].unique()[:5].tolist()}"
    )


# Load Pretrained BERT Model
# Load a pretrained BERT model (with a fresh 2-way classification head) and its tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenization
# Tokenize the PDF content and create input IDs, attention masks, and token type IDs
class CustomDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item for sequence classification.

    Every item is a dict with the keys 'input_ids', 'attention_mask' and
    'token_type_ids' (the tokenizer's tensors flattened to 1-D) and 'label'
    (a float one-hot vector over the two classes).

    Args:
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        df: DataFrame addressed by position; the text is read from the
            'Content' column and the integer class id from 'is_valid'.
        max_length: length every example is padded / truncated to.
    """

    def __init__(self, tokenizer, df, max_length=512):
        self.tokenizer = tokenizer
        self.data = df
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def _cell(self, column, idx, default):
        """Return the value at positional row `idx` of `column`, or `default`
        when the column does not exist or the cell is null."""
        if column not in self.data.columns:
            return default
        value = self.data[column].iloc[idx]
        return default if pd.isnull(value) else value

    def __getitem__(self, idx):
        """Tokenize row `idx` and return its tensors together with the label.

        Raises:
            IndexError: if `idx` is past the end of the DataFrame.
        """
        if idx >= len(self.data):
            raise IndexError(f"Index out of range: {idx}")

        # Missing column or null cell -> empty text. (Previously a missing
        # column crashed on `.iloc` and a null cell was tokenized as "nan".)
        content = str(self._cell('Content', idx, ''))

        # Missing column or null label falls back to class 0, as before.
        label = int(self._cell('is_valid', idx, 0))

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
        # and `truncation=True` makes explicit the truncation the tokenizer was
        # already applying with a warning.
        encoding = self.tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns a batch of one; drop that leading dimension.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        token_type_ids = encoding['token_type_ids'].flatten()

        # Convert the label to one-hot encoding.
        # NOTE(review): float one-hot labels presumably make the Hugging Face
        # classification head pick its multi-label (BCE) loss rather than
        # cross-entropy - confirm this is intended. The format is kept because
        # the training loop calls `.argmax(dim=1)` on it.
        label = F.one_hot(torch.tensor(label), num_classes=2).float()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'label': label
        }


# Data Encoding
# Two successive 80/20 splits with the same seed: the first holds out the test
# set, the second carves the validation set out of the remaining rows.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.2)

# Wrap each split in the tokenizing dataset
train_dataset, val_dataset, test_dataset = (
    CustomDataset(tokenizer, split) for split in (train_df, val_df, test_df)
)

# Create data loaders; only the training loader shuffles its rows
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=16, shuffle=False)
    for dataset in (val_dataset, test_dataset)
)


#Fine-Tuning
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# PyTorch's own AdamW is used in place of the deprecated `transformers.AdamW`.
# `eps` and `weight_decay` are spelled out with the values the transformers
# class defaulted to, so the optimisation hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)


def _collect_predictions(model, loader, device):
    """Run `model` over every batch of `loader` without tracking gradients.

    Args:
        model: classifier whose output exposes `.logits`.
        loader: iterable of batches with 'input_ids', 'attention_mask',
            'token_type_ids' and a one-hot 'label' tensor.
        device: device the input tensors are moved to before the forward pass.

    Returns:
        (true_classes, predicted_classes): two equally long lists of class ids.
    """
    true_classes, predicted_classes = [], []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device),
            )
            predicted_classes.extend(outputs.logits.argmax(dim=1).cpu().tolist())
            # Labels are one-hot; argmax recovers the integer class id.
            true_classes.extend(batch['label'].argmax(dim=1).tolist())
    return true_classes, predicted_classes


num_epochs = 6
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Step 8: Validation and Testing
    # Validation
    val_labels, val_preds = _collect_predictions(model, val_loader, device)
    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}')

    # Testing
    # `test_labels` / `test_preds` from the last epoch are reused after the
    # loop for the classification report.
    # NOTE(review): the test score is printed every epoch for monitoring only;
    # choosing an epoch based on it would leak the test set.
    test_labels, test_preds = _collect_predictions(model, test_loader, device)
    test_accuracy = accuracy_score(test_labels, test_preds)
    print(f'Test Accuracy: {test_accuracy:.4f}')



# Step 9: Evaluate Results
# Per-class precision / recall / F1 for the test predictions left behind by
# the final epoch of the training loop.
print(classification_report(test_labels, test_preds))

# Persist the fine-tuned weights and the tokenizer files in one directory so
# both can be reloaded together later.
output_dir = 'fine_tuned_bert_model'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
# Kept as the cell's last expression so the list of written files is displayed.
tokenizer.save_pretrained(output_dir)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/6, Validation Accuracy: 0.7875




Test Accuracy: 0.8123




Epoch 2/6, Validation Accuracy: 0.8425




Test Accuracy: 0.8270




Epoch 3/6, Validation Accuracy: 0.8571




Test Accuracy: 0.8739




Epoch 4/6, Validation Accuracy: 0.8608




Test Accuracy: 0.8768




Epoch 5/6, Validation Accuracy: 0.8498




Test Accuracy: 0.8710




Epoch 6/6, Validation Accuracy: 0.8498




Test Accuracy: 0.8622
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       156
           1       0.91      0.83      0.87       185

    accuracy                           0.86       341
   macro avg       0.86      0.87      0.86       341
weighted avg       0.87      0.86      0.86       341



('fine_tuned_bert_model/tokenizer_config.json',
 'fine_tuned_bert_model/special_tokens_map.json',
 'fine_tuned_bert_model/vocab.txt',
 'fine_tuned_bert_model/added_tokens.json')

In [None]:
# Sanity check: number of rows and the index of the loaded dataset
print(len(df), df.index, sep='\n')


1702
RangeIndex(start=0, stop=1702, step=1)


In [None]:
# Show the column names; the dataset class reads 'Content' and 'is_valid' from this frame.
print(df.columns)


Index(['PDF File', 'is_valid', 'Content'], dtype='object')


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from PyPDF2 import PdfReader

# Reload the classifier and its tokenizer from the local directory named below
model_name = 'fine_tuned_bert_model'
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

def read_pdf_content(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: location of the PDF, passed unchanged to ``PdfReader``.

    Returns:
        str: the page texts concatenated in page order with no separator.
        A page that yields no text contributes an empty string.
    """
    pdf_reader = PdfReader(file_path)
    # `or ""` keeps a page whose extraction returns None from raising a
    # TypeError during concatenation; `join` builds the result in one pass
    # instead of re-creating the string for every page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def evaluate_pdf(model, tokenizer, pdf_content):
    """Classify the text of one document and return the predicted class id.

    Args:
        model: classifier module; it is called with the input ids and an
            attention mask and must return an object exposing `.logits`.
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        pdf_content: text to classify; anything past 512 tokens is truncated.

    Returns:
        int: index of the largest logit. With this notebook's label map,
        1 means 'Valid' and 0 means 'Invalid'.

    Side effects:
        Leaves `model` in evaluation mode.
    """
    inputs = tokenizer.encode_plus(
        pdf_content,
        add_special_tokens=True,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512
    )

    # Move the inputs to wherever the model's weights live, so the function
    # also works with a model that was sent to the GPU. A module without
    # parameters falls back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = inputs['input_ids'].to(target_device)
    attention_mask = inputs['attention_mask'].to(target_device)

    # Ensure the model is in evaluation mode (disables dropout)
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    return predicted_class

# PDFs to classify, one path per line
pdf_file_paths = [
    '/content/T1.pdf',
    '/content/T2.pdf',
    '/content/T3.pdf',
    '/content/F1.pdf',
    '/content/F2.pdf',
    '/content/F3.pdf',
    '/content/F4.pdf',
    '/content/T4.pdf',
]

# For each file: extract its text, classify it and print a readable verdict
for pdf_file_path in pdf_file_paths:
    pdf_content = read_pdf_content(pdf_file_path)
    predicted_class = evaluate_pdf(model, tokenizer, pdf_content)
    print(f"Predicted class for '{pdf_file_path}': {'Valid' if predicted_class == 1 else 'Invalid'}")


Predicted class for '/content/T1.pdf': Valid
Predicted class for '/content/T2.pdf': Valid
Predicted class for '/content/T3.pdf': Valid
Predicted class for '/content/F1.pdf': Valid
Predicted class for '/content/F2.pdf': Invalid
Predicted class for '/content/F3.pdf': Invalid
Predicted class for '/content/F4.pdf': Invalid
Predicted class for '/content/T4.pdf': Invalid
