In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import (BertTokenizer, BertForSequenceClassification,
                          AutoTokenizer, AutoModelForSequenceClassification,
                          XLMRobertaTokenizer, XLMRobertaForSequenceClassification,
                          Trainer, TrainingArguments)
from torch.utils.data import Dataset
import torch
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
# Function to preprocess text
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.strip()  # Remove leading and trailing spaces
    text = text.lower()  # Convert to lowercase

    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # Apply lemmatization
    return ' '.join(words)
stop_words = set(stopwords.words('english'))
# Load training and test data
train_data = pd.read_excel('train_data.xlsx')
test_data = pd.read_excel('test_data.xlsx')

# Apply preprocessing to training and test data
train_data['cleaned_text'] = train_data['text'].apply(preprocess_text)
test_data['cleaned_text'] = test_data['text'].apply(preprocess_text)
validation_size = 0.1

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_data['cleaned_text'], train_data['target_col'],
    test_size=validation_size,
    random_state=42
)

# Prepare text and labels for train, validation, and test
train_texts = X_train.tolist()
train_labels = y_train.tolist()
val_texts = X_val.tolist()
val_labels = y_val.tolist()
test_texts = test_data['cleaned_text'].tolist()
test_labels = test_data['target_col'].tolist()

# Convert string labels to numerical labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Define tokenizers and models
tokenizer_dict = {
    'bert': BertTokenizer.from_pretrained('bert-base-uncased'),
    'scibert': AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased'),
   'xlm-roberta': XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
}

models = {
    'bert': (BertForSequenceClassification, 'bert-base-uncased'),
    'scibert': (AutoModelForSequenceClassification, 'allenai/scibert_scivocab_uncased'),
   'xlm-roberta': (XLMRobertaForSequenceClassification, 'xlm-roberta-base')
}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [3]:
def tokenize_function(texts, tokenizer):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)

# Dataset class for handling tokenized data
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [4]:
def evaluate_model(model_class, tokenizer, test_texts, test_labels, model_dir):
    # Load the model
    model = model_class.from_pretrained(model_dir)

    # Tokenize test texts
    test_encodings = tokenize_function(test_texts, tokenizer)

    # Prepare test dataset
    test_dataset = TextDataset(test_encodings, test_labels)

    # Create Trainer instance for evaluation
    training_args = TrainingArguments(
        output_dir=model_dir,
        per_device_eval_batch_size=16,
        logging_dir='./logs',
        gradient_checkpointing=True,
        fp16=True,
        evaluation_strategy="no",  # No further evaluation during prediction
    )

    trainer = Trainer(
        model=model,
        args=training_args
    )

    # Predict on test set using the trained model
    predictions = trainer.predict(test_dataset)
    y_test_pred = np.argmax(predictions.predictions, axis=1)

    # Calculate test F1 score and return
    test_f1 = f1_score(test_labels, y_test_pred, average='weighted')
    return test_f1, y_test_pred

In [5]:
results = []

for model_name, (model_class, pretrained_model) in models.items():
    print(f"Training and evaluating {model_name}...")

    # Tokenize data for training set
    train_encodings = tokenize_function(train_texts, tokenizer_dict[model_name])
    val_encodings = tokenize_function(val_texts, tokenizer_dict[model_name])

    # Create dataset for training
    train_dataset = TextDataset(train_encodings, train_labels)

    val_dataset = TextDataset(val_encodings, val_labels)
    # Initialize the model with pretrained weights
    model = model_class.from_pretrained(pretrained_model, num_labels=len(label_encoder.classes_))

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_{model_name}',
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs_{model_name}',
        logging_steps=50,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        fp16=True,
        learning_rate=2e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
    )

    # Create Trainer instance for training
    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

    # Train the model
    trainer.train()

    # List all checkpoint directories
    checkpoint_dirs = [d for d in os.listdir(f'./results_{model_name}') if d.startswith('checkpoint-')]
    best_f1 = 0
    best_checkpoint = None
    best_report = ""

    best_predictions = None

    # Evaluate each checkpoint
    for checkpoint_dir in checkpoint_dirs:
        checkpoint_path = os.path.join(f'./results_{model_name}', checkpoint_dir)
        if not os.path.isdir(checkpoint_path):
            continue

        try:
            test_f1, y_test_pred = evaluate_model(model_class, tokenizer_dict[model_name], test_texts, test_labels, checkpoint_path)

            # Track the best performing checkpoint
            if test_f1 > best_f1:
                best_f1 = test_f1
                best_checkpoint = checkpoint_path
                best_report = classification_report(test_labels, y_test_pred)
                best_predictions = y_test_pred
        except Exception as e:
            print(f"Error evaluating checkpoint {checkpoint_path}: {e}")

    # Store results
    results.append({
        'Model': model_name,
        'Best Checkpoint': best_checkpoint,
        'Test F1': best_f1,
        'Classification Report': best_report
    })
    test_data['Predicted Labels'] = label_encoder.inverse_transform(best_predictions)
    test_data['Actual Labels'] = label_encoder.inverse_transform(test_labels)
    test_data[['datasheet_link', 'Predicted Labels', 'Actual Labels']].to_excel(f'predicted_{model_name}.xlsx', index=False)



Training and evaluating bert...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.274405
2,1.338700,1.143726
3,1.338700,0.948968
4,1.133900,0.754839
5,0.790400,0.512946








Training and evaluating scibert...


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.301696
2,1.347100,1.074514
3,1.347100,0.755692
4,1.020300,0.374833
5,0.460800,0.196873








Training and evaluating xlm-roberta...


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.451225
2,1.420300,1.182503
3,1.420300,0.912475
4,1.157100,0.652614
5,0.839000,0.465145




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




In [6]:
# Display results as a DataFrame
results_df = pd.DataFrame(results)
print("\nComparison of Models:")
results_df
results_df.to_excel('LLM_comparison_results.xlsx')


Comparison of Models:


In [7]:
def classify_text(text, tokenizer, model):
    # Preprocess and tokenize the input text
    encodings = tokenize_function([text], tokenizer)

    # Create a dataset for the input text
    input_dataset = TextDataset(encodings, [0])  # Dummy label [0] because we only need the input for prediction

    # Create a Trainer instance for prediction
    training_args = TrainingArguments(
        per_device_eval_batch_size=1,  # Single batch size for prediction
        output_dir='./results',
        evaluation_strategy="no",
        do_train=False,
        do_eval=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args
    )

    # Predict the label
    predictions = trainer.predict(input_dataset)
    predicted_label_idx = np.argmax(predictions.predictions, axis=1)[0]

    # Convert numerical label back to string label
    predicted_label = label_encoder.inverse_transform([predicted_label_idx])[0]

    return predicted_label


In [8]:
def main():
    url = input("Enter the URL to classify: ")

    # Extract text from URL
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_text = soup.get_text()

    # Use the best model and tokenizer
    best_model_name = results_df.loc[results_df['Test F1'].idxmax(), 'Model']
    best_tokenizer = tokenizer_dict[best_model_name]
    best_model = models[best_model_name][0].from_pretrained(results_df.loc[results_df['Test F1'].idxmax(), 'Best Checkpoint'])

    # Classify the extracted text
    predicted_label = classify_text(page_text, best_tokenizer, best_model)
    print(f"Predicted Label: {predicted_label}")


main()




Enter the URL to classify: https://kenall.com/Kenall-Files/Product-Files/SpecificationSheets/MPH.pdf




Predicted Label: lighting


In [9]:
main()

Enter the URL to classify: https://lumenart.com/images/alume/awl-01_specs.pdf




Predicted Label: cable


In [10]:
main()

Enter the URL to classify: https://www.alphawire.com/disteAPI/SpecPDF/DownloadProductSpecPdf?productPartNumber=9438




Predicted Label: cable


In [11]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [17]:
# Specify the path to the Google Drive directory
import shutil
drive_directory = '/content/drive/MyDrive/'

# Move the zip file to Google Drive
shutil.move('results_xlm-roberta', drive_directory + 'results_xlm-roberta.zip')


'/content/drive/MyDrive/results_xlm-roberta.zip'