In [6]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [7]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

In [None]:
# Getting Valid Data
valid_data = pd.read_csv('/content/valid_tenders.txt', sep='\r', header=None)
valid_data.columns = ['description']
valid_data['label'] = 1
# Getting Non Valid Data
not_valid_data = pd.read_csv('/content/not_valid_tenders.txt', sep='\r', header=None)
not_valid_data.columns = ['description']
not_valid_data['label'] = 0
# Concatenate and Shuffle
df = pd.concat([valid_data, not_valid_data]).sample(frac=1).reset_index(drop=True)

In [None]:
df

Unnamed: 0,description,label
0,"Location de l’épicerie, de la buvette, du cent...",0
1,Report de l'appel d'offre relatif aux travaux ...,0
2,Acquisition et installation d'une plateforme w...,1
3,Acquisition de fournitures de bureau pour l'an...,0
4,Désignation d'un réviseur des comptes juridiques,0
...,...,...
4538,désignation d'un commissaire aux comptes de la...,0
4539,Etude géotechnique et contrôle de la qualité d...,0
4540,"Fourniture, installation, mise en service et f...",0
4541,Acquisition des équipements informatiques,1


In [None]:
# Set the device (GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Dataset
descriptions = df["description"].astype(str).tolist() # List of bid offer descriptions
labels = df["label"].tolist()  # List of corresponding labels (0 or 1)

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    descriptions, labels, test_size=0.2, random_state=42
)

# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Convert the tokenized data into PyTorch DataLoader
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels)
)
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels)
)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

lr = 1e-5
# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}: Accuracy = {accuracy:.2f}%")

Epoch 1/5: Accuracy = 91.42%
Epoch 2/5: Accuracy = 88.78%
Epoch 3/5: Accuracy = 93.73%
Epoch 4/5: Accuracy = 95.05%
Epoch 5/5: Accuracy = 94.06%


In [None]:
# Save the trained model
model.save_pretrained("bert_classifier_model")
tokenizer.save_pretrained("bert_classifier_model")

('bert_classifier_model/tokenizer_config.json',
 'bert_classifier_model/special_tokens_map.json',
 'bert_classifier_model/vocab.txt',
 'bert_classifier_model/added_tokens.json')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Saving The Model in drive
import shutil
source = '/content/bert_classifier_model/'  # Replace with the path to the source directory
destination = '/content/drive/MyDrive/bert_classifier_model'  # Replace with the path to the destination directory
shutil.copytree(destination, source)

'/content/bert_classifier_model/'

### Testing The Model

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained("bert_classifier_model")
model = BertForSequenceClassification.from_pretrained("bert_classifier_model")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [10]:
# Prepare the input data

example_texts = [
    "materiel informatique", # 1
    "Building  Web Site", # 1
    "Maintenance des équipements informatiques de vente des billets de banlieues (Tunis et Sahel)", # 1
    "Travaux de badigeonnage du siège social et l’annexe de l’OTC ", # 0
    "Acquisition de fourniture de bureau et consommables informatiques CPSCL 2023"
]

# Tokenize the input data
encoded_inputs = tokenizer(example_texts, truncation=True, padding=True, return_tensors='pt')
input_ids = encoded_inputs['input_ids'].to(device)
attention_mask = encoded_inputs['attention_mask'].to(device)

# Inference
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1)

# Get the predicted labels
predicted_labels = predicted_labels.detach().cpu().tolist()

# Print the results
for text, label in zip(example_texts, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted Label: {label}")
    print()

Text: materiel informatique
Predicted Label: 1

Text: Building  Web Site
Predicted Label: 1

Text: Maintenance des équipements informatiques de vente des billets de banlieues (Tunis et Sahel)
Predicted Label: 1

Text: Travaux de badigeonnage du siège social et l’annexe de l’OTC 
Predicted Label: 0

Text: Acquisition de fourniture de bureau et consommables informatiques CPSCL 2023
Predicted Label: 0

