In [None]:
!pip install torch==2.2.2 
!pip install transformers==4.32.1 
!pip install seqeval==1.2.2 
!pip install accelerate

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import logging
import time
import os
import csv
from bs4 import BeautifulSoup

In [5]:
class ContactLinkModel:
    
    def __init__(self, model_name='bert-base-cased', num_labels=2, max_length=40):
        self.model_name = model_name
        self.num_labels = num_labels
        self.max_length = max_length
        self.tokenizer = None
        self.model = None
        self.linkProcessing = LinkProcessing()
        
    def load_from_huggingface(self):
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.model = BertForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels= self.num_labels
        )

    def load_from_local(self, tokenizer_path='bert-base-cased', model_path='./Models/model_0/model_contact_40_maxlen_10_epochs'):
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.model = BertForSequenceClassification.from_pretrained(model_path)
    
    def preprocess(self, texts, truncation=True, padding=True):
        return self.tokenizer(
            texts, 
            padding=padding,
            truncation=truncation,
            max_length=self.max_length, 
            return_tensors="pt"
        )

    def get_original_tokens(self, input_ids):
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        return tokens

    def compute_metrics(self, preds, labels):
        preds = preds.argmax(-1)
        accuracy = accuracy_score(labels, preds)
        recall = recall_score(labels, preds, average='binary')
        precision = precision_score(labels, preds, average='binary')
        f1 = f1_score(labels, preds, average='binary')
        return accuracy, precision, recall, f1

    def train(self, train_texts, train_labels, val_texts, val_labels, num_epochs, batch_size):
        
        os.makedirs("./Models/model_0", exist_ok=True)
        model_save_path = f"./Models/model_0/model_contact_{self.max_length}_maxlen_{num_epochs}_epochs"

        train_encodings = self.preprocess(train_texts)
        val_encodings = self.preprocess(val_texts)

        train_dataset = Dataset(train_encodings, train_labels)
        val_dataset = Dataset(val_encodings, val_labels)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        
        optimizer = AdamW(self.model.parameters(), lr=2e-5)
        total_steps = len(train_loader) * num_epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


        csv_filename = f"{model_save_path}_info.csv"
        header = ["training_details"]
        is_empty = True
        
        start_time = time.time()
        for epoch in range(num_epochs):
            self.model.train()
            total_loss = 0
            for batch in train_loader:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
                scheduler.step()

            avg_train_loss = total_loss / len(train_loader)

            # Validation
            self.model.eval()
            val_preds, val_labels = [], []
            val_total_loss = 0.0
            with torch.no_grad():
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                    logits = outputs.logits
                    val_preds.extend(logits.detach().cpu().numpy())
                    val_labels.extend(labels.cpu().numpy())
                    loss = outputs.loss
                    val_total_loss += loss.item()

            avg_val_loss = val_total_loss / len(val_loader)

            val_preds = np.array(val_preds)
            val_labels = np.array(val_labels)
            accuracy, precision, recall, f1 = self.compute_metrics(val_preds, val_labels)

            training_details = (f"Époque {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f} - "
                                f"Validation Loss: {avg_val_loss:.4f} - Validation Accuracy: {accuracy:.4f} - "
                                f"Precision: {precision:.4f} - Recall: {recall:.4f} - F1 Score: {f1:.4f}")
            print(training_details)
        
            with open(csv_filename, 'a', newline='', encoding='utf-8-sig') as csvfile:
                csv_writer = csv.writer(csvfile)
                if is_empty:
                    csv_writer.writerow(header)
                    is_empty = False
                csv_writer.writerow([training_details])

        end_time = time.time()
        total_fine_tuning_time = end_time - start_time
        
        training_details = f"Fine-tuning terminé! Temps total: {total_fine_tuning_time:.2f} secondes | {total_fine_tuning_time/60:.2f} min | {total_fine_tuning_time/3600:.2f} hours"
        print(training_details)

        # Enregistrer le modèle finetuné
        self.model.save_pretrained(model_save_path)
        print(f"Modèle enregistré à {model_save_path}")
        
        with open(csv_filename, 'a', newline='', encoding='utf-8-sig') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow([training_details])
            csv_writer.writerow([f"Modèle enregistré à {model_save_path}"])


    def predict(self, text):
        inputs = self.preprocess([text])
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predictions = predictions.cpu().detach().numpy()
        predictions = np.argmax(predictions)
        return predictions

    def predict_label_links(self, cleaned_links):
        link_name_label = []
        for i in range(len(cleaned_links)):
            predictions = self.predict(cleaned_links[i][1])
            link_name_label.append((cleaned_links[i][0], cleaned_links[i][1], predictions))
        return link_name_label
        
    def get_contact_links(self, htmlContent):
        links = self.linkProcessing.preprocess_links(htmlContent)
        predictedLinks = self.predict_label_links(links)
        return [link for link in predictedLinks if link[2]==1]

In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [9]:
class LinkProcessing :
    def __init__(self):
        self.max_len_link_name = 6
        
    def extract_links(self, contenu_html):
        soup = BeautifulSoup(contenu_html, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            text = link.get_text(strip=True)
            links.append((href, text))
        return links

    def remove_empty_links(self, links):
        cleaned_links = [(href, text) for href, text in links if href.strip() not in ("#", "")]
        return cleaned_links

    def filter_valid_name_links(self, links):
        cleaned_links = [(href, text) for href, text in links if text.strip() and len(text.split()) <= self.max_len_link_name]
        return cleaned_links

    def preprocess_links(self, contenu_html):
        links = self.extract_links(contenu_html)
        cleaned_links = self.remove_empty_links(links)
        cleaned_links = self.filter_valid_name_links(cleaned_links)
        return cleaned_links

In [11]:
data = pd.read_csv('./data/LINK_CONTACT_DATA_2.csv')
data

Unnamed: 0,link_name,label
0,contact,1
1,contactez-nous,1
2,Contactez nous,1
3,nous contacter,1
4,الإتصال,1
...,...,...
9948,الاتصال بنا,1
9949,اتصل بنا,1
9950,من نحن,1
9951,إتصل بنا,1


In [13]:
X = list(data["link_name"])
y = list(data["label"])

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [15]:
# Model parameters
model_name = 'bert-base-cased'
num_labels = 2
max_length = 40

In [17]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [19]:
# Initialize the classifier
classifier = ContactLinkModel(model_name, num_labels, max_length)
classifier.load_from_huggingface()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model
classifier.train(X_train, y_train, X_val, y_val, num_epochs=30, batch_size=8)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


## Test Model

In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.4 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.4 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.4 kB ? eta -:--:--
     ----------------------------------- -- 41.0/44.4 kB 326.8 kB/s eta 0:00:01
     -------------------------------------- 44.4/44.4 kB 313.2 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.25.1-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   ---------------

In [1]:
from ContactLinkModel import ContactLinkModel

classifier = ContactLinkModel()
classifier.load_from_local(model_path='./Models/model_0/model_contact_40_maxlen_10_epochs')

text = """<a class="nav-link" href="https://www.hespress.com/contact1"><div>اعمل معنا<div></a>
<a class="nav-link" href="https://www.hespress.com/contact1"><div>contactez-nous<div></a>
<a class="nav-link" href="#"><div>aaa<div></a>
<a class="nav-link" href="https://www.hespress.com/contact1"><div>bbb<div></a>
        <li class="menu-item nav-item"><a class="nav-link" href="https://jdjd/jd">cc cc cc cc cc cc cc cc</a>
        <a class="nav-link" href="https://www.hespress.com/contact2">text text</a>
        <a class="nav-link" href="https://www.hespress.com/contact2">contact</a>
        <a class="nav-link" href="https://www.hespress.com/contact2">Informations de contact</a>
        <a class="nav-link" href="https://www.hespress.com/contact2">هيئة التحرير</a>
        <a class="nav-link" href="https://www.hespress.com/contact2">من نحن؟</a>
        <a class="nav-link" href="https://www.hespress.com/contact2">اتصال</a>
        <a class="nav-link" href="https://www.hespress.com/contact2">economie</a>
        </li>"""

test = classifier.get_contact_links(text)
test

[('https://www.hespress.com/contact1', 'اعمل معنا'),
 ('https://www.hespress.com/contact1', 'contactez-nous'),
 ('https://www.hespress.com/contact2', 'contact'),
 ('https://www.hespress.com/contact2', 'Informations de contact'),
 ('https://www.hespress.com/contact2', 'هيئة التحرير'),
 ('https://www.hespress.com/contact2', 'من نحن؟'),
 ('https://www.hespress.com/contact2', 'اتصال')]