In [None]:
pip install transformers datasets torch scikit-learn


In [2]:
import json

from datasets import Dataset

# Read the raw records from disk. The printed output below shows a list of
# {"expression": ..., "tag": ...} dictionaries.
dataset_path = "/kaggle/input/jsonexpre/dataset.json"
with open(dataset_path, mode="r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

print(data)

# Wrap the list of records in a Hugging Face Dataset.
dataset = Dataset.from_list(data)

[{'expression': 'Réserve légale', 'tag': 'StatutoryReserve'}, {'expression': 'Réserve de capitaux', 'tag': 'CapitalReserve'}, {'expression': 'Capital versé additionnel', 'tag': 'AdditionalPaidinCapital'}, {'expression': 'Actifs courants', 'tag': 'CurrentAssets'}]


In [5]:
from transformers import AutoTokenizer

# Checkpoint name; the model-loading cell reads `model_name` as well.
# Alternative checkpoint: "yiyanghkust/finbert-tone"
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)


In [6]:
def tokenize_data(example):
    """Tokenize the "expression" field, truncated/padded to 128 tokens.

    This is applied through ``dataset.map(..., batched=True)``, so ``example``
    is a batch and ``example["expression"]`` is a list of strings.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(example["expression"], **encode_options)


tokenized_dataset = dataset.map(tokenize_data, batched=True)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [7]:
# Create a mapping from tag to ID.
# The tags are sorted so that the mapping is identical on every run: iterating
# a bare set() of strings follows hash order, which changes between Python
# processes and would silently renumber the labels after a kernel restart.
unique_tags = sorted({entry["tag"] for entry in data})
tag_to_id = {tag: idx for idx, tag in enumerate(unique_tags)}


# Add numerical labels to the dataset
def add_labels(example):
    """Return `example` with an integer "label" looked up from its "tag"."""
    example["label"] = tag_to_id[example["tag"]]
    return example


labeled_dataset = tokenized_dataset.map(add_labels)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [8]:
from datasets import DatasetDict

# Fixed seed so the same rows end up in the validation split on every run;
# without it train_test_split shuffles differently each time and results
# cannot be reproduced after a kernel restart.
SPLIT_SEED = 42

# Split into train and validation sets (20% of the rows held out).
dataset_split = labeled_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = dataset_split["train"]
val_dataset = dataset_split["test"]


In [None]:
from transformers import AutoModelForSequenceClassification

# Record the tag names in the model config so the saved checkpoint (and any
# pipeline built from it) reports real tag names instead of generic "LABEL_<n>".
# Each id is the tag's position in `unique_tags`, which is the same numbering
# used for the training labels.
id_to_tag = dict(enumerate(unique_tags))

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id_to_tag,
    label2id={tag: idx for idx, tag in id_to_tag.items()},
)


In [15]:
# Summarise the loaded network: module tree, total size, trainable size.
total_params = model.num_parameters()
trainable_params = sum(param.numel() for param in model.parameters() if param.requires_grad)

print(f"Model architecture:\n{model}")
print(f"\nNumber of parameters: {total_params:,}")
print(f"Number of trainable parameters: {trainable_params:,}")

Model architecture:
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Lay

In [17]:
import inspect

from transformers import TrainingArguments

# The evaluation-schedule argument is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Use whichever
# name the installed TrainingArguments accepts so this cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Evaluation and checkpointing both run once per epoch, which is what
    # load_best_model_at_end needs in order to pick the best checkpoint.
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},
)




In [None]:
from transformers import Trainer

# Collect the pieces prepared in the earlier cells, then build the Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning with the settings held in `training_args`.
trainer.train()


In [None]:
from transformers import Trainer, TrainerCallback
from tqdm import tqdm


class _TrainingProgressBar(TrainerCallback):
    """Drive a single tqdm bar from the Trainer's own optimizer-step counter."""

    def __init__(self, desc):
        self._desc = desc
        self._bar = None

    def on_train_begin(self, args, state, control, **kwargs):
        # state.max_steps is the total number of optimizer steps the Trainer
        # has planned for this run, so the bar total is never guessed by hand.
        self._bar = tqdm(total=state.max_steps, desc=self._desc)

    def on_step_end(self, args, state, control, **kwargs):
        if self._bar is not None:
            self._bar.update(1)

    def on_train_end(self, args, state, control, **kwargs):
        if self._bar is not None:
            self._bar.close()
            self._bar = None


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[_TrainingProgressBar("Training Progress")],
)

# One train() call already runs every epoch configured in `training_args`.
# Calling it once per epoch inside a loop, as this cell used to, repeated the
# whole schedule num_train_epochs times.
# NOTE: this cell is an alternative to the plain `trainer.train()` cell above;
# running both fine-tunes the same model twice.
trainer.train()

# Total optimizer steps of the run, taken from the Trainer instead of the
# earlier floor-division estimate (which was 0 for datasets smaller than a batch).
total_steps = trainer.state.max_steps


In [None]:
# Evaluate the trainer's model on the `eval_dataset` it was constructed with
# and show the returned metrics.
results = trainer.evaluate()
print(results)


In [None]:
# Write the model and the tokenizer into the same directory; the inference
# cell loads both from this path.
export_dir = "./AmelkisTAG"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
from transformers import pipeline

# Load the trained model.
# `top_k=None` requests the score of every label; it replaces the deprecated
# `return_all_scores=True` flag.
classifier = pipeline(
    "text-classification",
    model="./AmelkisTAG",
    tokenizer="./AmelkisTAG",
    top_k=None,
)

# Predict
text = "Réserve légale"
prediction = classifier(text)
print(prediction)
