<a href="https://colab.research.google.com/github/Everysimo/Muxi_SE4AI/blob/main/Muxi_BERT_NamedEntityK_FoldOpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.11-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.6.0 (from simpletransformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from simpletransformers)
  Downloading datasets-2.14.0-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers (from simpletransformers)
  

In [13]:
import matplotlib.pyplot as plt
import pandas as pd
import os
from six.moves import urllib
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from simpletransformers.ner import NERModel, NERArgs
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from nltk.stem import SnowballStemmer

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Plot appearance: seaborn whitegrid style sheet and a 20x10 default figure size.
plt.style.use("seaborn-v0_8-whitegrid")
plt.rcParams.update({'figure.figsize': (20, 10)})

# Raw GitHub URL of the dataset CSV. Despite the "ROOT" in the name this is the
# full URL of a single file, not a base directory.
DOWNLOAD_ROOT = (
    "https://raw.githubusercontent.com/Everysimo/Muxi_SE4AI/main/"
    "NERSchedulaMeetingDataSet.csv"
)

# Fetch the dataset from GitHub
def fetch_file_data1(file_url1=DOWNLOAD_ROOT, file_path1="file"):
    """Download the CSV at ``file_url1`` into the directory ``file_path1``.

    The target directory is created when it does not exist yet. The file is
    always stored under the fixed name ``IntentsMuxyDataset.csv``, whatever
    the remote file is called, and an existing copy is overwritten.

    Args:
        file_url1: URL of the file to download.
        file_path1: Local directory that receives the download.

    Returns:
        The path of the downloaded file, so callers do not have to rebuild it.
    """
    os.makedirs(file_path1, exist_ok=True)
    csv_path1 = os.path.join(file_path1, "IntentsMuxyDataset.csv")
    urllib.request.urlretrieve(file_url1, csv_path1)
    return csv_path1

# Download the CSV into ./file before reading it.
fetch_file_data1()

# Load the dataset.
# Every blank cell is forward-filled from the row above. Presumably this is
# meant for "Sentence #", which would then only be set on the first token of
# each sentence -- TODO confirm; note it also fills blank words and tags.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
data = (
    pd.read_csv("file/IntentsMuxyDataset.csv")
    .ffill()
    .rename(columns={"Sentence #": "sentence_id", "Word": "words", "Tag": "labels"})
)

# Normalise tag casing so the same tag never appears as two different labels.
data["labels"] = data["labels"].str.upper()

# Inputs (sentence id + token) and target (tag) as separate objects.
X, Y = data[["sentence_id", "words"]], data["labels"]

# Number of folds K
k = 5

# Shuffled K-fold splitter; the fixed seed keeps the folds repeatable.
kfold = KFold(shuffle=True, random_state=42, n_splits=k)

# Punctuation removal
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Lower-casing
def to_lowercase(text):
    """Return the lower-cased form of ``text``."""
    lowered = text.lower()
    return lowered


# Apply the preprocessing functions to the dataset
data["words"] = data["words"].apply(remove_punctuation).apply(to_lowercase)
# NOTE(review): a token made only of punctuation becomes an empty string here;
# confirm the dataset has none, or drop such rows (from the labels as well).

# X was sliced from `data` before the cleaning above ran, so it still held the
# raw tokens and the preprocessing never reached the folds. Rebuild it from the
# cleaned frame; the row index is unchanged, so it stays aligned with Y.
X = data[["sentence_id", "words"]]

# Per-fold metric accumulators, filled by the cross-validation loop.
precision_scores = []
recall_scores = []
f1_scores = []
eval_loss_scores = []

# The tag inventory is identical for every fold, so compute it once.
label = data["labels"].unique().tolist()

# Cross-validate over whole sentences, not over individual token rows: a
# row-level split puts tokens of one sentence on both sides of the fold, which
# leaks test sentences into training and leaves both sides with fragments.
sentence_ids = X["sentence_id"].drop_duplicates().to_numpy()

for train_index, test_index in kfold.split(sentence_ids):
    # train_index / test_index are positions into `sentence_ids`; turn them
    # into row masks that select every token of the chosen sentences.
    train_rows = X["sentence_id"].isin(sentence_ids[train_index])
    test_rows = X["sentence_id"].isin(sentence_ids[test_index])

    # Split the dataset into train set and test set for the current fold
    x_train, x_test = X[train_rows], X[test_rows]
    y_train, y_test = Y[train_rows], Y[test_rows]

    # Building up train data and test data
    train_data = pd.DataFrame({"sentence_id": x_train["sentence_id"], "words": x_train["words"], "labels": y_train})
    test_data = pd.DataFrame({"sentence_id": x_test["sentence_id"], "words": x_test["words"], "labels": y_test})

    # Model Training
    # A fresh args object per fold so nothing the model writes into it carries over.
    args = NERArgs()
    args.num_train_epochs = 30
    args.learning_rate = 1e-4
    args.overwrite_output_dir = True
    args.train_batch_size = 32
    args.eval_batch_size = 32

    # A new, untrained model for every fold. NERModel asks for CUDA by default,
    # so state the availability explicitly to keep CPU-only runtimes working.
    model = NERModel(
        'bert',
        'bert-base-cased',
        labels=label,
        args=args,
        use_cuda=torch.cuda.is_available(),
    )

    # Optimizer and scheduler are built by simpletransformers itself from `args`
    # (learning rate above). Extra keyword arguments to train_model are treated
    # as evaluation metric callables, so passing optimizer/scheduler objects
    # there never configured training and they are no longer passed.
    # NOTE(review): if zero warm-up is wanted, configure it through `args`
    # (warmup_ratio / warmup_steps) -- confirm against the library version used.
    model.train_model(train_data, eval_data=test_data)

    # Evaluate the model on test data
    result, model_outputs, preds_list = model.eval_model(test_data)

    print(result.keys())  # Print the keys in the result dictionary

    # Record this fold's metrics
    precision_scores.append(result['precision'])
    recall_scores.append(result['recall'])
    f1_scores.append(result['f1_score'])
    eval_loss_scores.append(result['eval_loss'])

# Mean of each metric over the k folds (sum of the per-fold values divided by k).
(
    average_precision_score,
    average_recall_score,
    average_f1_scores_score,
    average_eval_loss_score,
) = (
    sum(fold_values) / k
    for fold_values in (precision_scores, recall_scores, f1_scores, eval_loss_scores)
)

print(f"Average precision Score: {average_precision_score}")
print(f"Average recall Score: {average_recall_score}")
print(f"Average f1_scores Score: {average_f1_scores_score}")
print(f"Average eval_loss Score: {average_eval_loss_score}")

# Save model: `model` is whatever the last fold left behind, so only that
# fold's weights, tokenizer and config are written to ./model1.
for component in (model.model, model.tokenizer):
    component.save_pretrained('model1')
model.config.save_pretrained('model1/')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Running Epoch 0 of 30:   0%|          | 0/1 [00:00<?, ?it/s]



Running Epoch 1 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 2 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Smoke-test the trained model on one Italian meeting-scheduling request.
sample_sentences = [
    "crea una riunione nominata ciao amici per parlare delle pubbliche relazioni il due gennaio alle quindici e mezza fino alle sedici"
]
prediction, model_output = model.predict(sample_sentences)

# Bare last expression so the notebook displays the per-token tags.
prediction

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'crea': 'O'},
  {'una': 'O'},
  {'riunione': 'O'},
  {'nominata': 'O'},
  {'ciao': 'B_NOME-MEETING'},
  {'amici': 'B_NOME-MEETING'},
  {'per': 'O'},
  {'parlare': 'B_DESCR'},
  {'delle': 'B_DESCR'},
  {'pubbliche': 'B_DESCR'},
  {'relazioni': 'B_DESCR'},
  {'il': 'O'},
  {'due': 'B_GIORNO'},
  {'gennaio': 'B_MESE'},
  {'alle': 'O'},
  {'quindici': 'B_ORA-FINE'},
  {'e': 'B_ORA-INIZ'},
  {'mezza': 'B_ORA-INIZ'},
  {'fino': 'O'},
  {'alle': 'O'},
  {'sedici': 'B_ORA-FINE'}]]

In [None]:
import pickle
# Serialise the whole `model` object to model.pkl with the newest pickle protocol.
with open("model.pkl", 'wb') as pickle_out:
    pickle.dump(model, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)