# 1. Introduction

Mount the Google Drive folder and install `simpletransformers`.

In [None]:
# Colab helper that exposes Google Drive inside the runtime
from google.colab import drive

# Mounting prompts for authorization; if the drive is already mounted the call
# only reports that (see the cell output).
drive_root = '/content/drive'
drive.mount(drive_root)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the project folder on Drive: the relative paths used later
# ('../source_data' and the local fine-tuned model directory) resolve from here.
%cd /content/drive/My Drive/Colab Notebooks/BERT

/content/drive/My Drive/Colab Notebooks/BERT


In [None]:
# install simpletransformers
!pip install simpletransformers

# 2. Data

Load the source data and create training and testing datasets.

In [None]:
import os
import re
from sklearn.model_selection import train_test_split

data_folder = '../source_data'

def extract_sentences(text):
    """Split ``text`` into sentences on literal full stops.

    Every fragment is stripped of surrounding whitespace, and fragments that
    are empty after stripping are discarded. The full stops themselves are
    not part of the returned sentences.

    Args:
        text: The document content as a single string.

    Returns:
        A list of non-empty, stripped sentence strings in document order.
    """
    cleaned = []
    for fragment in re.split(r'\.', text):
        fragment = fragment.strip()
        if fragment:
            cleaned.append(fragment)
    return cleaned

sentences, labels = [], []

# os.listdir() returns entries in arbitrary order. Sorting both levels makes
# the corpus order deterministic, so the seeded split below yields the same
# train/test partition on every run.
# NOTE(review): a model fine-tuned on a split produced by a different ordering
# may have seen some of these test sentences -- confirm or retrain.
for class_folder in sorted(os.listdir(data_folder)):
    class_path = os.path.join(data_folder, class_folder)
    # Each visible sub-folder is one class; the folder name is the label.
    if not os.path.isdir(class_path) or class_folder.startswith('.'):
        continue
    for doc_file in sorted(os.listdir(class_path)):
        doc_path = os.path.join(class_path, doc_file)
        # Skip hidden entries and nested folders, which cannot be read as
        # UTF-8 documents and would otherwise abort the whole load.
        if doc_file.startswith('.') or not os.path.isfile(doc_path):
            continue
        with open(doc_path, 'r', encoding='utf-8') as f:
            text = f.read()
        doc_sentences = extract_sentences(text)
        sentences.extend(doc_sentences)
        # One label per sentence, taken from the enclosing class folder.
        labels.extend([class_folder] * len(doc_sentences))

label_to_int = {'LETTA Enrico': 0, 'MELONI Giorgia': 1, 'CONTE Giuseppe': 2, 'DRAGHI Mario': 3, 'RENZI Matteo': 4, 'GENTILONI SILVERI Paolo': 5}
int_to_label = {value: key for key, value in label_to_int.items()}

# Fail here, with a readable message, if a class folder has no integer id;
# otherwise the problem only surfaces later as a bare KeyError.
unknown_labels = sorted(set(labels) - set(label_to_int))
if unknown_labels:
    raise ValueError(f'Class folders missing from label_to_int: {unknown_labels}')

# This count is the whole corpus, i.e. before the train/test split.
print('Number of Training Data:', len(sentences))

# Stratified 90/10 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=1946,
)




Number of Training Data: 24480


# 3. Create Language Model

Create a classification model using the Hugging Face model `dbmdz/bert-base-italian-xxl-cased`.

In [None]:
from simpletransformers.classification import ClassificationModel
import torch


# Use the GPU only when PyTorch reports that one is available
cuda_available = torch.cuda.is_available()

# Hyperparameters passed to the simpletransformers model
train_args = {
    "num_train_epochs": 5,
    "fp16": False,
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    # every save_* option is set to False
    "save_steps": False,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "save_optimizer_and_scheduler": False,
}

# Build the six-label BERT classifier
model = ClassificationModel(
    # First training run: fetch the base model directly from HuggingFace
    #"bert", "dbmdz/bert-base-italian-xxl-cased",
    # After training: load the local fine-tuned copy
    "bert", "fine_tuned_bert-base-italian-xxl-cased_5",
    num_labels=6,
    args=train_args,
    use_cuda=cuda_available,
)

# 4. Prepare training data

Create pandas DataFrames to train and test the model.

In [None]:
import pandas as pd

# Pair every sentence with the integer id of its class label
train_rows = [(text, label_to_int[name]) for text, name in zip(X_train, y_train)]
test_rows = [(text, label_to_int[name]) for text, name in zip(X_test, y_test)]

# Two-column frames: the sentence text and its integer label
train_df = pd.DataFrame(train_rows, columns=['text', 'label'])
test_df = pd.DataFrame(test_rows, columns=['text', 'label'])

# Report the size of each split
for frame in (train_df, test_df):
    print(frame.shape)




(22032, 2)
(2448, 2)


# 5. Train and save model (only the first time)

The model is trained and saved on the first run. On later runs, keep the training cell disabled: step 3 then loads the locally saved fine-tuned model instead of the base Hugging Face model.

In [None]:
# Fine-tuning is opt-in. Set TRAIN_MODEL to True for the first run (with the
# base HuggingFace model selected when the classifier is created), then set it
# back to False so later runs reuse the saved copy instead of retraining.
TRAIN_MODEL = False

# Directory that receives the fine-tuned weights, tokenizer and config
FINE_TUNED_DIR = 'fine_tuned_bert-base-italian-xxl-cased_5'

if TRAIN_MODEL:
    model.train_model(train_df)

    # Save all three artifacts into the same directory so that it can be
    # passed as a local model name later.
    for artifact in (model.model, model.tokenizer, model.config):
        artifact.save_pretrained(FINE_TUNED_DIR)

# 6. Predict

Here we predict the labels associated with the test dataset.

In [None]:
# Classify the held-out test sentences. `predictions` is compared with the
# integer labels of `test_df` in the evaluation step; `raw_outputs` is not
# used by any of the cells shown here.
predictions, raw_outputs = model.predict(X_test)

  0%|          | 0/2448 [00:00<?, ?it/s]

  0%|          | 0/306 [00:00<?, ?it/s]

# 7. Evaluation

Finally, we use `scikit-learn` to evaluate the model.

In [None]:
from sklearn.metrics import classification_report

print('\nClassification Report:')
# classification_report expects the ground truth first and the predictions
# second. Passing them by keyword prevents the per-class precision and recall
# from being swapped and makes `support` count the true examples per class.
print(classification_report(y_true=test_df['label'], y_pred=predictions))



Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.67      0.63       221
           1       0.89      0.82      0.85       748
           2       0.88      0.88      0.88       832
           3       0.73      0.70      0.71       280
           4       0.64      0.78      0.70       232
           5       0.65      0.67      0.66       135

    accuracy                           0.80      2448
   macro avg       0.73      0.75      0.74      2448
weighted avg       0.81      0.80      0.80      2448

