# 1. Introduction

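Mount the Google Drive folder and change into the project directory. (Installing `simpletransformers` is not required: the classifier in this notebook is built with `scikit-learn`.)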

In [None]:
# Bring in Colab's Drive helper so that Google Drive can be mounted.
from google.colab import drive

# Mounting asks the user to authorize access to their Drive.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Work from the project folder on the mounted Drive, so that the relative data path used later ('../source_data') resolves against it.
%cd /content/drive/My Drive/Colab Notebooks/NaiveBayes

/content/drive/My Drive/Colab Notebooks/NaiveBayes


# 2. Data

Load the source data and create training and testing datasets.

In [None]:
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split

# Root folder of the corpus, relative to the current working directory.
# The loading loop below treats every sub-directory of it as one class.
data_folder = '../source_data'

def extract_sentences(text):
    """Split ``text`` on full stops and return the non-empty pieces.

    Every piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are discarded. The full stops themselves are not kept.
    """
    stripped_pieces = (piece.strip() for piece in text.split('.'))
    return [piece for piece in stripped_pieces if piece]

# Collect every sentence together with the name of the class folder it came from.
sentences, labels = [], []

# os.listdir() returns entries in arbitrary, filesystem-dependent order. Sorting
# keeps the order of `sentences` stable across machines, which is what makes the
# seeded train/test split below reproducible.
for class_folder in sorted(os.listdir(data_folder)):
    class_path = os.path.join(data_folder, class_folder)
    if not os.path.isdir(class_path) or class_folder.startswith('.'):
        continue
    for doc_file in sorted(os.listdir(class_path)):
        doc_path = os.path.join(class_path, doc_file)
        # Skip hidden entries and sub-directories (e.g. `.ipynb_checkpoints`):
        # open() would fail on a directory, and hidden files are assumed not to
        # be corpus documents.
        if doc_file.startswith('.') or not os.path.isfile(doc_path):
            continue
        with open(doc_path, 'r', encoding='utf-8') as f:
            text = f.read()
        doc_sentences = extract_sentences(text)
        sentences.extend(doc_sentences)
        # Every sentence of a document is labelled with its class folder name.
        labels.extend([class_folder] * len(doc_sentences))

# Mapping between class names and integer ids, and its inverse.
label_to_int = {'LETTA Enrico': 0, 'MELONI Giorgia': 1, 'CONTE Giuseppe': 2, 'DRAGHI Mario': 3, 'RENZI Matteo': 4, 'GENTILONI SILVERI Paolo': 5}
int_to_label = {value: key for key, value in label_to_int.items()}

# NOTE: this counts the whole corpus, i.e. the sentences *before* the split below.
print('Number of Training Data:', len(sentences))

# Hold out 10% of the sentences for testing; stratifying on the labels keeps the
# class proportions the same in both parts, and the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.1, stratify=labels, random_state=1946)



Number of Training Data: 24480


# 3. Train and Evaluate a NaiveBayes classifier

Train a Naive Bayes text classifier with `scikit-learn` (TF-IDF features fed into `MultinomialNB`) and evaluate it on the held-out test set.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Feature extraction: TF-IDF over unigrams and bigrams, capped at 100000 features.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), max_features=100000)
# Multinomial Naive Bayes with a small smoothing parameter.
nb_step = MultinomialNB(alpha=0.01)

# Chain both steps so that raw sentences can be passed straight to fit/predict.
classifier = Pipeline(steps=[('tfidf', tfidf_step), ('nb', nb_step)])
classifier.fit(X_train, y_train)

# Predicted labels and per-class probabilities for the held-out sentences.
y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print('\nClassification Report:')
print(report)


Classification Report:
                         precision    recall  f1-score   support

         CONTE Giuseppe       0.82      0.88      0.85       829
           DRAGHI Mario       0.67      0.58      0.62       266
GENTILONI SILVERI Paolo       0.70      0.56      0.62       139
           LETTA Enrico       0.66      0.52      0.59       242
         MELONI Giorgia       0.80      0.87      0.83       690
           RENZI Matteo       0.74      0.74      0.74       282

               accuracy                           0.77      2448
              macro avg       0.73      0.69      0.71      2448
           weighted avg       0.77      0.77      0.77      2448

