In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score, precision_score
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay

In [3]:
import pickle5 as pickle
import numpy as np
import json

In [4]:
with open("../Corpus-Charles-Bruneau/outdir/merged.pkl", "rb") as fh:
    data = pickle.load(fh)
data = data.dropna()

In [5]:
annotated_data = pd.DataFrame(data[data.annotations.apply(len)>0])
annotated_data.head()

Unnamed: 0,text,annotations,sentence_id,type,chapter
48,Les gendarmes ne m'autorisaient pas à aller à ...,[simple mention],33,Note,1
61,Je n'ai pas oublié mon effarement le jour où l...,[simple mention],45,Note,1
64,"Mon père,que la sous préfecture avait oublié d...",[simple mention],48,Note,1
91,On dit encore que les forts du Camp des Romain...,[simple mention],75,Letter to Family,2
93,Position très importante : et d'où les Alleman...,[simple mention],77,Letter to Family,2


In [6]:
# see how much data is annotated
data.shape , annotated_data.shape

((7635, 5), (580, 5))

In [39]:
annotated_data.text.iloc[0]

'Les gendarmes ne m\'autorisaient pas à aller à Nancy,"qui était peut-être aux mains des Allemands".'

In [7]:
# data.annotations.value_counts()
data.annotations.apply(set).apply(list).apply(sorted).apply(tuple).value_counts()

()                                                                               7055
(simple mention,)                                                                 385
(mention with negative sentiment,)                                                124
(mention with stereotype(s),)                                                      48
(mention with negative sentiment, mention with stereotype(s))                      16
(mention with stereotype(s), simple mention)                                        4
(mention with negative sentiment, simple mention)                                   2
(mention with negative sentiment, mention with stereotype(s), simple mention)       1
Name: annotations, dtype: int64

In [8]:
pd.Series(sum([item for item in data.annotations], [])).value_counts()

simple mention                     392
mention with negative sentiment    143
mention with stereotype(s)          69
dtype: int64

In [9]:
def check_characterization(col):
    return 1 if len(col)>0 else 0

In [10]:
data['is_characterization'] = data.annotations.apply(check_characterization)
data['is_characterization'].value_counts()

0    7055
1     580
Name: is_characterization, dtype: int64

In [11]:
data.shape, data[data.annotations.apply(len)>0].shape, data.type.shape

((7635, 6), (580, 6), (7635,))

## Split data into train,dev,test

In [12]:
vectorizer = TfidfVectorizer()
bow = np.array(vectorizer.fit_transform(data.text.values).todense())

In [13]:
# Let's split the data in 80:10:10 for train:dev:test dataset
train_size=0.8
X = bow
y = data['is_characterization']

In [14]:
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)

test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape,  y_test.shape)

(6108, 9983) (6108,)
(763, 9983) (763,)
(764, 9983) (764,)


### 0. Random classifier

In [15]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

In [16]:
dummy_clf.fit(X_train, y_train)
dummy_pred = dummy_clf.predict(X_test)
f1_score(y_test, dummy_pred, average='weighted')

0.8838038717776795

### 1. Binary classification with MNB

In [17]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [18]:
print(f1_score(y_test, y_pred, average=None)) 
print(accuracy_score(y_test, y_pred))

[0.95841854 0.        ]
0.9201570680628273


In [19]:
f1_score(y_test, y_pred, average='weighted')

0.8831500694154469

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96       704
           1       0.00      0.00      0.00        60

    accuracy                           0.92       764
   macro avg       0.46      0.50      0.48       764
weighted avg       0.85      0.92      0.88       764



### 2. Binary classification with SVM

In [21]:
from sklearn.svm import LinearSVC
svm = LinearSVC(random_state=42).fit(X_train, y_train)

In [22]:
svm_pred = svm.predict(X_test)

In [23]:
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred, average='weighted')
print(f'Accuracy (linear Kernel): {(svm_accuracy*100)}')
print(f'F1 (linear Kernel): {svm_f1:.2f}')

Accuracy (linear Kernel): 96.33507853403141
F1 (linear Kernel): 0.96


In [24]:
# 0: no characterization, 1:characterization
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       704
           1       0.94      0.57      0.71        60

    accuracy                           0.96       764
   macro avg       0.95      0.78      0.84       764
weighted avg       0.96      0.96      0.96       764



In [25]:
f1_score(y_test, svm_pred, average='weighted')

0.9590767498318173

### 3. Pretrained transformer with classification head

In [30]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [44]:
# Import generic wrappers
from transformers import AutoModelForSequenceClassification, AutoTokenizer 
import torch

# Define the model repo
model_name = "moussaKam/barthez-sentiment-classification" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [46]:
# model

In [None]:
# Transform input tokens 
inputs = tokenizer("Hello world!", return_tensors="pt")

# Model apply
outputs = model(**inputs)

In [41]:
text = "Les gendarmes ne m\'autorisaient pas à aller à Nancy, qui était peut-être aux mains des Allemands"

| Method | F1 Score on test data (weighted avg)|
| --- | --- | 
| Random (baseline) |0.8553  | 
| MNB | 0.8553 | 
| SVM | 0.9508 | 
| Transformers |  | 

