In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173664e-02 -4.28515933e-02 -1.56286228e-02  1.40537536e-02
  3.95537503e-02  1.21796280e-01  2.94333547e-02 -3.17523777e-02
  3.54959592e-02 -7.93140158e-02  1.75878201e-02 -4.04369757e-02
  4.97259907e-02  2.54912470e-02 -7.18699768e-02  8.14968571e-02
  1.47072668e-03  4.79627103e-02 -4.50335629e-02 -9.92174819e-02
 -2.81769522e-02  6.45045862e-02  4.44670357e-02 -4.76217456e-02
 -3.52952294e-02  4.38671485e-02 -5.28565794e-02  4.33043257e-04
  1.01921476e-01  1.64072420e-02  3.26996706e-02 -3.45986746e-02
  1.21339457e-02  7.94871226e-02  4.58339555e-03  1.57778654e-02
 -9.68205370e-03  2.87626162e-02 -5.05806580e-02 -1.55793857e-02
 -2.87907068e-02 -9.62280296e-03  3.15556414e-02  2.27349475e-02
  8.71449634e-02 -3.85027677e-02 -8.84718746e-02 -8.75497516e-03
 -2.12343577e-02  2.08924040e-02 -9.02078077e-02 -5.25732227e-02
 -1.05638755e-02  2.88311057e-02 -1.61454957e-02  6.17838651e-03
 -1.23234

In [2]:
import sys
sys.path.append('./src')

from src.data import load_data_part1, CustomAnalyzer
# from src.eval import fit_eval
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

In [3]:
X, y = load_data_part1()
X_embeded = model.encode(X)

In [4]:
X_embeded.shape, y.shape

((57413, 384), (57413,))

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, roc_auc_score

def print_score(y_test, pred, name):
    macro_f1 = f1_score(y_test, pred, average="macro")
    micro_f1 = f1_score(y_test, pred, average="micro")
    macro_auc = roc_auc_score(y_test, pred)
    micro_auc = roc_auc_score(y_test, pred, average="weighted")
    classif_report = classification_report(y_test, pred, output_dict=True)

    reports = f"""
    {name} :
    =====
    Macro F1-score : {macro_f1}
    Micro F1-score : {micro_f1}
    Macro ROC-AUC: {macro_auc}
    Weighted ROC-AUC: {micro_auc}
    Classification report :
    {classification_report(y_test, pred)}
    =====
    """
    print(reports)

    classif_report["macro_auc"] = macro_auc
    classif_report["micro_auc"] = micro_auc
    return classif_report


def fit_eval(X_train, y_train, X_test, y_test, balanced=None):
    """
    Entraine et évalue les algorithmes classiques de classification à partir des 
    différents sets de données.

    Parameters
    ----------
    X_train: Sparse Matrix
        Important : Une matrice BoW est attendu

    X_test: Sparse Matrix
        Important : Une matrice BoW est attendu

    y_train: list
        Label des données de train

    y_test: list
        Label des données de test

    balanced: None ou autre
    """
    # Naïve Bayes
    # if balanced is None:
    #     nb_clf = MultinomialNB()
    # else:
    #     balanced = "balanced"
    #     nb_clf = MultinomialNB(fit_prior=True)
    # nb_clf.fit(X_train, y_train)

    # Logistic Regression
    lr_clf = LogisticRegression(
        random_state=0, solver="lbfgs", n_jobs=-1, max_iter=10000, class_weight=balanced
    )
    lr_clf.fit(X_train, y_train)

    # Linear SVM
    svm_clf = LinearSVC(random_state=0, tol=1e-5, max_iter=20000, class_weight=balanced)
    svm_clf.fit(X_train, y_train)

    # pred_nb = nb_clf.predict(X_test)
    pred_lr = lr_clf.predict(X_test)
    pred_svm = svm_clf.predict(X_test)

    # Ridge Classifier ?
    results = (
        # print_score(y_test, pred_nb, "Naïve Bayes"),
        print_score(y_test, pred_lr, "Logistic Regression"),
        print_score(y_test, pred_svm, "SVM"),
    )
    algo_names = ["Logistic Regression", "SVM"]

    return results, algo_names

In [6]:
X_embeded_train, X_embeded_test, y_train, y_test = train_test_split(X_embeded, y, stratify=y)
fit_eval(X_embeded_train, y_train, X_embeded_test, y_test, balanced='balanced')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

(({'-1': {'precision': 0.25957011258955986,
    'recall': 0.6741095162147793,
    'f1-score': 0.37481525273425953,
    'support': 1881},
   '1': {'precision': 0.9352624353152392,
    'recall': 0.7100136294395896,
    'f1-score': 0.8072190319934374,
    'support': 12473},
   'accuracy': 0.7053086247735822,
   'macro avg': {'precision': 0.5974162739523996,
    'recall': 0.6920615728271844,
    'f1-score': 0.5910171423638484,
    'support': 14354},
   'weighted avg': {'precision': 0.8467172730575407,
    'recall': 0.7053086247735822,
    'f1-score': 0.7505552791171302,
    'support': 14354},
   'macro_auc': 0.6920615728271845,
   'micro_auc': 0.6920615728271845},
  {'-1': {'precision': 0.26239067055393583,
    'recall': 0.6698564593301436,
    'f1-score': 0.3770761633996708,
    'support': 1881},
   '1': {'precision': 0.9349874371859297,
    'recall': 0.7160266174937866,
    'f1-score': 0.8109875141884223,
    'support': 12473},
   'accuracy': 0.7099763132227951,
   'macro avg': {'precisi

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
fit_eval(X_train, y_train, X_test, y_test)


    Logistic Regression :
    =====
    Macro F1-score : 0.73810028158686
    Micro F1-score : 0.9030932144350007
    Macro ROC-AUC: 0.6968400602291913
    Weighted ROC-AUC: 0.6968400602291913
    Classification report :
                  precision    recall  f1-score   support

          -1       0.73      0.42      0.53      1881
           1       0.92      0.98      0.95     12473

    accuracy                           0.90     14354
   macro avg       0.82      0.70      0.74     14354
weighted avg       0.89      0.90      0.89     14354

    =====
    

    SVM :
    =====
    Macro F1-score : 0.7313071933409601
    Micro F1-score : 0.8855371325066184
    Macro ROC-AUC: 0.7167602595769542
    Weighted ROC-AUC: 0.7167602595769542
    Classification report :
                  precision    recall  f1-score   support

          -1       0.57      0.49      0.53      1881
           1       0.92      0.95      0.93     12473

    accuracy                           0.89     14354
  

(({'-1': {'precision': 0.7268518518518519,
    'recall': 0.417331206804891,
    'f1-score': 0.5302262749071259,
    'support': 1881},
   '1': {'precision': 0.9174325749585657,
    'recall': 0.9763489136534915,
    'f1-score': 0.9459742882665941,
    'support': 12473},
   'accuracy': 0.9030932144350007,
   'macro avg': {'precision': 0.8221422134052088,
    'recall': 0.6968400602291913,
    'f1-score': 0.73810028158686,
    'support': 14354},
   'weighted avg': {'precision': 0.8924581887133568,
    'recall': 0.9030932144350007,
    'f1-score': 0.8914931671066972,
    'support': 14354},
   'macro_auc': 0.6968400602291913,
   'micro_auc': 0.6968400602291913},
  {'-1': {'precision': 0.574468085106383,
    'recall': 0.4880382775119617,
    'f1-score': 0.5277378557056627,
    'support': 1881},
   '1': {'precision': 0.9245061147695203,
    'recall': 0.9454822416419466,
    'f1-score': 0.9348765309762576,
    'support': 12473},
   'accuracy': 0.8855371325066184,
   'macro avg': {'precision': 0.

In [9]:
fit_eval(X_train, y_train, X_test, y_test, balanced='balanced')


    Logistic Regression :
    =====
    Macro F1-score : 0.7282543691980656
    Micro F1-score : 0.8528633133621291
    Macro ROC-AUC: 0.7753848578746146
    Weighted ROC-AUC: 0.7753848578746146
    Classification report :
                  precision    recall  f1-score   support

          -1       0.46      0.67      0.54      1881
           1       0.95      0.88      0.91     12473

    accuracy                           0.85     14354
   macro avg       0.70      0.78      0.73     14354
weighted avg       0.88      0.85      0.86     14354

    =====
    

    SVM :
    =====
    Macro F1-score : 0.7147905866623758
    Micro F1-score : 0.8521666434443361
    Macro ROC-AUC: 0.7465420790033533
    Weighted ROC-AUC: 0.7465420790033533
    Classification report :
                  precision    recall  f1-score   support

          -1       0.45      0.60      0.52      1881
           1       0.94      0.89      0.91     12473

    accuracy                           0.85     14354


(({'-1': {'precision': 0.4580457682528151,
    'recall': 0.670388091440723,
    'f1-score': 0.5442382391022874,
    'support': 1881},
   '1': {'precision': 0.9465563313507456,
    'recall': 0.8803816243085064,
    'f1-score': 0.912270499293844,
    'support': 12473},
   'accuracy': 0.8528633133621291,
   'macro avg': {'precision': 0.7023010498017803,
    'recall': 0.7753848578746148,
    'f1-score': 0.7282543691980656,
    'support': 14354},
   'weighted avg': {'precision': 0.8825401428884906,
    'recall': 0.8528633133621291,
    'f1-score': 0.8640422227562713,
    'support': 14354},
   'macro_auc': 0.7753848578746146,
   'micro_auc': 0.7753848578746146},
  {'-1': {'precision': 0.4520111509358821,
    'recall': 0.6034024455077087,
    'f1-score': 0.5168488160291439,
    'support': 1881},
   '1': {'precision': 0.9370092037490501,
    'recall': 0.8896817124989979,
    'f1-score': 0.9127323572956079,
    'support': 12473},
   'accuracy': 0.8521666434443361,
   'macro avg': {'precision': 