# Backlog

- [X?] Le tester avec un petit modèle
- [X] LDA + Topic analysis
- [?] TFIDF + LogisticRegression, SGDClassifier
- [X] BERT
- [_] 
- [_] Target preprocessing

In [None]:
%load_ext autoreload
%autoreload 2
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from src.dataset import Dataset
import matplotlib.pyplot as plt
from cuml.naive_bayes import MultinomialNB
from peft import LoraConfig, TaskType
from pathlib import Path
import nltk
import torch
import cudf
import cupy as cp
import numpy as np
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from transformers import AutoTokenizer
import logging
import os

# Only let ERROR-and-above records through the root logger.
logging.getLogger().setLevel(logging.ERROR)

# Alternative style sheet:
# 'https://raw.githubusercontent.com/AlanBlanchet/matplotlib_styles/master/vscode_blue.mplstyle'
plt.style.use("ggplot")

# Presumably read by the MLflow integration used during training - TODO confirm.
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"

# Notebook-wide switches gating the expensive or optional cells below.
RUN_ALL_DATA = True           # use the full dataset instead of the 3500-row sample
RUN_ANIMATION_CELLS = False   # run the t-SNE cells
RUN_HEAVY = False             # within t-SNE cells, use the animated variant
RUN_IGNORABLE = False         # run purely informational previews
RUN_VIZ = False               # run confusion matrices, reports and LDA visualisations

In [None]:
# Two views of the same "topics1" dataset: a sample built with n=3500 and the
# one built without a limit.
short_topics = Dataset("topics1", n=3500)
all_topics = Dataset("topics1")

# Pick the working dataset according to the RUN_ALL_DATA switch.
if RUN_ALL_DATA:
    topics = all_topics
else:
    topics = short_topics

topics.df.head()

Voici les types d'approches à tester :

- Bag of Words (BoW) pour nous
- Word Embeddings : Word2Vec

## Prédiction - TF IDF

Le but est de commencer à effectuer des prédictions le plus vite possible pour ensuite les améliorer dans la prochaine itération. Ainsi, même si j'ai déjà remarqué quelques problèmes dans mon dataset, je vais faire des prédictions.

Dans cette partie on va effectuer un bag of words avec tous les mots disponibles.

On va ensuite appliquer l'algorithme TF-IDF afin d'obtenir, pour chaque titre, un vecteur de poids (et non un simple encodage one-hot). On pourra ainsi comparer ces vecteurs et visualiser la proximité des phrases.

In [None]:
# Count every space-separated token across all titles.
f = nltk.FreqDist(
    word
    for title in topics.df["title"]
    for word in title.split(" ")
)
# Number of distinct tokens.
print(len(f))

On a maintenant des mots uniques à notre disposition pour effectuer nos prédictions. Or pour le moment on ne prédit rien. Commençons simplement par une visualisation des mots les plus importants de notre liste

In [None]:
# Show the 10 most frequent title tokens with their counts.
f.most_common(10)

In [None]:
# First quartile of the token counts; np.quantile does not need sorted input,
# so the counts are passed as-is.
q1 = np.quantile(list(f.values()), 0.25)
# Tokens at or below that count are considered too rare to keep. Only
# membership is needed afterwards, so a set is enough.
to_remove = {token for token, count in f.items() if count <= q1}
len(to_remove)

In [None]:
# Rebuild each title without the rare tokens collected in `to_remove`.
topics.df["short_title"] = topics.df["title"].parallel_apply(
    lambda x: ' '.join(t for t in x.split(" ") if t not in to_remove)
)
if RUN_IGNORABLE:
    # A bare expression inside an `if` is not rendered by the notebook, so the
    # preview has to be printed explicitly.
    print(topics.df["short_title"].head())

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# Get top labels - also a memory saver: only labels whose frequency is above
# the 0.98 quantile of label counts are kept.
labels = topics.df["target1"]
labels_counts = labels.value_counts()
q = np.quantile(labels_counts.values, 0.98)
over_labels = labels_counts[labels_counts > q].index
are_labels_in = labels.isin(over_labels)

# Targets: kept labels mapped to ids through the dataset helper.
y_labels = labels[are_labels_in].reset_index(drop=True).to_numpy()
y = topics.label2id(y_labels)

# Features: TF-IDF of the filtered titles for the same rows.
vectorizer = TfidfVectorizer(min_df=3)
X = vectorizer.fit_transform(
    cudf.Series(topics.df.loc[are_labels_in, "short_title"].reset_index(drop=True))
)
X.shape

## Split

In [None]:
# Start with the first target (target1): stratified 80/20 split on the densified matrix.
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), np.array(y, dtype=np.float64), test_size=0.2, stratify=y)
# Save memory: store both splits as CuPy CSR sparse matrices again.
X_train = cp.sparse.csr_matrix(cp.array(X_train))
X_test = cp.sparse.csr_matrix(cp.array(X_test))

## MultinomialNB

In [None]:
# Fit a multinomial naive Bayes classifier (cuML) on the title TF-IDF features.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

In [None]:
# Predict on the densified test split and bring the result back as a Python list.
y_pred = naive_bayes.predict(X_test.toarray()).tolist()

if RUN_VIZ:
    # y_test was produced by sklearn's train_test_split from a NumPy array, so
    # it is an ndarray and has no .to_numpy() method; np.asarray covers it.
    disp = ConfusionMatrixDisplay.from_predictions(topics.id2label(np.asarray(y_test)), topics.id2label(y_pred))
    plt.title("< Q[0.98] target confusion matrix")
    plt.xticks(rotation=45, ha='right')
    plt.grid(False)

In [None]:
if RUN_VIZ:
    # y_test is a NumPy array (output of sklearn's train_test_split), which has
    # no .to_numpy() method; np.asarray is used instead.
    print(classification_report(topics.id2label(np.asarray(y_test)), topics.id2label(y_pred), zero_division=0))

Ici on remarque que beaucoup de prédictions sont faites pour le langage "C#" et que son score n'est pas terrible.

Cela peut s'expliquer par le fait que le titre des questions est trop générique ou présente un concept de code qui peut s'appliquer à différents langages. Ex : "Comment ajouter un élément à un tableau ?"

On ne pourrait donc pas déterminer le tag avec uniquement le titre.

Visualisons ces données

## t-SNE

In [None]:
from src.tsne import tsne, tsne_anim

# Default score when the t-SNE cells are disabled.
score = 0

# Both variants work on the first 10000 rows of the title features.
if RUN_ANIMATION_CELLS and RUN_HEAVY:
    tsne_anim("title_tsne", X[:10000].todense(), y[:10000], topics.id2label(y)[:10000])
elif RUN_ANIMATION_CELLS:
    # Only the second returned value is kept; the first is dropped right away.
    _, score = tsne(X[:10000].todense(), y[:10000], topics.id2label(y)[:10000])
    del _
    print(score)

## Text

Faisons la même chose mais pour les descriptions ("text")

In [None]:
# Count every space-separated token across all descriptions ("text").
f = nltk.FreqDist(
    word
    for text in topics.df["text"]
    for word in text.split(" ")
)


In [None]:
# Token frequencies over the descriptions.
f = nltk.FreqDist()
for t in topics.df["text"]:
    f.update(t.split(" "))

# Memory error if too many columns: drop every token whose count is at or
# below the 0.98 quantile. np.quantile does not need sorted input, and only
# membership is tested afterwards, so a set is enough.
q = np.quantile(list(f.values()), 0.98)
to_remove = {token for token, count in f.items() if count <= q}

topics.df["short_text"] = topics.df["text"].parallel_apply(
    lambda x: ' '.join(t for t in x.split(" ") if t not in to_remove)
)

vectorizer = TfidfVectorizer(min_df=3)

# Same rows as for the title experiment (labels above the count quantile).
X = topics.df.loc[are_labels_in,"short_text"].reset_index(drop=True)
X = vectorizer.fit_transform(cudf.Series(X))

# Stratified 80/20 split, then back to CuPy CSR matrices.
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), np.array(y, dtype=np.float64), test_size=0.2, stratify=y)
X_train = cp.sparse.csr_matrix(cp.array(X_train))
X_test = cp.sparse.csr_matrix(cp.array(X_test))

In [None]:
# Refit the same naive Bayes estimator on the description features and predict.
naive_bayes.fit(X_train, y_train)
y_pred = naive_bayes.predict(X_test.toarray()).tolist()

if RUN_VIZ:
    # y_test was produced by sklearn's train_test_split from a NumPy array, so
    # it is an ndarray and has no .to_numpy() method; np.asarray covers it.
    disp = ConfusionMatrixDisplay.from_predictions(topics.id2label(np.asarray(y_test)), topics.id2label(y_pred))
    plt.title("< Q[0.98] target confusion matrix")
    plt.xticks(rotation=45, ha='right')
    plt.grid(False)

In [None]:
if RUN_VIZ:
    # y_test is a NumPy array (output of sklearn's train_test_split), which has
    # no .to_numpy() method; np.asarray is used instead.
    print(classification_report(topics.id2label(np.asarray(y_test)), topics.id2label(y_pred), zero_division=0))

In [None]:
# Default score when the t-SNE cells are disabled.
score = 0

if RUN_ANIMATION_CELLS and RUN_HEAVY:
    # Animated variant, limited to the first 10000 rows.
    tsne_anim("text_tsne", X[:10000].todense(), y[:10000], topics.id2label(y)[:10000])
elif RUN_ANIMATION_CELLS:
    # Static variant on every row; only the second returned value is kept.
    _, score = tsne(X.todense(), y, topics.id2label(y))
    del _
    print(score)

## LDA & Word2Vec

In [None]:
from gensim.models import LdaModel, Word2Vec
from gensim.corpora import Dictionary

# NOTE(review): presumably moves the dataset's dataframe off the GPU - confirm in src.dataset.
topics.to("cpu")

# One token list per document, taken from the filtered descriptions.
docs = topics.df["short_text"].str.split(" ").values

# Gensim vocabulary and bag-of-words corpus built from those documents.
dictionary = Dictionary(docs)
corpus = list(map(dictionary.doc2bow, docs))

len(dictionary), len(corpus)

In [None]:
if RUN_VIZ:
    # 20-topic LDA over the bag-of-words corpus.
    model = LdaModel(
        corpus=corpus,
        num_topics=20,
        id2word=dictionary
    )
    vis_data = gensimvis.prepare(model, corpus, dictionary)
    # The value returned by pyLDAvis.display is only rendered automatically when
    # it is the last top-level expression of a cell; inside this `if` it must be
    # handed to IPython's `display` (available without import in notebooks).
    display(pyLDAvis.display(vis_data))

In [None]:
if RUN_VIZ:
    # NOTE(review): "short_text" is only added to `topics.df`; when RUN_ALL_DATA
    # is False, `all_topics` is a different object and may lack that column - confirm.
    all_topics.to("cpu")
    docs = all_topics.df["short_text"].str.split(" ").values

    dictionary = Dictionary(docs)

    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # 20-topic LDA over the full dataset.
    model = LdaModel(
        corpus=corpus,
        num_topics=20,
        id2word=dictionary
    )

    vis_data = gensimvis.prepare(model, corpus, dictionary)
    # The value returned by pyLDAvis.display is only rendered automatically when
    # it is the last top-level expression of a cell; inside this `if` it must be
    # handed to IPython's `display` (available without import in notebooks).
    display(pyLDAvis.display(vis_data))

In [None]:
# Word2Vec settings shared by the cells below.
# w2v_size is the width of the sentence-embedding matrix built later on.
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100
# NOTE(review): presumably a maximum sequence length - confirm where it is consumed.
maxlen = 24

### Test on title + text with Word2Vec

In [None]:
# Join title and description with a space: plain concatenation would glue the
# last word of the title to the first word of the description.
sentences = topics.df["title"] + " " + topics.df["text"]
# Tokenise each document on single spaces.
sentences_split = [sentence.split(" ") for sentence in sentences]

In [None]:
# Train Word2Vec on the tokenised documents. Window, vector size and epoch count
# come from the notebook-level constants so that the embedding matrix, which is
# sized with `w2v_size`, always matches the model.
# NOTE(review): min_count stays at 4 as before, although `w2v_min_count` is 1 -
# confirm which value is intended.
w2v = Word2Vec(window=w2v_window, min_count=4, workers=10, vector_size=w2v_size, seed=0)
w2v.build_vocab(sentences_split)
w2v.train(sentences_split, total_examples=w2v.corpus_count, epochs=w2v_epochs)
# Keyed vectors: word -> embedding lookup.
vecs = w2v.wv

Chaque mot est assigné à un vecteur de dimension 300.

On peut donc maintenant représenter chacune de nos phrases par un embedding de dimension 300 : pour une phrase, on prend la moyenne des vecteurs de ses mots.

In [None]:
# One row per document; rows stay at zero when no token is in the vocabulary.
emb = np.zeros((len(sentences_split), w2v_size))
for row, tokens in enumerate(sentences_split):
    known = [vecs[token] for token in tokens if token in vecs]
    if known:
        # Sentence embedding = mean of its known word vectors.
        emb[row] = np.array(known).mean(axis=0)


In [None]:
# Features are the mean Word2Vec embeddings; targets are the ids of every "target1" label.
X = emb
y = topics.label2id(topics.df["target1"])

In [None]:
# Run the project's t-SNE helper on the embeddings with show=False.
tsne(torch.tensor(X), y, topics.id2label(y), show=False)

In [None]:
# 80/20 split of the embeddings; unlike the TF-IDF splits above, no stratify argument is passed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, jaccard_score

# Linear baselines on the Word2Vec sentence embeddings, evaluated the same way.
classifiers = {
    "LogisticRegression": LogisticRegression(n_jobs=8, random_state=0, max_iter=500),
    "SGDClassifier": SGDClassifier(n_jobs=8, max_iter=2000),
}

for clf_name, model in classifiers.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Report accuracy and the weighted Jaccard score on the test split.
    accuracy = accuracy_score(y_test, y_pred)
    jaccard = jaccard_score(y_test, y_pred, average="weighted")
    print(f"{clf_name} {accuracy=} {jaccard=}")

# BERT

In [None]:
# Checkpoint name reused for the tokenizer and for the trainer below.
model_name = "bert-base-uncased"

# Load the tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Transform to the Dataset API: "target1" is exposed as "labels", inputs are
# tokenised with sentence_length=128.
dataset = topics.to_datasets(["target1"], {"target1": "labels"}, tokenizer=tokenizer, sentence_length=128)

# LoRA adapter configuration.
# NOTE(review): the target is one label per document, which looks like sequence
# classification (TaskType.SEQ_CLS) rather than TOKEN_CLS - confirm against the
# model built by the trainer helper before changing.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=256,
    lora_alpha=32,
    lora_dropout=0,
)

# Build the trainer from the same dataset object that produced `dataset`, so both
# stay consistent whichever way RUN_ALL_DATA is set.
trainer = topics.trainer(model_name, dataset, peft=peft_config, batch_size=8, lr=1e-3)

In [None]:
# Launch the fine-tuning run.
trainer.train()

Le reste du script est lancé directement avec l'interpréteur python car la mémoire GPU / RAM est instable avec les notebooks pour une raison qui m'est inconnue.

Les fichiers des modèles sont présents dans src/run