# Backlog

- [X?] Le tester avec un petit modèle
- [X] LDA + Topic analysis
- [?] TFIDF + LogisticRegression, SGDClassifier
- [X] BERT
- [_] 
- [_] Target preprocessing

In [None]:
%load_ext autoreload
%autoreload 2
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from cuml.model_selection import train_test_split
from src.dataset import Dataset
import matplotlib.pyplot as plt
from cuml.naive_bayes import MultinomialNB
import nltk
import torch
import cuml
import cudf
import cupy as cp
import numpy as np
import mlflow
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from transformers import Trainer, TrainingArguments
import logging
import os

# Only let ERROR (and above) records through the root logger.
logging.getLogger().setLevel(logging.ERROR)

# Stack the built-in ggplot style with a remote custom style sheet.
plt.style.use([
    'ggplot',
    'https://raw.githubusercontent.com/AlanBlanchet/matplotlib_styles/master/vscode_blue.mplstyle',
])

# NOTE(review): presumably read by the transformers MLflow integration to
# flatten nested parameters before logging — confirm.
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"

# Track runs under ../mlruns/ and enable MLflow autologging.
mlflow.set_tracking_uri("../mlruns/")
mlflow.autolog()

# Notebook-wide switches consulted by the cells below.
RUN_ANIMATION_CELLS = False  # gates the tsne(...) calls
RUN_IGNORABLE = False  # gates the short_title preview
RUN_ALL_DATA = False  # True: use all_topics, False: use the n=5000 dataset
RUN_VIZ = False  # gates confusion matrices, classification reports and LDA views

In [None]:
# Two views of the same "topics1" dataset: one built with n=5000 and the full one.
short_topics = Dataset("topics1", n=5000)
all_topics = Dataset("topics1")

# RUN_ALL_DATA decides which of the two the rest of the notebook works on.
if RUN_ALL_DATA:
    topics = all_topics
else:
    topics = short_topics

topics.df.head()

Voici les types d'approches à tester :

- Bag of Words (BoW) pour nous
- Word Embeddings : Word2Vec

# Itération 1

Le but est de commencer à effectuer des prédictions le plus vite possible pour ensuite les améliorer dans la prochaine itération. Ainsi, même si j'ai déjà remarqué quelques problèmes dans mon dataset, je vais faire des prédictions.

## Bag of Words - TF IDF

Dans cette partie, on va effectuer un bag of words avec tous les mots disponibles.

On va ensuite appliquer l'algorithme TF-IDF afin d'obtenir, pour chaque titre, un vecteur pondéré (et non un simple encodage one-hot) : deux titres proches auront des vecteurs similaires. Ainsi on pourra visualiser la proximité des phrases.

In [None]:
# Count every space-separated token across all titles in one pass.
f = nltk.FreqDist(
    token
    for title in topics.df["title"].to_pandas()
    for token in title.split(" ")
)
# Number of distinct tokens seen.
print(len(f))

On a maintenant des mots uniques à notre disposition pour effectuer nos prédictions. Or pour le moment on ne prédit rien. Commençons simplement par une visualisation des mots les plus importants de notre liste

In [None]:
# Show the 10 most frequent title tokens together with their counts.
f.most_common(10)

In [None]:
# First quartile of the token frequencies: tokens at or below it are dropped.
q1 = np.quantile(sorted(f.values()), q=0.25)
to_remove = {
    token: count
    for token, count in f.items()
    if not count > q1
}
# How many distinct tokens will be filtered out.
len(to_remove)

In [None]:
# Rebuild each title without the tokens listed in `to_remove`.
topics.df["short_title"] = topics.df["title"].to_pandas().parallel_apply(
    lambda x: ' '.join([t for t in x.split(" ") if t not in to_remove])
)
# A bare expression nested inside an `if` block is never rendered by the
# notebook (only the last top-level expression of a cell is), so the optional
# preview is written as a top-level conditional expression instead.
topics.df["short_title"].head() if RUN_IGNORABLE else None

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer; min_df=3 presumably drops terms seen in fewer than
# 3 documents — confirm against the cuML documentation.
vectorizer = TfidfVectorizer(min_df=3)

# The target column is a "|"-separated string; only its first element is kept as label.
labels = cudf.Series(topics.df["target"].str.split("|").to_pandas().parallel_apply(lambda x: x[0]))
labels_counts = labels.value_counts()
# Get top labels - also a memory saver
# (labels whose count is strictly above the 0.98 quantile of the label counts)
q = np.quantile(labels_counts.values, 0.98)
over_labels = labels_counts[labels_counts > q].index
# Boolean mask over rows: True when the row's label is one of the kept labels.
are_labels_in = labels.isin(over_labels)

# Targets restricted to the kept rows, as label names and then as ids.
y_labels = labels[are_labels_in].reset_index(drop=True).to_numpy()
y = topics.label2id(y_labels)
# Matching filtered titles, re-indexed from 0.
X = topics.df.loc[are_labels_in,"short_title"].reset_index(drop=True)

# Replace the raw text by its TF-IDF matrix and show its shape.
X = vectorizer.fit_transform(X)
X.shape

In [None]:
# On utilise d'abord le premier target
X_train, X_test, y_train, y_test = train_test_split(cudf.DataFrame(X.toarray()), cudf.Series(np.array(y, dtype=np.float64)), test_size=0.2, stratify=y)
# Save memory
X_train = cp.sparse.csr_matrix(X_train.values)
X_test = cp.sparse.csr_matrix(X_test.values)

In [None]:
# Fit a multinomial naive Bayes classifier (default settings) on the title features.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

In [None]:
# Predict on the densified test split and keep the result as a plain list.
y_pred = naive_bayes.predict(X_test.toarray()).tolist()

if RUN_VIZ:
    # Map ids back to label names so the axes are readable.
    true_names = topics.id2label(y_test.to_numpy())
    predicted_names = topics.id2label(y_pred)
    disp = ConfusionMatrixDisplay.from_predictions(true_names, predicted_names)
    plt.title("< Q[0.98] target confusion matrix")
    plt.xticks(rotation=45, ha='right')
    plt.grid(False)

In [None]:
# Per-label precision/recall/F1 on the test split, using label names;
# zero_division=0 is passed for labels that have no predictions.
if RUN_VIZ:
    print(classification_report(topics.id2label(y_test.to_numpy()), topics.id2label(y_pred), zero_division=0))

Ici on remarque que beaucoup de prédictions sont faites pour le langage "C#" et que son score n'est pas terrible.

Cela peut s'expliquer par le fait que le titre des questions soit trop générique ou présente un concept de code qui peut s'appliquer dans différents langages. Ex : "Comment ajouter un élément à un tableau ?"

On ne pourrait donc pas déterminer le tag avec uniquement le titre.

Visualisons ces données

In [None]:
from src.tsne import tsne

# Optional t-SNE of the title TF-IDF features, passed together with the ids and label names.
# NOTE(review): the first argument is "text_tsne" although X holds *title*
# features here, and the later text cell uses the same string — confirm
# whether this name should differ.
if RUN_ANIMATION_CELLS:
    tsne("text_tsne", X, y, topics.id2label(y))


Faisons la même chose mais pour les descriptions ("text")

In [None]:
# Token frequencies over the question bodies ("text" column).
# NOTE(review): the following cell starts by recomputing exactly this
# distribution, so this cell looks redundant.
f = nltk.FreqDist()
for t in topics.df["text"].to_pandas():
    f.update(t.split(" "))


In [None]:
# Same pipeline as for the titles, applied to the question bodies ("text").

# Token frequencies over every text.
f = nltk.FreqDist()
for t in topics.df["text"].to_pandas():
    f.update(t.split(" "))

# Memory error if too much cols
# -> only tokens whose count is strictly above the 0.98 quantile are kept.
q = np.quantile(sorted(f.values()), 0.98)
to_remove = {k:v for k,v in f.items() if v <= q}

# Rebuild each text without the tokens listed in `to_remove`.
topics.df["short_text"] = topics.df["text"].to_pandas().parallel_apply(lambda x: ' '.join([t for t in x.split(" ") if t not in to_remove]))

# A fresh vectorizer, so the vocabulary is fitted on the text and not on the titles.
vectorizer = TfidfVectorizer(min_df=3)

# Reuse the row mask (are_labels_in) and targets (y) computed for the titles.
X = topics.df.loc[are_labels_in,"short_text"].reset_index(drop=True)
X = vectorizer.fit_transform(X)

# 80/20 split stratified on y, then back to sparse CSR matrices.
X_train, X_test, y_train, y_test = train_test_split(cudf.DataFrame(X.toarray()), cudf.Series(np.array(y, dtype=np.float64)), test_size=0.2, stratify=y)
X_train = cp.sparse.csr_matrix(X_train.values)
X_test = cp.sparse.csr_matrix(X_test.values)

In [None]:
# Refit the existing naive_bayes estimator, this time on the text features,
# and predict on the densified test split.
naive_bayes.fit(X_train, y_train)
y_pred = naive_bayes.predict(X_test.toarray()).tolist()

# Confusion matrix using label names.
# NOTE(review): the title says "< Q[0.98]" while the labels were selected
# with `labels_counts > q` — confirm the intended sign.
if RUN_VIZ:
    disp = ConfusionMatrixDisplay.from_predictions(topics.id2label(y_test.to_numpy()), topics.id2label(y_pred))
    plt.title("< Q[0.98] target confusion matrix")
    plt.xticks(rotation=45, ha='right');
    plt.grid(False)

In [None]:
# Per-label precision/recall/F1 for the text-based model, using label names.
if RUN_VIZ:
    print(classification_report(topics.id2label(y_test.to_numpy()), topics.id2label(y_pred), zero_division=0))

In [None]:
from src.tsne import tsne

# Optional t-SNE of the text TF-IDF features, passed together with the ids and label names.
if RUN_ANIMATION_CELLS:
    tsne("text_tsne", X, y, topics.id2label(y))

In [None]:
from gensim.models import LdaModel, Word2Vec, LdaSeqModel
from gensim.corpora import Dictionary

# Switch the dataset to "cpu" before handing its data to gensim.
topics.to("cpu")

# One token list per document, taken from the filtered text.
docs = topics.df["short_text"].str.split(" ").values

# Token <-> id mapping, then the bag-of-words form of every document.
dictionary = Dictionary(docs)
corpus = list(map(dictionary.doc2bow, docs))

len(dictionary), len(corpus)

In [None]:
# Fit a 20-topic LDA model on the current corpus and render it with pyLDAvis.
lda_view = None
if RUN_VIZ:
    model = LdaModel(
        corpus=corpus,
        num_topics=20,
        id2word=dictionary
    )
    vis_data = gensimvis.prepare(model, corpus, dictionary)
    lda_view = pyLDAvis.display(vis_data)

# The notebook only renders the last top-level expression of a cell; the
# display object created inside the `if` block would otherwise be discarded.
lda_view

In [None]:
# Same LDA visualisation as the previous cell, built on the full dataset.
lda_view = None
if RUN_VIZ:
    all_topics.to("cpu")
    # NOTE(review): "short_text" is only created on `topics.df` in this
    # notebook; when RUN_ALL_DATA is False, `all_topics` is a different
    # Dataset and may not have this column — confirm.
    docs = all_topics.df["short_text"].str.split(" ").values

    dictionary = Dictionary(docs)

    corpus = [dictionary.doc2bow(doc) for doc in docs]

    model = LdaModel(
        corpus=corpus,
        num_topics=20,
        id2word=dictionary
    )

    vis_data = gensimvis.prepare(model, corpus, dictionary)
    lda_view = pyLDAvis.display(vis_data)

# The notebook only renders the last top-level expression of a cell; the
# display object created inside the `if` block would otherwise be discarded.
lda_view

# BERT

In [None]:
# Drop the references to the TF-IDF matrices and splits before the BERT section.
del X, X_train, X_test, y_train, y_test

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and the classification model below.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Keep only the rows whose label passed the earlier frequency filter
# (are_labels_in), re-indexed from 0.
data = topics.df.loc[are_labels_in.to_numpy()].reset_index(drop=True)

In [None]:
from tqdm.contrib.concurrent import thread_map

def preprocess(data):
    """Tokenize one (title, label) pair for the classifier.

    Args:
        data: A two-element iterable ``(title, label)``.

    Returns:
        The tokenizer output for ``title`` (padded/truncated to 50 tokens)
        with the label stored under the ``"labels"`` key.
    """
    text, target = data

    features = tokenizer(
        text,
        truncation=True,
        max_length=50,
        padding="max_length",
    )
    # "labels" is the key name the model expects for the targets.
    features["labels"] = target
    return features

# Tokenize every (title, label id) pair with thread_map; the ids are
# recomputed from y_labels with topics.label2id.
encoding = thread_map(preprocess, zip(data["title"], topics.label2id(y_labels)))

In [None]:
from transformers import AutoModelForSequenceClassification

# Each example carries exactly one label id, so this is single-label
# classification. The previous value "label_classification" is not one of the
# problem types accepted by the transformers config ("regression",
# "single_label_classification", "multi_label_classification").
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="single_label_classification",
    num_labels=len(topics._id2label),
    id2label=topics._id2label,
    label2id=topics._label2id,
)
# model = AutoModelForSequenceClassification.from_pretrained(model_name,
#                                                            num_labels=len(topics._id2label))

In [None]:
# Keep only the part after the last "/" so the name can be used as a folder name.
model_name = model_name.rsplit("/", 1)[-1]

# Fine-tuning configuration; evaluation runs once per epoch.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    optim="adamw_torch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
)

In [None]:
import datasets

# Transform to the Dataset API
dataset = datasets.Dataset.from_list(encoding)
# 80/20 train/test split.
# NOTE(review): no seed is passed, so the split presumably differs between runs — confirm.
dataset = dataset.train_test_split(test_size=0.2)

In [None]:
from transformers import DataCollatorWithPadding

# Use a Collator to create batches for encodings.
# Every example has a single label for the whole sequence, so the generic
# padding collator is the right one; the token-classification collator
# expects one label per token and tries to pad the labels as sequences.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Get split datasets
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [None]:
# Expose only the model inputs and the labels, as torch tensors, on both splits.
for split in (train_dataset, test_dataset):
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute scalar evaluation metrics from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true label ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted id).

    Returns:
        A dict with ``accuracy``, ``f1``, ``precision`` and ``recall`` as
        plain floats.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Without `average`, scikit-learn returns one value per class (arrays),
    # which cannot be reported as scalar metrics. "weighted" averages the
    # per-class scores by support; zero_division=0 matches the
    # classification_report calls earlier in the notebook.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': float(acc),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall)
    }

In [None]:
# Wire the model, training configuration, both splits and the collator together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # compute_metrics is not passed (left commented out):
    # compute_metrics=compute_metrics
)

In [None]:
# Run the fine-tuning inside an MLflow run so it is recorded under the
# tracking URI configured at the top of the notebook.
with mlflow.start_run():
    trainer.train()