# Index
* [Import](#index)
* [Tokenization](#Tokenization)
* [CountVectorizer](#CountVectorizer)
* [PreprocessPipeline](#PreprocessPipeline)
* [OvsRest](#OvsRest)
    - [LogisticRegression](#LogisticRegression)
    - [SVC](#SVC)
* [Word2Vec](#Word2Vec)
* [BERT](#BERT)
* [USE](#USE)

In [1]:
import time
import json
import os

# from joblib import dump, load, Memory
import joblib
import dill
import weakref

# import warnings

import numpy as np
import pandas as pd
import gensim
import matplotlib.pyplot as plt

# from gensim import (
#     corpora,
# )  # https://pypi.org/project/gensim/ #https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html#sphx-glr-auto-examples-core-run-topics-and-transformations-py
from sklearn import (
    metrics,
    feature_extraction,
    linear_model,
    model_selection,
    preprocessing,
    cluster,
    decomposition,
    multiclass,
    svm,
    pipeline,
    exceptions,
    manifold,
)
from sklearn.utils._testing import ignore_warnings
from sklearn.base import BaseEstimator, TransformerMixin
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    TFAutoModel,
    AutoModel,
    BertTokenizer,
    TFBertModel,
    BertConfig,
)  # BertModel
import transformers
from tokenizers import BertWordPieceTokenizer


# import nlpk

2023-01-04 21:53:37.146605: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-04 21:53:37.147751: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-04 21:53:37.148427: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-04 21:53:37.150735: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

test https://scikit-learn.org/stable/modules/multiclass.html autre stategie que OneVsRest
https://keras.io/examples/nlp/text_extraction_with_bert/

In [2]:
pip install -q jupyter-black jupyter

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
# for black formating
import jupyter_black  # pip install jupyter-black jupyter

jupyter_black.load()
# ??jupyter_black.load #show config

In [4]:
df = pd.read_csv(
    "/kaggle/input/p5-nlp/stackoverflowCleaned.csv", index_col=0, low_memory=False
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118442 entries, 0 to 118441
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Id           118442 non-null  int64 
 1   Title        118442 non-null  object
 2   Body         118442 non-null  object
 3   0            118440 non-null  object
 4   1            118383 non-null  object
 5   2            118292 non-null  object
 6   3            118292 non-null  object
 7   4            76389 non-null   object
 8   5            4 non-null       object
 9   Title_clean  118389 non-null  object
 10  Code         88202 non-null   object
 11  Body_clean   118392 non-null  object
dtypes: int64(1), object(11)
memory usage: 11.7+ MB


In [5]:
LONG = False
description = {
    "title": "Keras Sequential Model with Word2Vec (kerasPipeline)",
    "CountVectorizer": {"min_df": 0.00001, "max_df": 0.995, "max_features": None},
    "StandardScaler": {"actif": False},
    "TruncatedSVD": {"actif": False, "n_components": 1_000},
    "LogisticRegression": {"actif": False},
    "TfidfOvRSVC": {"actif": False, "max_iter": 100},
    "kerasPipeline": {
        "actif": True,
        "TextVectorization__max_tokens": 210_000,  # 20_000,
        "TextVectorization__output_sequence_length": 100,  # 20
        "Embedding__output_dim": 500,
        "Word2Vec__min_count": 5,
        "Word2Vec__window": 25,
        "Word2Vec__epochs": 150,  # 100,
        "fit__epochs": 15,  # 15
    },
    "Word2Vec": {
        "actif": False,
        "min_count": 1,
        "size": 500,
        "window": 10,
        "epochs": 100,
        "maxlen": 50,
        "max_iter": 6_000,
    },
    "BERT": {
        "actif": False,
        "max_length": 150,
        "epochs": 5,
    },
}

In [6]:
# # ultra fast
# description["kerasPipeline"]["fit__epochs"] = 1
# description["kerasPipeline"]["TextVectorization__max_tokens"] = 100
# description["kerasPipeline"]["TextVectorization__output_sequence_length"] = 5
# description["kerasPipeline"]["Word2Vec__epochs"] = 1
# description["kerasPipeline"]["Embedding__output_dim"] = 10

In [7]:
class colors:
    PURPLE = "\033[95m"
    BLUE = "\033[94m"
    CYAN = "\033[96m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"
    ENDC = "\033[0m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

# Tokenization

In [8]:
from ipywidgets import interact


@interact(i=(0, df.shape[0] - 1))
def test_tr_print(i):
    print(
        f'Title:{df["Title"][i]}\n{"*"*30}\nTitle_clean:{df["Title_clean"][i]}\n{"*"*30}\nBody:{df["Body"][i]}\n{"*"*30}\nBody_clean:{df["Body_clean"][i]}\n{"*"*30}\nCode:{df["Code"][i]}\n{"*"*30}\nTags:{df.loc[i,"0"]} {df.loc[i,"1"]} {df.loc[i,"2"]} {df.loc[i,"3"]} {df.loc[i,"4"]}'
    )

interactive(children=(IntSlider(value=59220, description='i', max=118441), Output()), _dom_classes=('widget-in…

In [9]:
sum(df["Title_clean"].isna())

53

In [10]:
df[df["Title_clean"].isna()][
    ["Title", "Title_clean", "Body", "Body_clean", "0", "1", "2", "3", "4"]
].head(5)

Unnamed: 0,Title,Title_clean,Body,Body_clean,0,1,2,3,4
302,Is < faster than <=?,,<p>Is <code>if (a &lt; 901)</code> faster than...,example performance change loop code suppose g...,c++,c,performance,assembly,relational-operators
393,What is setup.py?,,<p>What is <code>setup.py</code> and how can i...,configure,python,pypi,setup.py,python-packaging,
1437,What is related_name used for?,,<p>What is the <code>related_name</code> argum...,argument manytomanyfield foreignkey field exam...,python,django,foreign-keys,many-to-many,
4062,"What is ""android:allowBackup""?",,"<p>Since the <a href=""http://tools.android.com...",adt preview version version lint warning tell ...,android,adt,compiler-warnings,android-lint,android-backup-service
4870,Why is 0 < -0x80000000?,,<p>I have below a simple program:</p>\n\n<pre>...,program condition if(bal int32_min work change...,c,signed,numeric-limits,numeric-conversion,


In [11]:
print(sum(df["Body_clean"].isna()))
df[df["Body_clean"].isna()][
    ["Title", "Title_clean", "Body", "Body_clean", "Code"]
].head(5)

50


Unnamed: 0,Title,Title_clean,Body,Body_clean,Code
5,What and where are the stack and heap?,stack heap,<ul>\n<li>What are the stack and heap?</li>\n<...,,
168,What is The Rule of Three?,rule,<ul>\n<li>What does <em>copying an object</em>...,,
828,Why are these constructs using pre and post-in...,construct pre behavior,<pre><code>#include &lt;stdio.h&gt;\n\nint mai...,,#include <stdio.h>\n\nint main(void)\n{\n in...
1207,How to use ADB Shell when Multiple Devices are...,use adb shell multiple device connect fail err...,<pre><code>$ adb --help\n</code></pre>\n<hr />...,,$ adb --help\n -s SERIAL use device with give...
1640,What are the advantages of list initialization...,advantage list initialization brace,<pre><code>MyClass a1 {a}; // clearer and ...,,MyClass a1 {a}; // clearer and less error-...


In [12]:
len(df[df["Body_clean"].isna() & df["Title_clean"].isna()])

0

In [13]:
sum((df["Title_clean"] + df["Body_clean"]).isna())

103

In [14]:
def tag_is_in(df: pd.DataFrame(), tag: str, nb_cols: int = 6):
    return eval("|".join(f'(df["{i}"] == "{tag}")' for i in range(0, nb_cols)))


def tags_are_in(df: pd.DataFrame(), tags: list):
    return pd.DataFrame({tag: tag_is_in(df, tag) for tag in tags})


def save_score(
    y_true, y_pred, target_names: list, name=None, zero_division=0
) -> pd.DataFrame():
    scores = pd.DataFrame(
        metrics.classification_report(
            y_true,
            y_pred,
            target_names=target_names,
            zero_division=zero_division,
            output_dict=True,
        )
    ).T
    if name is not None:
        scores.to_csv(f"/kaggle/working/{name}_score.csv")
    return scores


# tags_are_in(df, ["javascript","java"])
tags_are_in(
    df[10001:15050], ["javascript", "java", "c#", "python", "html", "git", "php"]
)

Unnamed: 0,javascript,java,c#,python,html,git,php
10001,True,False,False,False,True,False,False
10002,False,True,False,False,False,False,False
10003,False,False,True,False,False,False,False
10004,False,False,False,False,True,False,False
10005,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
15045,False,False,False,False,False,False,False
15046,False,False,False,False,False,False,False
15047,True,False,False,False,False,False,False
15048,False,False,False,False,False,False,False


In [15]:
tags = df[["0", "1", "2", "3", "4", "5"]].stack()
target_names = [
    tag
    for tag, freq in tags.reset_index(drop=True).value_counts(normalize=True).items()
    if freq > 0.002
]
if not LONG:
    target_names = target_names[0:20] + ["git"]
target_names

['java',
 'c#',
 'javascript',
 'python',
 'android',
 'c++',
 'ios',
 'html',
 'php',
 '.net',
 'jquery',
 'css',
 'objective-c',
 'c',
 'sql',
 'iphone',
 'asp.net',
 'mysql',
 'linux',
 'node.js',
 'git']

In [16]:
# Enregistrement des paramétres avec la liste des tags a predire
description["target_names"] = target_names
with open("/kaggle/working/description.json", "w") as outfile:
    json.dump(description, outfile)

## Separation train/test

In [17]:
df["Token"] = df["Title_clean"].fillna("") + df["Body_clean"].fillna("")
print(
    f"{len(df[[len(token.split()) > 1 for token in df['Token']]])/len(df):.5%} des questions ont plus d'un tokens"
    if len(df[df["Token"] == ""]) == 0
    else f"{len(df[df['Token'] == ''])} questions n'ont pas de token"
)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df["Token"], tags_are_in(df, target_names), random_state=42
)

99.99409% des questions ont plus d'un tokens


## Approche naïve pour comparer les résultat:
    - Valeur la plus fréquente (False)
    - Si le tag est présent dans la question

In [18]:
save_score(
    y_true=y_test,
    y_pred=y_test.replace([True], False),
    target_names=target_names,
    zero_division=1,  # because TruePositif=0
)

Unnamed: 0,precision,recall,f1-score,support
java,1.0,0.0,0.0,3558.0
c#,1.0,0.0,0.0,3473.0
javascript,1.0,0.0,0.0,3356.0
python,1.0,0.0,0.0,2612.0
android,1.0,0.0,0.0,2045.0
c++,1.0,0.0,0.0,1964.0
ios,1.0,0.0,0.0,1840.0
html,1.0,0.0,0.0,1755.0
php,1.0,0.0,0.0,1627.0
.net,1.0,0.0,0.0,1569.0


Avec prédire toujours vrai

In [19]:
save_score(
    y_true=y_test, y_pred=y_test.replace([False], True), target_names=target_names
)

Unnamed: 0,precision,recall,f1-score,support
java,0.120158,1.0,0.214538,3558.0
c#,0.117287,1.0,0.20995,3473.0
javascript,0.113336,1.0,0.203598,3356.0
python,0.08821,1.0,0.16212,2612.0
android,0.069062,1.0,0.129201,2045.0
c++,0.066327,1.0,0.124402,1964.0
ios,0.062139,1.0,0.117007,1840.0
html,0.059269,1.0,0.111905,1755.0
php,0.054946,1.0,0.104168,1627.0
.net,0.052987,1.0,0.100641,1569.0


### Methode naive le tag est dans la question:

In [20]:
test_str = X_test.str
pred_is_in = pd.DataFrame(
    {
        target_name: test_str.contains(
            f" {target_name} ", case=False, regex=False
        ).values
        for target_name in target_names
    }
)
pred_is_in.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29601,29602,29603,29604,29605,29606,29607,29608,29609,29610
java,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
c#,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
javascript,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
python,True,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
android,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
c++,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ios,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
html,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
php,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
.net,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [21]:
score_naive_is_in = save_score(y_test, pred_is_in, target_names, "naive_is_in")
score_naive_is_in.sort_values(by="f1-score", ascending=False)

Unnamed: 0,precision,recall,f1-score,support
git,0.716846,0.729927,0.723327,274.0
mysql,0.723949,0.578102,0.642857,685.0
python,0.878273,0.475115,0.616646,2612.0
php,0.827292,0.476951,0.605068,1627.0
android,0.851163,0.447433,0.586538,2045.0
jquery,0.796804,0.453836,0.578293,1538.0
css,0.71467,0.469496,0.566702,1131.0
c++,0.805702,0.388493,0.524218,1964.0
java,0.766089,0.347948,0.478547,3558.0
linux,0.458265,0.411765,0.433772,680.0


In [22]:
for column in y_train.columns:
    t_train = y_train[column].value_counts(normalize=True)
    t_test = y_test[column].value_counts(normalize=True)
    print(f"{colors.BOLD}{column}{colors.ENDC}")
    print(
        f"\t Sur entrainement {colors.GREEN}{t_train[True]:.2%}{colors.ENDC} positif {colors.RED}{t_train[False]:.2%}{colors.ENDC} negatif"
    )
    print(
        f"\t Sur test {colors.GREEN}{t_test[True]:.2%}{colors.ENDC} positif {colors.RED}{t_test[False]:.2%}{colors.ENDC} negatif"
    )
    print("\n")

[1mjava[0m
	 Sur entrainement [92m12.05%[0m positif [91m87.95%[0m negatif
	 Sur test [92m12.02%[0m positif [91m87.98%[0m negatif


[1mc#[0m
	 Sur entrainement [92m11.80%[0m positif [91m88.20%[0m negatif
	 Sur test [92m11.73%[0m positif [91m88.27%[0m negatif


[1mjavascript[0m
	 Sur entrainement [92m11.77%[0m positif [91m88.23%[0m negatif
	 Sur test [92m11.33%[0m positif [91m88.67%[0m negatif


[1mpython[0m
	 Sur entrainement [92m8.61%[0m positif [91m91.39%[0m negatif
	 Sur test [92m8.82%[0m positif [91m91.18%[0m negatif


[1mandroid[0m
	 Sur entrainement [92m7.00%[0m positif [91m93.00%[0m negatif
	 Sur test [92m6.91%[0m positif [91m93.09%[0m negatif


[1mc++[0m
	 Sur entrainement [92m6.63%[0m positif [91m93.37%[0m negatif
	 Sur test [92m6.63%[0m positif [91m93.37%[0m negatif


[1mios[0m
	 Sur entrainement [92m6.25%[0m positif [91m93.75%[0m negatif
	 Sur test [92m6.21%[0m positif [91m93.79%[0m negatif


[1mhtml[0m
	

# CountVectorizer

In [23]:
best_min_df = description["CountVectorizer"]["min_df"]
best_max_df = description["CountVectorizer"]["max_df"]
for min_df in [0.01, 0.05, 0.005, 0.0005, 0.00001] if LONG else [best_min_df]:
    for max_df in [0.95, 0.995, 0.9995] if LONG else [best_max_df]:
        print(f"\n{'*'*30}\n pour min_df:{min_df}; max_df:{max_df}")
        count_vectorizer = feature_extraction.text.CountVectorizer(
            min_df=min_df, max_df=max_df, max_features=None
        )
        #         pipe = pipeline.make_pipeline(feature_extraction.text.CountVectorizer(min_df=min_df, max_df=max_df, max_features=None), feature_extraction.text.TfidfTransformer( norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False))
        #         feat_train = pipe.transform(X_train["Token"])
        #         print(feat_train.shape)

        token_vectors = count_vectorizer.fit_transform(df["Token"])
        print(
            f"\ttokens unique:\n\t\t{token_vectors.shape[1]} dans title et body regroupé"
        )


******************************
 pour min_df:1e-05; max_df:0.995
	tokens unique:
		53999 dans title et body regroupé


### Test reduction dimension
avec n_components=1_000 : 10% de la variance éxpliqué


In [24]:
%%time
if LONG:
    truncSvd = decomposition.TruncatedSVD(n_components=1000).fit(
        preprocessing.StandardScaler(with_mean=False).fit_transform(token_vectors)
    )
    plt.figure(figsize=(30, 15))
    scree = truncSvd.explained_variance_ratio_ * 100
    plt.bar(np.arange(len(scree)) + 1, scree)
    plt.plot(np.arange(len(scree)) + 1, scree.cumsum(), c="red", marker="o")
    plt.xlabel("rang de l'axe d'inertie")
    plt.ylabel("pourcentage variance expliquée")
    plt.title("Eboulis des valeurs propres")
    plt.show()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


# PreprocessPipeline

In [25]:
pipeline_tfidf = pipeline.Pipeline(
    [
        (
            "count",
            feature_extraction.text.CountVectorizer(
                min_df=best_min_df, max_df=best_max_df, max_features=None
            ),
        ),
        (
            "tfidf",
            feature_extraction.text.TfidfTransformer(
                norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False
            ),
        ),
        (
            "scale",
            preprocessing.StandardScaler(with_mean=False)
            if description["StandardScaler"]["actif"]
            else "passthrough",
        ),
        (
            "dimReduction",
            decomposition.TruncatedSVD(
                n_components=description["TruncatedSVD"]["n_components"]
            )
            if description["TruncatedSVD"]["actif"]
            else "passthrough",
        ),
    ]
)
## @TODO cache
# memory = joblib.Memory(location="/kaggle/working/cache_dir", verbose=0)
# pipeline_tfidf = memory.cache(pipeline_tfidf)

# OvsRest

## LogisticRegression

In [26]:
%%time
if description["LogisticRegression"]["actif"]:
    pipeline_logisticReg = pipeline.Pipeline(
        [
            ("tfidf", pipeline_tfidf),
            (
                "OvRLogisticReg",
                multiclass.OneVsRestClassifier(
                    linear_model.LogisticRegression(random_state=42)
                ),
            ),
        ]
    )
    with ignore_warnings(category=exceptions.ConvergenceWarning):
        pipeline_logisticReg.fit(X_train, y_train)

    # prediction sur jeu de test
    pred_lr = pipeline_logisticReg.predict(X_test)
    scores_lr = save_score(y_test, pred_lr, target_names, "LogisticRegression")

    # enregistrement model et score
    joblib.dump(pipeline_logisticReg, "LogisticRegression_model.joblib")
    scores_lr.sort_values(by="support", ascending=False)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.87 µs


In [27]:
if description["LogisticRegression"]["actif"]:
    print(scores_lr.sort_values(by="precision", ascending=False))

In [28]:
if description["LogisticRegression"]["actif"]:
    print(scores_lr.sort_values(by="recall", ascending=False))

## SVC

In [29]:
pipeline_svc = pipeline.Pipeline(
    [
        ("tfidf", pipeline_tfidf),
        (
            "TfidfOvRSVC",
            multiclass.OneVsRestClassifier(
                svm.SVC(
                    cache_size=1080,
                    max_iter=description["TfidfOvRSVC"]["max_iter"],
                    random_state=42,
                )
            ),
        ),
    ]
)

In [30]:
%%time
if description["TfidfOvRSVC"]["actif"]:
    with ignore_warnings(category=exceptions.ConvergenceWarning):
        pipeline_svc.fit(
            X_train,
            y_train,
        )

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


In [31]:
%%time
if description["TfidfOvRSVC"]["actif"]:
    pred_svc = pipeline_svc.predict(X_test)
    print(pred_svc)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs


### Score et enregistrement

In [32]:
if description["TfidfOvRSVC"]["actif"]:
    scores_svc = save_score(y_test, pred_svc, target_names, "TfidfOvRSVC")
    joblib.dump(pipeline_svc, "TfidfOvRestSvc_model.joblib")
    print(scores_svc.sort_values(by="precision", ascending=False))

# Word2Vec

## Keras Model

In [33]:
print(description["kerasPipeline"])

{'actif': True, 'TextVectorization__max_tokens': 210000, 'TextVectorization__output_sequence_length': 100, 'Embedding__output_dim': 500, 'Word2Vec__min_count': 5, 'Word2Vec__window': 25, 'Word2Vec__epochs': 150, 'fit__epochs': 15}


In [34]:
## https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization
# because https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer deprecaded
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=description["kerasPipeline"]["TextVectorization__max_tokens"],
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=description["kerasPipeline"][
        "TextVectorization__output_sequence_length"
    ],
)

In [35]:
%%time
if description["kerasPipeline"]["actif"]:
    vectorize_layer.adapt(X_train)

2023-01-04 21:54:11.602600: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


CPU times: user 6.79 s, sys: 454 ms, total: 7.24 s
Wall time: 5.56 s


In [36]:
def build_Word2Vec(X_train, params):
    print("Build & train Word2Vec model ...")
    X_train_token = X_train.str.split()
    w2v_model = gensim.models.Word2Vec(
        min_count=params["Word2Vec__min_count"],
        window=params["Word2Vec__window"],
        vector_size=params["Embedding__output_dim"],
        seed=42,
        workers=1,
    )
    w2v_model.build_vocab(X_train_token)
    w2v_model.train(
        X_train_token,
        total_examples=w2v_model.corpus_count,
        epochs=params["Word2Vec__epochs"],
    )
    model_vectors = w2v_model.wv
    print("Vocabulary size: %i" % len(model_vectors.index_to_key))
    print("Word2Vec trained")
    return model_vectors

In [37]:
%%time
if description["kerasPipeline"]["actif"]:
    model_vectors = build_Word2Vec(X_train, description["kerasPipeline"])
    print(
        f"{len(vectorize_layer.get_vocabulary())} {len(model_vectors.index_to_key)} {len(vectorize_layer.get_vocabulary())/len(model_vectors.index_to_key):.2%}"
    )
# embedding_matrix, vocab_size = create_embeding(
#    vectorize_layer.get_vocabulary(), build_Word2Vec(X_train, description["kerasPipeline"]), params=description["kerasPipeline"]
# )

Build & train Word2Vec model ...
Vocabulary size: 21381
Word2Vec trained
199578 21381 933.44%
CPU times: user 41min 30s, sys: 8.19 s, total: 41min 39s
Wall time: 41min 44s


In [38]:
%%time
if description["kerasPipeline"]["actif"]:
    print("Create Embedding matrix ...")
    embedding_matrix = np.asarray(
        [
            model_vectors[word]
            if word in model_vectors.index_to_key
            else np.zeros(description["kerasPipeline"]["Embedding__output_dim"])
            for word in vectorize_layer.get_vocabulary()
        ]
    )
    embedding_matrix.shape

Create Embedding matrix ...
CPU times: user 1min 57s, sys: 2.18 s, total: 2min
Wall time: 2min


In [39]:
# https://www.tensorflow.org/text/guide/word_embeddings
if description["kerasPipeline"]["actif"]:
    modelkerasPipeline = tf.keras.models.Sequential(
        [
            vectorize_layer,
            tf.keras.layers.Embedding(
                input_dim=len(vectorize_layer.get_vocabulary()),
                output_dim=description["kerasPipeline"]["Embedding__output_dim"],
                weights=[embedding_matrix],
                input_length=description["kerasPipeline"][
                    "TextVectorization__output_sequence_length"
                ],
            ),
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(16, activation="relu"),
            tf.keras.layers.Dense(21),
        ]
    )
    # tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs") # for log create a dir
    modelkerasPipeline.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

In [40]:
%%time
if description["kerasPipeline"]["actif"]:
    modelkerasPipeline.fit(
        X_train,
        y_train,
        epochs=description["kerasPipeline"]["fit__epochs"],
        # callbacks=[tensorboard_callback],
    )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
CPU times: user 14min 27s, sys: 37.2 s, total: 15min 4s
Wall time: 19min 22s


In [41]:
%%time
if description["kerasPipeline"]["actif"]:
    pred_kerasword2vec_proba = modelkerasPipeline.predict(X_test)
    #     print(pred_kerasword2vec_proba)
    pred_kerasword2vec = pd.DataFrame(
        [
            {tag: pred_tag > 0 for pred_tag, tag in zip(pred, target_names)}
            for pred in pred_kerasword2vec_proba
        ]
    )
    print(pred_kerasword2vec.value_counts())

java   c#     javascript  python  android  c++    ios    html   php    .net   jquery  css    objective-c  c      sql    iphone  asp.net  mysql  linux  node.js  git  
False  False  False       False   False    False  False  False  False  False  False   False  False        False  False  False   False    False  False  False    False    9018
True   False  False       False   False    False  False  False  False  False  False   False  False        False  False  False   False    False  False  False    False    2457
False  False  False       True    False    False  False  False  False  False  False   False  False        False  False  False   False    False  False  False    False    2006
       True   False       False   False    False  False  False  False  False  False   False  False        False  False  False   False    False  False  False    False    1815
       False  False       False   True     False  False  False  False  False  False   False  False        False  False  False   False    F

In [42]:
if description["kerasPipeline"]["actif"]:
    scores_kerasword2vec = save_score(
        y_test,
        pred_kerasword2vec,
        target_names,
        "kerasPipeline",
    )
    print(scores_kerasword2vec.sort_values(by="f1-score", ascending=False))
    modelkerasPipeline.save("kerasPipeline")

              precision    recall  f1-score  support
android        0.846193  0.820538  0.833168   2045.0
git            0.888889  0.759124  0.818898    274.0
python         0.831476  0.714012  0.768280   2612.0
php            0.766520  0.641672  0.698561   1627.0
java           0.719683  0.663856  0.690643   3558.0
ios            0.720049  0.640217  0.677791   1840.0
css            0.738720  0.622458  0.675624   1131.0
jquery         0.721963  0.602731  0.656981   1538.0
javascript     0.681973  0.597437  0.636912   3356.0
node.js        0.777542  0.538123  0.636049    682.0
c++            0.676843  0.598269  0.635135   1964.0
micro avg      0.681115  0.587762  0.631004  33044.0
mysql          0.724335  0.556204  0.629232    685.0
weighted avg   0.676866  0.587762  0.627584  33044.0
macro avg      0.662632  0.557644  0.603700  33044.0
c#             0.633344  0.576447  0.603557   3473.0
html           0.552574  0.574929  0.563530   1755.0
sql            0.565272  0.512644  0.537673   

2023-01-04 22:57:37.024652: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


## Word2Vec puis OvR SVC

In [43]:
from kerasembedtransformerclass import KerasEmbedTransformer

# class KerasEmbedTransformer(BaseEstimator, TransformerMixin):
#     def init(self, params):
#         self.params = params
#         return self

#     def fit(self, X, y=None):
#         self.embed_model, self.tokenizer = self.create_keras_model(X)
#         return self

#     def transform(self, X, y=None):
#         x_sentences = keras.preprocessing.sequence.pad_sequences(
#             self.tokenizer.texts_to_sequences(X),
#             maxlen=self.params["maxlen"],
#             padding="post",
#         )
#         embeddings = self.embed_model.predict(x_sentences)
#         print("embedings shape ", embeddings.shape)
#         return embeddings

#     def save(self, filename):
#         joblib.dump(self.tokenizer, f"{filename}.tokenizer")
#         self.embed_model.save(
#             f"{filename}.model"
#         )  # This hack allows us to save the sklearn pipeline
#         self.embed_model = None
#         return self

#     def load(self, filename, params):
#         self.tokenizer = joblib.load(f"{filename}.tokenizer")
#         self.embed_model = keras.models.load_model(f"{filename}.model")
#         self.init(params)
#         return self

#     def create_embeding(self, word_index, model_vectors):
#         vocab_size = len(word_index) + 1
#         print(f"Number of unique words: {vocab_size}")
#         print("Create Embedding matrix ...")
#         embedding_matrix = np.zeros((vocab_size, self.params["size"]))
#         i = 0
#         j = 0

#         for word, idx in word_index.items():
#             i += 1
#             if word in model_vectors.index_to_key:
#                 j += 1
#                 embedding_vector = model_vectors[word]
#                 if embedding_vector is not None:
#                     embedding_matrix[idx] = model_vectors[word]

#         word_rate = np.round(j / i, 4)
#         print("Word embedding rate : ", word_rate)
#         print("Embedding matrix: %s" % str(embedding_matrix.shape))
#         return (embedding_matrix, vocab_size)

#     def create_keras_model(self, X_train):
#         print("Build & train Word2Vec model ...")
#         X_train_token = X_train.str.split()
#         w2v_model = gensim.models.Word2Vec(
#             min_count=self.params["min_count"],
#             window=self.params["window"],
#             vector_size=self.params["size"],
#             seed=42,
#             workers=1,
#         )
#         w2v_model.build_vocab(X_train_token)
#         w2v_model.train(
#             X_train_token,
#             total_examples=w2v_model.corpus_count,
#             epochs=self.params["epochs"],
#         )
#         model_vectors = w2v_model.wv
#         print("Vocabulary size: %i" % len(model_vectors.index_to_key))
#         print("Word2Vec trained")

#         tokenizer = keras.preprocessing.text.Tokenizer()
#         tokenizer.fit_on_texts(X_train_token)
#         embedding_matrix, vocab_size = self.create_embeding(
#             tokenizer.word_index, model_vectors
#         )

#         word_input = keras.layers.Input(shape=(self.params["maxlen"],), dtype="float64")
#         word_embedding = keras.layers.Embedding(
#             input_dim=vocab_size,
#             output_dim=self.params["size"],
#             weights=[embedding_matrix],
#             input_length=self.params["maxlen"],
#         )(word_input)
#         word_vec = keras.layers.GlobalAveragePooling1D()(word_embedding)
#         embed_model = keras.models.Model([word_input], word_vec)
#         print(embed_model.summary())

#         return (embed_model, tokenizer)

In [44]:
%%time
# KerasEmbed = KerasEmbedTransformer().init(description["Word2Vec"])
pipeline_w2v = pipeline.Pipeline(
    [
        (
            "keras_embed_transformer",
            KerasEmbedTransformer().init(description["Word2Vec"]),
        ),
        (
            "Word2Vec",
            multiclass.OneVsRestClassifier(
                svm.SVC(
                    cache_size=1080,
                    max_iter=description["Word2Vec"]["max_iter"],
                    random_state=42,
                )
            ),
        ),
    ]
)

if description["Word2Vec"]["actif"]:
    with ignore_warnings(category=exceptions.ConvergenceWarning):
        pipeline_w2v.fit(X_train, y_train)
#         pipeline_w2v.fit(X_train[0:10], y_train[0:10])

CPU times: user 121 µs, sys: 0 ns, total: 121 µs
Wall time: 124 µs


In [45]:
%%time
if description["Word2Vec"]["actif"]:
    pred_word2vec = pipeline_w2v.predict(X_test)
    print(pred_word2vec)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.7 µs


### i got the same issue as:
https://stackoverflow.com/questions/37984304/how-to-save-a-scikit-learn-pipline-with-keras-regressor-inside-to-disk  
Solution is to save keras model in separate file than remove it from the pipeline so it can be save with joblib  
I did that in my `KerasEmbedTransformer` class `save` & `load` method

In [46]:
if description["Word2Vec"]["actif"]:
    pipeline_w2v.named_steps["keras_embed_transformer"].save("keras")
    # Otherwise i can save it but not load in api
    #     pipeline_w2v.named_steps["keras_embed_transformer"] = "passthrough"  # __module__
    #     pipeline_w2v.named_steps["keras_embed_transformer"].__module__ = None
    ## i have issue loading the joblib in api so i try to delete the step from the pipeline
    pipeline_w2v.steps.pop(0)
    pipeline_w2v.steps.insert(0, ("keras_embed_transformer", "passthrough"))
    print(pipeline_w2v)

In [47]:
if description["Word2Vec"]["actif"]:
    scores_word2vec = save_score(y_test, pred_word2vec, target_names, "Word2Vec")
    joblib.dump(pipeline_w2v, "Word2Vec_model.joblib")
    print(scores_word2vec.sort_values(by="precision", ascending=False))

### test to reimport saved model

In [48]:
if description["Word2Vec"]["actif"] and LONG:
    loaded = joblib.load("/kaggle/working/Word2Vec_model.joblib")
    loaded.named_steps["keras_embed_transformer"] = KerasEmbedTransformer().load(
        "/kaggle/working/keras", description["Word2Vec"]
    )
    pred_word2vec == loaded.predict(X_test)

# BERT

In [49]:
description["BERT"]

{'actif': False, 'max_length': 150, 'epochs': 5}

In [50]:
# import torch
# torch.cuda.is_available()

In [51]:
configuration = BertConfig()
configuration

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [52]:
if description["BERT"]["actif"]:
    slow_tokenizer = BertTokenizer.from_pretrained(
        "bert-base-uncased", model_max_length=description["BERT"]["max_length"]
    )
    save_path = "bert_base_uncased/"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    slow_tokenizer.save_pretrained(save_path)
    # from https://keras.io/examples/nlp/text_extraction_with_bert/
    # Load the fast tokenizer from saved file
    tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

In [53]:
def create_bert_input_target(X, params):
    x_encoded = tokenizer.encode(X)
    x_encoded.truncate(params["max_length"])
    x_encoded.pad(params["max_length"])
    #     print(len(x_encoded.ids))
    #     print(len(x_encoded.attention_mask))
    #     params["max_length"]
    #     return np.array(
    #         [
    #             np.array((encoded_id, attention_mask))
    #             for encoded_id, attention_mask in zip(
    #                 x_encoded.ids, x_encoded.attention_mask
    #             )
    #         ]
    #     )
    #     return {"ids": x_encoded.ids, "attention_mask": x_encoded.attention_mask}
    #     return np.array(x_encoded.ids), np.array(x_encoded.attention_mask)
    return np.array(x_encoded.ids)

In [54]:
def create_bert_model(params):
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    inputs = tf.keras.layers.Input(
        shape=(params["max_length"],),
        dtype=tf.int32,
    )
    embedding = encoder(inputs)[0]

    #     layerDense = tf.keras.layers.Dense(
    #         len(target_names), name="start_logit", use_bias=False
    #     )(embedding)
    #     layerDense = tf.keras.layers.Flatten()(layerDense)
    #     end_layer = tf.keras.layers.Activation(tf.keras.activations.softmax)(layerDense)

    #     layerFlat = tf.keras.layers.Flatten()(embedding)
    layerGAvg = tf.keras.layers.GlobalAveragePooling1D()(embedding)
    #     layerDense = tf.keras.layers.Dense(16 * len(target_names), activation="relu")(
    #         layerGAvg
    #     )
    #     layerEnd = tf.keras.layers.Dense(
    #         len(target_names)  # , name="start_logit", use_bias=False
    #     )(layerDense)
    layerActivation = tf.keras.layers.Activation(tf.keras.activations.softmax)(layerGAvg)
    layerEnd = tf.keras.layers.Dense(len(target_names), activation="relu")(layerGAvg)
    model = tf.keras.Model(
        inputs=[inputs],
        outputs=[layerEnd],
    )
    #     loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    #     optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    #     model.compile(optimizer=optimizer, loss=[loss, loss])
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        # https://keras.io/api/metrics/classification_metrics/#auc-class
        #         metrics=tf.keras.metrics.AUC(
        #             num_thresholds=200,
        #             curve="PR",  # "ROC"default
        #             summation_method="interpolation",
        #             name=None,
        #             dtype=None,
        #             thresholds=None,
        #             multi_label=True,
        #             num_labels=len(target_names),
        #             label_weights=None,
        #             from_logits=True,
        #         ),
        metrics=["accuracy"],
    )
    return model

In [55]:
if description["BERT"]["actif"]:
    bert_model = create_bert_model(params=description["BERT"])
    print(bert_model.summary())

In [56]:
# create_bert_input_target(X_train.values[1], params=description["BERT"])

In [57]:
# %%time
# X_train_bert = X_train[0:100].apply(
#     lambda x: create_bert_input_target(x, description["BERT"])
# )
# X_train_bert.values
# # CPU times: user 19 s, sys: 125 ms, total: 19.1 s
# # Wall time: 19.1 s

In [58]:
# %%time
# test = np.array(
#     [create_bert_input_target(x, description["BERT"]) for x in X_train[0:100].values]
# )
# test
# # CPU times: user 19.2 s, sys: 149 ms, total: 19.4 s
# # Wall time: 19.4 s

In [59]:
%%time
if description["BERT"]["actif"]:
    bert_model.fit(
        np.array(
            [create_bert_input_target(x, description["BERT"]) for x in X_train.values]
        ),
        #         X_train_bert,
        y_train,
        epochs=description["BERT"]["epochs"],
        verbose=2,
    )

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


In [60]:
def convert_pred_to_bool(preds):
    return pd.DataFrame(
        [
            {tag: pred_tag > 0 for pred_tag, tag in zip(pred, target_names)}
            for pred in preds
        ]
    )

In [61]:
%%time
if description["BERT"]["actif"]:
    bert_model.save("BERT")
    pred_bert_proba = bert_model.predict(
        np.array(
            [create_bert_input_target(x, description["BERT"]) for x in X_test.values]
        )
    )
    pred_bert = convert_pred_to_bool(pred_bert_proba)
    print(pred_bert.value_counts())
    scores_bert = save_score(
        y_test,
        pred_bert,
        target_names,
        "BERT",
    )
    print(scores_bert.sort_values(by="f1-score", ascending=False))

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.11 µs


In [62]:
# kerasBertPipeline = tf.keras.models.Sequential()
# kerasBertPipeline(tokenizer, bert_model)

In [63]:
# kerasBertPipeline.compile(
#     optimizer="adam",
#     loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#     metrics=["accuracy"],
# )
# kerasBertPipeline.summary()

In [64]:
# %%time
# kerasBertPipeline.compile(
#     optimizer="adam",
#     loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#     metrics=["accuracy"],
# )
# kerasBertPipeline.fit( X_train,
#     y_train,
#     epochs=1,
#     verbose=2,)

In [65]:
def BertTransformer(sentences: list, params):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")  # ou TFAutoModel
    ## (input_ids,attention_mask,token_type_ids)
    encoded_input = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=params["max_length"],
    )
    #     model.predict(
    #         [
    #             encoded_input["input_ids"],
    #             encoded_input["attention_mask"],
    #             encoded_input["token_type_ids"],
    #         ], batch_size=params["batch_size"]
    #     )

    #     return model
    b_size = params["batch_size"]
    output = [
        ## (last_hidden_state,pooler_output)
        model(
            encoded_input["input_ids"][step : step + b_size],
            attention_mask=encoded_input["attention_mask"][step : step + b_size],
        )
        .last_hidden_state.detach()
        .numpy()
        for step in range(0, len(sentences), b_size)
    ]
    print(model.summary())
    return np.concatenate(output)


#     for step in range(len(sentences)//params["batch_size"]):
#         idx = step*params["batch_size"]
#         encoded_input_batch = encoded_input[idx:idx+params["batch_size"]]
#         output = model(
#             encoded_input_batch["input_ids"], attention_mask=encoded_input_batch["attention_mask"]
#         )
#         print(output["last_hidden_state"].shape)

In [66]:
# encoded_input

In [67]:
# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
# with tpu_strategy.scope():
# ...

In [68]:
# %%time
# model = BertTransformer(list(X_train[0:5]), params=description["BERT"])
# model

In [69]:
# %%time
# model = BertTransformer(list(X_train[0:50_000]), params=description["BERT"])
# # model = BertTransformer(list(X_train), params=description["BERT"])
# model
# # model.mean(axis=1)

# LDA 

# USE