# 0201-02 - NLP Embedding - Solution Notebook

* Written by Alexandre Gazagnes
* Last update: 2024-02-01

## About 

Context : 

Let's get the party started ! 

Data  : 

**You can find the dataset [here](https://gist.githubusercontent.com/AlexandreGazagnes/cabe445634a092d308d17a883a305a75/raw/9f785f0f02739ac6352e1d583323771d55270221/nlp.csv).**

## Preliminaries

### System

These commands will display the system information:

Uncomment these lines if needed.

In [None]:
# pwd

In [None]:
# cd ..

In [None]:
# ls

In [None]:
# cd ..

In [None]:
# ls

Install various libraries :

In [None]:
# !pip install -r requirements.txt >> pip.log
# !pip freeze >> pip.freeze

### Import 

In [None]:
import os, sys, warnings
import pickle
from IPython.display import display

In [None]:
import pandas as pd
import numpy as np

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# import plotly.express as px

In [None]:
from sklearn.base import *
from sklearn.preprocessing import *
from sklearn.impute import *
from sklearn.model_selection import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.model_selection import *
from sklearn.pipeline import *
from sklearn.feature_extraction import *
from sklearn.dummy import *
from sklearn.feature_extraction.text import *

# from lightgbm import *
# from xgboost import *

from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.neighbors import *

In [None]:
import nltk
# import wordcloud

from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import wordpunct_tokenize

import string

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# !pip install gensim

import gensim

from gensim.models import KeyedVectors
from gensim.downloader import load

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_string

In [None]:
# import transformers

In [None]:
from openai import OpenAI
import requests

### Graphs and Settings

In [None]:
# sns.set()

In [None]:
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action="once")

In [None]:
DISPLAY = True

### Third-Party Tools

We need some Third parties : 

In [None]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("words")

Some string assets : 

In [None]:
stop_words = set(stopwords.words("english"))
punctuation = set(string.punctuation)
word_dict = words.words()

We need to download spacy : 

In [None]:
# !python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_lg

Word2vec :

In [None]:
w2c = load("word2vec-google-news-300")

And to load spacy model :

In [None]:
# nlp = spacy.load("en_core_web_sm")

nlp = spacy.load("en_core_web_md")

### Data

url of the dataset :

In [None]:
url = "https://gist.githubusercontent.com/AlexandreGazagnes/cabe445634a092d308d17a883a305a75/raw/d2014e8a34bba3c1be3ec8936bb338fb42888f24/nlp.csv"

Download the dataset : 

In [None]:
df = pd.read_csv(url)
df.head(5)

Keep a copy of the df : 

In [None]:
DF = df.copy()

## King - Man + Woman

### With Spacy

Tokenize 'King' : 

In [None]:
king = nlp("king")
king

In [None]:
type(king)

Extract the vector : 

In [None]:
king_v = king.vector
king_v

Length ?

In [None]:
len(king.vector)

Same for Man : 

In [None]:
man = nlp("man")
man_v = man.vector
man_v

In [None]:
len(man_v)

Same for woman :

In [None]:
woman = nlp("woman")
woman_v = woman.vector
woman_v

Fancy calculation ! 

In [None]:
res = king_v - man_v + woman_v
res

Length ?

In [None]:
len(res)

Reshape new vector : 

In [None]:
res = res.reshape(1, -1)
res

Compute Similarity : 

In [None]:
vectors = nlp.vocab.vectors.most_similar(res, n=20)
vectors

v1 is : 

In [None]:
v1 = vectors[0][0][0]
v1

vect is :

In [None]:
vect = nlp.vocab[v1]
vect

text is :

In [None]:
vect.text

In [None]:
v2 = vectors[0][0][1]
vect = nlp.vocab[v2]
vect.text

In [None]:
v3 = vectors[0][0][2]
vect = nlp.vocab[v3]
vect.text

Whoooo .... not so good ! 

Let's do the same with a larger model :

In [None]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_trf

In [None]:
nlp = spacy.load("en_core_web_lg")

Good ? ...

Just re-run the previous cells with this model loaded.

What are your conclusions ?

Let's try another last trick : 

In [None]:
doc = "He is one of the most famous kings:  Richard III was the last king of England to die in battle"
doc = nlp(doc)
king = doc[-7]
king

In [None]:
king_v = king.vector

In [None]:
doc = "Fifteen months after the death of King George VI, his daughter Elizabeth is crowned Queen of England"
doc = nlp(doc)
queen = doc[-3]
queen

In [None]:
queen_v = queen.vector

In [None]:
doc = "a female, it's a woman, or a lady, a human of female sex."
doc = nlp(doc)
woman = doc[6]
woman

In [None]:
woman_v = woman.vector

In [None]:
king_v = king.vector

In [None]:
doc = "a boy, a guy, or a man, it's a human being of male sex."
doc = nlp(doc)
man = doc[8]
man

In [None]:
man_v = man.vector

In [None]:
out = nlp.vocab.vectors.most_similar(queen_v.reshape(1, -1), n=20)
out

In [None]:
for t_id in out[0][0]:
    print(nlp.vocab[t_id].text)

### With Word2Vec

Let's do the same with a pretrained Word2Vec model :

In [None]:
result = w2c.most_similar(positive=["woman", "king"], negative=["man"], topn=10)
result

## Using Gensim

### Prepare Data

Create y vector : 

In [None]:
y = df.cat_1
y

Create X : 

In [None]:
X = df.description

Cross validation : 

In [None]:
def cv(n_splits=5, test_size=0.25, random_state=None):
    """Build a fresh stratified shuffle-split cross-validator.

    Parameters
    ----------
    n_splits : int, default=5
        Number of re-shuffling / splitting iterations.
    test_size : float or int, default=0.25
        Size of the test part of each split, forwarded as is to the splitter.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the splitter. Leave to None to keep the original
        (non-reproducible) behaviour; pass an int to get identical splits
        from one run to the next.

    Returns
    -------
    StratifiedShuffleSplit
        A new, independent splitter on every call.
    """
    return StratifiedShuffleSplit(
        n_splits=n_splits,
        test_size=test_size,
        random_state=random_state,
    )


cv()

### By Hand

Our documents : 

In [None]:
documents = df.description
documents[:10]

Init spacy : 

In [None]:
nlp = spacy.load("en_core_web_lg")

Preprocess (clean) the corpus : 

In [None]:
tokenized_docs = [
    [
        token.lemma_
        for token in nlp(doc.lower())
        if not token.is_stop and not token.is_punct
    ]
    for doc in documents
]
tokenized_docs[:10]

Key concept here is a tagged document => Token + id

In [None]:
tagged_docs = [
    TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(tokenized_docs)
]
tagged_docs[:10]

Train the Doc2Vec model

sm : 

In [None]:
# 5s
model_sm = Doc2Vec(
    tagged_docs,
    vector_size=50, # size of output vect
    window=2,  # nb words before and after a target word 
    min_count=1, # minimum frequency count of words. ,
    workers=4, # number of cpu
    epochs=100, # number of iterations (passes over the entire dataset)
)

model_sm

md : 

In [None]:
# 10s
model_md = Doc2Vec(
    tagged_docs, vector_size=100, window=4, min_count=1, workers=4, epochs=200,
)
model_md

lg : 

In [None]:
# 30s
model_lg = Doc2Vec(
    tagged_docs, vector_size=500, window=10, min_count=1, workers=4, epochs=500,)
model_lg

xl : 

In [None]:
# # 15m => 1h
# model_xl = Doc2Vec(
#     tagged_docs,
#     vector_size=1_000,
#     window=10,
#     min_count=1,
#     workers=4,
#     epochs=2_000,
# )
# model_xl


# 1m => 15m
model_xl = Doc2Vec(
    tagged_docs,
    vector_size=1_000,
    window=10,
    min_count=1,
    workers=4,
    epochs=1_000,
)
model_xl

Get the vectors : 

In [None]:
# 2=> 5 mins with xl
doc_vectors = [model_xl.infer_vector(doc) for doc in tokenized_docs]
print(doc_vectors)

Data Type : 

In [None]:
type(doc_vectors)

Length ? : 

In [None]:
len(doc_vectors)

In [None]:
len(doc_vectors[0])

In [None]:
len(df)

Rebuild a 'special' X : 

In [None]:
X = pd.DataFrame(doc_vectors)
X

Shape : 

In [None]:
X.shape

Grid : 

In [None]:
grid = GridSearchCV(
    RandomForestClassifier(), {}, cv=cv(), n_jobs=-1, return_train_score=True, verbose=1,
)

grid.fit(X, y)

display(grid.best_estimator_)

Resultize : 

In [None]:
def resultize(grid, head=20):
    """Summarise the cross-validation results of a fitted search object.

    Parameters
    ----------
    grid : object exposing a ``cv_results_`` mapping
        Typically a fitted ``GridSearchCV``.
    head : int, default=20
        Maximum number of rows kept in the summary.

    Returns
    -------
    pandas.DataFrame
        ``cv_results_`` without the per-split columns, rounded to 2 decimals
        and ordered from the best to the worst ``mean_test_score``.
    """
    results = pd.DataFrame(grid.cv_results_)

    # Per-fold columns ("split0_test_score", ...) are dropped: only the
    # aggregated statistics and the parameters are kept.
    kept_columns = [name for name in results.columns if "split" not in name]
    summary = results.loc[:, kept_columns].round(2)

    ranked = summary.sort_values("mean_test_score", ascending=False)
    return ranked.head(head)


resultize(grid)

### Using a pipeline

In [None]:
pipeline = Pipeline(
    [
        ("preprocessor", "passthrough"),
        ("scaler", "passthrough"),
        ("reductor", "passthrough"),
        ("estimator", LogisticRegression()),
    ]
)

pipeline

What is "passthrough" : 

In [None]:
pst = "passthrough"

pst

Param grid : 

In [None]:
param_grid = {
    "scaler": [
        "passthrough",
        StandardScaler(),
        QuantileTransformer(n_quantiles=100),
        # MinMaxScaler(),
        Normalizer(),
    ],
    "reductor": [PCA()],
    "reductor__n_components": [0.7, 0.85, 0.9, 0.95, 0.99],
    "estimator": [RandomForestClassifier(), LogisticRegression()],
}
param_grid

New grid : 

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv(),
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

grid.fit(X, y)

Results 

In [None]:
display(grid.best_estimator_)

In [None]:
resultize(grid)

### Using a custom transformer 

Our Transformer : 

In [None]:
class Doc2VecTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer turning raw text documents into Doc2Vec vectors.

    Parameters
    ----------
    model : gensim Doc2Vec or None, default=None
        Optional already-trained model. When given, ``fit`` does not train
        anything and this model is used as is by ``transform``.
    vector_size : int, default=500
        Forwarded to ``Doc2Vec`` when a model is trained in ``fit``.
    window : int, default=5
        Forwarded to ``Doc2Vec`` when a model is trained in ``fit``.
    min_count : int, default=5
        Forwarded to ``Doc2Vec`` when a model is trained in ``fit``.
    epochs : int, default=100
        Forwarded to ``Doc2Vec`` when a model is trained in ``fit``.

    Attributes
    ----------
    model_ : gensim Doc2Vec
        Model used by ``transform``; set by ``fit`` (either the model trained
        there or the pre-trained ``model``).
    """

    def __init__(self, model=None, vector_size=500, window=5, min_count=5, epochs=100):

        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.epochs = epochs
        self.model = model

    @staticmethod
    def _as_list(X):
        """Return the documents of X as a plain Python list."""

        if isinstance(X, list):
            return X

        # pandas objects expose their data through `.values`; anything else
        # (e.g. a numpy array) is used directly.
        values = getattr(X, "values", X)
        if hasattr(values, "tolist"):
            return values.tolist()

        return list(X)

    def fit(self, X, y=None):
        """Train a Doc2Vec model on X, unless a pre-trained model was given.

        Parameters
        ----------
        X : list or pandas Series of str
            Raw documents; each one is tokenized with ``preprocess_string``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """

        # A user-supplied model is never retrained.
        if self.model is not None:
            self.model_ = self.model
            return self

        docs = self._as_list(X)

        tagged_docs = [
            TaggedDocument(words=preprocess_string(doc), tags=[i])
            for i, doc in enumerate(docs)
        ]
        # `window` has to be forwarded explicitly, otherwise the value set on
        # the transformer (e.g. by a grid search) is ignored.
        model = Doc2Vec(
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            epochs=self.epochs,
        )
        model.build_vocab(tagged_docs)
        model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.epochs)

        # Stored under a trailing-underscore attribute so that the `model`
        # init parameter is left untouched and calling fit again retrains.
        self.model_ = model

        return self

    def transform(self, X, y=None):
        """Infer one vector per document of X.

        Parameters
        ----------
        X : list or pandas Series of str
            Raw documents; each one is tokenized with ``preprocess_string``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        list
            One inferred vector per document, in the order of X.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` was never called and no pre-trained model was given.
        """

        model = getattr(self, "model_", None)
        if model is None:
            # Supports using a pre-trained model without calling fit first.
            model = self.model
        if model is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "Doc2VecTransformer is not fitted: call fit() first or pass a "
                "pre-trained model."
            )

        docs = self._as_list(X)

        return [model.infer_vector(preprocess_string(doc)) for doc in docs]

Original df : 

In [None]:
df

Init d2v :

In [None]:
d2v = Doc2VecTransformer()
d2v

Fit : 

In [None]:
d2v.fit(df.description)

Transform : 

In [None]:
text = ["my new watch is a very funny flic flac digital chronometer"]
d2v.transform(text)

With pretrained model : 

In [None]:
d2v = Doc2VecTransformer(model=model_xl)
text = ["my new watch is a very funny flic flac digital chronometer"]
d2v.transform(text)

New param grid : 

In [None]:
param_grid = {"preprocessor": [Doc2VecTransformer()]}

New Grid : 

In [None]:
grid = GridSearchCV(
    pipeline, param_grid, cv=cv(), n_jobs=-1, return_train_score=True, verbose=1,
)


Fit : 

In [None]:
grid.fit(df.description, y)

grid.best_estimator_

Results : 

In [None]:
display(grid.best_estimator_)
resultize(grid)

Testing various transformers params : 

In [None]:
param_grid = {
    "preprocessor": [Doc2VecTransformer()],
    "preprocessor__vector_size": [100, 200, 500],
    "preprocessor__window": [5, 10, 15],
    # preprocessor__model = [model_sm, model_md, model_lg, model_xl]
}

Grid : 

In [None]:
grid = GridSearchCV(
    pipeline, param_grid, cv=cv(), n_jobs=-1, return_train_score=True, verbose=1,
)


Fit : 

In [None]:
grid.fit(df.description, y)

grid.best_estimator_

Results : 

In [None]:
res = resultize(grid, head=30)
res


Our problem is : 

In [None]:
# `px` is used by this cell and by the two plotting cells that follow, and no
# other active import provides it, so the import cannot stay commented out.
import plotly.express as px

px.scatter_3d(
    res,
    x="param_preprocessor__vector_size",
    y="mean_test_score",
    z="param_preprocessor__window",
)


With box plots : 

In [None]:
px.box(
    res,
    x="param_preprocessor__vector_size",
    y="mean_test_score",
    # color="param_preprocessor__window",
)

In [None]:
px.scatter(
    res.loc[res.param_preprocessor__window == 15],
    x="mean_score_time",
    y="mean_test_score",
    # color="param_preprocessor__window",
)

### Using OpenAI GPT Embedding

Init your client : 

In [None]:
client = OpenAI()
client

Doc : 

In [None]:
doc = df.description.iloc[0]

Just a try : 

In [None]:
response = client.embeddings.create(
    input=doc, model="text-embedding-3-small"
)

What is response : 

In [None]:
response 

The vector : 

In [None]:
vector = response.data[0].embedding
vector

Size?

In [None]:
len(vector)

List of Vectors ? 

In [None]:
# li = []

# for i in df.description.values :
#     # print(i)

#     response = client.embeddings.create(
#     input=i, model="text-embedding-3-small",)
#     vector = response.data[0].embedding
#     li.append(vector)

# li = pd.DataFrame(li)
# li.to_csv("df_from_gpt.csv", index=False)
# li

li = pd.read_csv("df_from_gpt.csv")

With a custom transformer : 

In [None]:
class OpenAIVecTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer embedding text documents with the OpenAI API.

    Parameters
    ----------
    model : str, default="text-embedding-3-small"
        Name of the embedding model passed to ``client.embeddings.create``.

    Notes
    -----
    The ``OpenAI`` client is created on first access of ``client`` rather than
    in ``__init__``, so that ``__init__`` only stores its parameters and
    building the transformer has no side effect.
    """

    def __init__(self, model="text-embedding-3-small"):

        self.model = model

    @property
    def client(self):
        """OpenAI client, created on first access and then reused."""

        client = self.__dict__.get("_client")
        if client is None:
            client = OpenAI()
            self._client = client

        return client

    @staticmethod
    def _as_list(X):
        """Return the documents of X as a plain Python list."""

        if isinstance(X, list):
            return X

        # pandas objects expose their data through `.values`; anything else
        # (e.g. a numpy array) is used directly.
        values = getattr(X, "values", X)
        if hasattr(values, "tolist"):
            return values.tolist()

        return list(X)

    def fit(self, X, y=None):
        """Do nothing: there is nothing to learn from X. Return self."""

        return self

    def transform(self, X, y=None):
        """Request one embedding per document of X.

        Parameters
        ----------
        X : list or pandas Series of str
            Documents to embed; one API request is sent per document.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        list
            One embedding (``response.data[0].embedding``) per document, in
            the order of X.
        """

        docs = self._as_list(X)
        client = self.client

        return [
            client.embeddings.create(input=doc, model=self.model).data[0].embedding
            for doc in docs
        ]

Let's build a very basic Pipeline / grid search : 

In [None]:
pipeline = Pipeline(
    [
        # ("preprocessor", "passthrough"),
        ("scaler", "passthrough"),
        # ("reductor", "passthrough"),
        ("estimator", RandomForestClassifier()),
    ]
)

pipeline

What is "passthrough" : 

In [None]:
pst = "passthrough"

pst

Param grid : 

In [None]:
param_grid = {
    "scaler": [
        "passthrough",
        StandardScaler(),
        QuantileTransformer(n_quantiles=100),
        # MinMaxScaler(),
        Normalizer(),
    ],
    # "reductor": [PCA()],
    # "reductor__n_components": [0.7, 0.85, 0.9, 0.95, 0.99],
    "estimator": [
        RandomForestClassifier(),
    ],  # LogisticRegression()
}
param_grid

New grid : 

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv(),
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

grid.fit(li, y)

Results 

In [None]:
display(grid.best_estimator_)

In [None]:
resultize(grid)