# 012-03 - NLP Text Processing - Solution Notebook

* Written by Alexandre Gazagnes
* Last update: 2024-02-01

## About 

Context : 

We are going to implement our first NLP tool!

Data  : 

**You can find the dataset [here](https://gist.githubusercontent.com/AlexandreGazagnes/cabe445634a092d308d17a883a305a75/raw/9f785f0f02739ac6352e1d583323771d55270221/nlp.csv).**

## Preliminaries

### System

These commands will display the system information:

Uncomment these lines if needed.

In [None]:
# pwd

In [None]:
# cd ..

In [None]:
# ls

In [None]:
# cd ..

In [None]:
# ls

Install the required libraries :

In [None]:
# !pip install -r requirements.txt >> pip.log
# !pip freeze >> pip.freeze

### Import 

In [None]:
import os, sys, warnings
import pickle
from IPython.display import display

In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
from sklearn.base import *
from sklearn.preprocessing import *
from sklearn.impute import *
from sklearn.model_selection import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.model_selection import *
from sklearn.pipeline import *
from sklearn.feature_extraction import *
from sklearn.dummy import *
from sklearn.feature_extraction.text import *

# from lightgbm import *
# from xgboost import *

from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.neighbors import *

In [None]:
import nltk

# import wordcloud

from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import wordpunct_tokenize

In [None]:
import string

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

### Graphs and Settings

In [None]:
sns.set()

In [None]:
# warnings.filterwarnings('ignore')
warnings.filterwarnings(action="once")

If needed, we can enable TEST_MODE to run the notebook very quickly (fewer folds, a sample of the data) :

In [None]:
TEST_MODE = True

In [None]:
CV = 10  # number of folds for the  cross val
N_JOBS = 7  # number of cpu to use for computations
FRAC = 1.0  # we keep 100% of the dataframe
DISPLAY = True  # display complex viz
TEST_SIZE = 0.25  # Train vs Test %

if TEST_MODE:
    CV = 3
    N_JOBS = -1
    FRAC = 0.3
    DISPLAY = False
    TEST_SIZE = 0.5

### Third-Party Tools

We need some third-party resources :

In [None]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("words")

Some string assets : 

In [None]:
stop_words = list(set(stopwords.words("english")))
# stop_words[:10]

In [None]:
punctuation = list(set(string.punctuation))
punctuation[:10]

In [None]:
word_dict = words.words()
word_dict[:10]

We need to download spacy : 

In [None]:
# !python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_lg

And to load spacy model :

In [None]:
# nlp = spacy.load("en_core_web_sm")

nlp = spacy.load("en_core_web_md")
nlp

### Data

url of the dataset :

In [None]:
url = "https://gist.githubusercontent.com/AlexandreGazagnes/cabe445634a092d308d17a883a305a75/raw/d2014e8a34bba3c1be3ec8936bb338fb42888f24/nlp.csv"

Download the dataset : 

In [None]:
df = pd.read_csv(url)
df.head(5)

If needed let's take just a specific % of the dataframe : 

In [None]:
if TEST_MODE:
    df = df.sample(frac=FRAC)

In [None]:
df.cat_1.value_counts()

In [None]:
df.shape

Keep a copy of the df : 

In [None]:
DF = df.copy()

## First tour 

### Display

Sample 10 : 

In [None]:
df.sample(10)

### Structure

Info : 

In [None]:
df.info()

Value counts : 

In [None]:
df.dtypes.value_counts()

Specific data types : 

In [None]:
df.select_dtypes(exclude=np.number).nunique()

### NaN & Duplicated

Any missing values : 

In [None]:
tmp = df.isna().mean(axis=0)
tmp

In [None]:
tmp = df.isna().mean(axis=1)
tmp

Any duplicated : 

In [None]:
df.duplicated().sum()

### Data Inspection

Some numerical stats : 

In [None]:
df.describe()

Other stats :

In [None]:
df.describe(include=object)

Select numeric : 

In [None]:
df.select_dtypes(np.number).columns

Select non numeric : 

In [None]:
df.select_dtypes(object).columns

## Text Exploration

### Display All

In [None]:
[print(i + "\n\n") for i in df.description.head().values]

In [None]:
[print(i + "\n\n") for i in df.description.tail().values]

In [None]:
[print(i + "\n\n") for i in df.description.sample(10).values]

### Display by cat

In [None]:
lim = 200

In [None]:
i = 0


key = df.cat_1.unique()[i]
key

In [None]:
print("--------------------------------------------")
print(f"-------------- {key} --------------- ")
print("--------------------------------------------")
print("\n\n")


tmp = df.loc[df.cat_1 == key, :]
[print(i[:lim] + "\n\n") for i in tmp.description.head(5).values]
[print(i[:lim] + "\n\n") for i in tmp.description.sample(5).values]
[print(i[:lim] + "\n\n") for i in tmp.description.tail(5).values]

In [None]:
def print_categ(i):
    """Print a banner and a selection of descriptions for one category.

    The category is the ``i``-th entry of ``df.cat_1.unique()``. For that
    category the first 5, up to 5 random, and the last 5 descriptions are
    printed, each cut to ``lim`` characters. Reads the notebook globals
    ``df`` and ``lim``.

    Parameters
    ----------
    i : int
        Position of the category in ``df.cat_1.unique()``.
    """
    key = df.cat_1.unique()[i]

    print("--------------------------------------------")
    print(f"-------------- {key} --------------- ")
    print("--------------------------------------------")
    print("\n\n")

    descriptions = df.loc[df.cat_1 == key, "description"]

    # Series.sample raises ValueError when asked for more rows than exist,
    # which can happen for a rare category once the frame is down-sampled.
    n_random = min(5, len(descriptions))

    selections = (
        descriptions.head(5),
        descriptions.sample(n_random),
        descriptions.tail(5),
    )
    for selection in selections:
        for text in selection.values:
            print(text[:lim] + "\n\n")

Print Categ 1 : 

In [None]:
print_categ(1)

Print Categ 2 : 

In [None]:
print_categ(2)

## From Text To Vector

### Tokenize with NLTK


Create doc from 1st description : 

In [None]:
doc = df.description.iloc[0]
doc

Tokenize : 

In [None]:
tokens = nltk.word_tokenize(doc)
tokens

How Many Tokens?

In [None]:
len(tokens)

Our Stop words : 

In [None]:
stop_words

Our punctuation : 

In [None]:
punctuation

English dictionary (lowercase) :

In [None]:
word_dict
word_dict = [i.lower() for i in word_dict]
word_dict[10000:10010]

Lets build a function : 

In [None]:
def nltk_tokenizer(
    doc: str,
    len_min_word: int = 3,
    force_lower: bool = True,
    remove_stop_words=True,
    remove_punct=True,
    remove_all_digit=True,
    remove_any_digit=False,
    list_dict_word=None,
    list_extra_stop_word=None,
    remove_duplicate=False,
) -> str:
    """Tokenize ``doc`` with NLTK, filter the tokens, and join them back.

    Filters are applied in the order of the parameters below. Reads the
    notebook globals ``stop_words`` and ``punctuation``.

    Parameters
    ----------
    doc : str
        Raw text to clean.
    len_min_word : int
        Minimum token length to keep; ``0`` or less disables the filter.
    force_lower : bool
        Lower-case the text before tokenizing.
    remove_stop_words : bool
        Drop tokens found in ``stop_words``.
    remove_punct : bool
        Drop tokens found in ``punctuation``.
    remove_all_digit : bool
        Drop tokens made only of digits (``str.isdigit``).
    remove_any_digit : bool
        Drop tokens containing at least one digit.
    list_dict_word : collection of str, optional
        If given and non-empty, keep only tokens present in it.
    list_extra_stop_word : collection of str, optional
        If given and non-empty, drop tokens present in it.
    remove_duplicate : bool
        Keep only the first occurrence of each token.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """

    def as_lookup(values):
        # Sets give O(1) membership; a word list can hold many entries.
        if isinstance(values, (set, frozenset)):
            return values
        return set(values)

    if force_lower:
        doc = doc.lower()

    tokens = nltk.word_tokenize(doc.strip())

    if remove_stop_words:
        tokens = [t for t in tokens if t not in stop_words]

    if remove_punct:
        tokens = [t for t in tokens if t not in punctuation]

    if len_min_word > 0:
        tokens = [t for t in tokens if len(t) >= len_min_word]

    if remove_all_digit:
        tokens = [t for t in tokens if not t.isdigit()]

    if remove_any_digit:
        # Bug fix: the check must inspect the token itself (the previous
        # code passed an unrelated name to the digit test).
        tokens = [t for t in tokens if not any(c.isdigit() for c in t)]

    if list_dict_word:
        allowed = as_lookup(list_dict_word)
        tokens = [t for t in tokens if t in allowed]

    if list_extra_stop_word:
        banned = as_lookup(list_extra_stop_word)
        tokens = [t for t in tokens if t not in banned]

    if remove_duplicate:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is reproducible (a plain set gives an arbitrary order).
        tokens = list(dict.fromkeys(tokens))

    return " ".join(tokens)

In [None]:
res = nltk_tokenizer(doc)
print(res)
print(len(res))

### Tokenize With Spacy

Same with spacy : 

In [None]:
doc = "I'm so happy to live here, because this will be the most beautiful place on earth!!!"
tokens = nlp(doc)
tokens

Part of speech : 

In [None]:
for t in tokens:
    print(f"{t} => {t.pos_}")

Named Entity Recognition :

In [None]:
for t in tokens:
    print(f"{t} => {t.ent_type_}")

Try it with Paris : 

In [None]:
for t in nlp("i live in Paris"):
    print(f"{t} => {t.ent_type_}")

In [None]:
for t in nlp("i am in love with Paris Hilton"):
    print(f"{t} => {t.ent_type_}")

Type of tokens : 

In [None]:
type(tokens)

Stop words : 

In [None]:
doc = "I'm so happy to live here because this will be the most beautiful place on earth"
tokens = nlp(doc)
tokens = [t for t in tokens if not t.is_stop]
tokens


Punctuation : 

In [None]:
tokens[2].is_punct

In [None]:
doc = "I'm so happy to live here because this will be the most beautiful place on earth"
tokens = nlp(doc)
tokens = [t for t in tokens if not t.is_punct]
tokens

Is Digit : 

In [None]:
doc = "I'm so happy to live here because this will be the most beautiful place on earth"
tokens = nlp(doc)
tokens = [t for t in tokens if not t.text.isdigit()]
tokens

Part of speech  : 

In [None]:
doc = "I'm so happy to live here because this will be the most beautiful place on earth"
tokens = nlp(doc)

pos_list = ["NOUN", "VERB", "ADJ", "ADV"]

tokens = [t for t in tokens if t.pos_ in pos_list]
tokens

Lemmatization :

In [None]:
doc = "I'm so happy to live here because this will be the most beautiful place on earth"
tokens = nlp(doc)

tokens = [t.lemma_ for t in tokens]
tokens

Let's create a function : 

In [None]:
def spacy_tokenizer(
    doc,
    len_min_word=3,
    force_lower=True,
    remove_stop_words=True,
    remove_punct=True,
    remove_digit_token=True,
    remove_all_digit=True,
    pos_list=("NOUN", "VERB", "ADJ", "ADV"),
    lemmentize=True,
    list_dict_word=None,
    list_extra_stop_word=None,
):
    """Tokenize ``doc`` with the global spaCy pipeline ``nlp`` and filter it.

    Token-level filters (stop words, punctuation, length, digits, part of
    speech) run on spaCy tokens. The tokens are then turned into strings
    (lemma or raw text) before the word-list filters and the final join.

    Parameters
    ----------
    doc : str
        Raw text to clean.
    len_min_word : int
        Minimum token length to keep.
    force_lower : bool
        Lower-case the text before tokenizing.
    remove_stop_words : bool
        Drop tokens flagged ``is_stop``.
    remove_punct : bool
        Drop tokens flagged ``is_punct``.
    remove_digit_token : bool
        Drop tokens made only of digits.
    remove_all_digit : bool
        Drop tokens containing at least one digit.
    pos_list : collection of str, optional
        If non-empty, keep only tokens whose ``pos_`` is in it.
    lemmentize : bool
        Output each token's ``lemma_`` instead of its ``text``.
    list_dict_word : collection of str, optional
        If given and non-empty, keep only output words present in it.
    list_extra_stop_word : collection of str, optional
        If given and non-empty, drop output words present in it.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    if force_lower:
        doc = doc.lower()

    tokens = nlp(doc.strip())

    if remove_stop_words:
        tokens = [t for t in tokens if not t.is_stop]

    if remove_punct:
        tokens = [t for t in tokens if not t.is_punct]

    tokens = [t for t in tokens if len(t) >= len_min_word]

    if remove_digit_token:
        tokens = [t for t in tokens if not t.text.isdigit()]

    if remove_all_digit:
        tokens = [t for t in tokens if not any(c.isdigit() for c in t.text)]

    if pos_list:
        tokens = [t for t in tokens if t.pos_ in pos_list]

    # Bug fix: switch to plain strings here. The word-list filters used to
    # read ``.text`` on lemma strings, and with ``lemmentize=False`` the join
    # below received spaCy tokens instead of strings.
    kept = [t.lemma_ if lemmentize else t.text for t in tokens]

    if list_dict_word:
        kept = [w for w in kept if w in list_dict_word]

    if list_extra_stop_word:
        kept = [w for w in kept if w not in list_extra_stop_word]

    return " ".join(kept)

In [None]:
doc = df.description.iloc[0]
res = spacy_tokenizer(doc)
print(res)
print(len(res))

### Count Vectorizer and TFIDF Vectorizer

Let's create an artificial corpus : 

In [None]:
corpus = [
    "my cat is red",
    "my cat is blue",
    "my cat is yellow, i know that is wierd but he is yellow, yellow, yellow",
]
corpus

Building a pd.Series : 

In [None]:
corpus = pd.Series(corpus, name="text")
corpus

Init a Count Vectorizer : 

In [None]:
cv = CountVectorizer()

Fit : 

In [None]:
# X = cv.fit_transform(corpus).toarray()

X = cv.fit(corpus)
X = cv.transform(corpus).toarray()
X.shape

Useful dataframe :

In [None]:
X = pd.DataFrame(X, columns=cv.get_feature_names_out())
X

Same with TFIDF : 

In [None]:
tf = TfidfVectorizer()

In [None]:
X = tf.fit_transform(corpus).toarray()
X.shape

In [None]:
X = pd.DataFrame(X, columns=tf.get_feature_names_out())
X

## Modeling

### By Hand

In [None]:
df.head()

In [None]:
tf = TfidfVectorizer()

X = tf.fit_transform(df.description).toarray()
X.shape

In [None]:
X = pd.DataFrame(X, columns=tf.get_feature_names_out())
X

In [None]:
y = df.cat_1

We can use a much more advanced cross validation tool : 

In [None]:
def cv():
    """Return a new ``StratifiedShuffleSplit`` built from ``CV`` and ``TEST_SIZE``."""
    splitter = StratifiedShuffleSplit(test_size=TEST_SIZE, n_splits=CV)
    return splitter


cv()

Our grid Search : 

In [None]:
grid = GridSearchCV(
    LogisticRegression(),
    {},
    cv=CV,
    n_jobs=N_JOBS,
    verbose=1,
    return_train_score=True,
)
grid.fit(X, y)

In [None]:
def resultize(grid):
    """Summarise ``grid.cv_results_`` as a compact, sorted DataFrame.

    Columns whose name contains ``"split"`` and the two score-time columns
    are removed, numbers are rounded to 2 decimals, and rows are ordered by
    decreasing ``mean_test_score``.
    """
    table = pd.DataFrame(grid.cv_results_)

    # Drop the per-fold columns; only the aggregated ones are of interest.
    kept = [name for name in table.columns if "split" not in name]
    table = table[kept].drop(columns=["mean_score_time", "std_score_time"])

    rounded = table.round(2)
    return rounded.sort_values("mean_test_score", ascending=False)

In [None]:
resultize(grid)

### Using Basic Pipeline

In [None]:
pipeline = Pipeline(
    [
        ("preprocessor", TfidfVectorizer()),
        # ("scaler", StandardScaler()),
        # ("reductor", TruncatedSVD(n_components=100)),
        ("estimator", RandomForestClassifier()),
    ]
)

pipeline

In [None]:
param_grid = {
    "estimator": [
        RandomForestClassifier(),
        # # KNeighborsClassifier(),
        # LGBMClassifier(),
        # XGBClassifier(),
        # XGBRFClassifier(),
        LogisticRegression(),
    ]
}

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=CV,
    n_jobs=N_JOBS,
    verbose=1,
    return_train_score=True,
)

In [None]:
grid.fit(df.description, y)

In [None]:
resultize(grid)

### Benchmark Pipelines


In [None]:
pst = "passthrough"

pipeline = Pipeline(
    [
        ("preprocessor", TfidfVectorizer()),
        ("imputer", pst),
        ("scaler", pst),
        ("reductor", pst),
        ("estimator", DummyClassifier()),
    ]
)
pipeline

In [None]:
# Candidate steps for the benchmark pipeline; the grid search tries every
# combination of the lists below.
param_grid = {
    "preprocessor": [CountVectorizer(), TfidfVectorizer()],
    "scaler": [
        # The vectorizers output sparse matrices, which StandardScaler cannot
        # centre (it raises and the candidate scores NaN). with_mean=False
        # keeps the variance scaling and works on sparse input.
        StandardScaler(with_mean=False),
        QuantileTransformer(n_quantiles=100),
        # MinMaxScaler(),
        Normalizer(),
        # RobustScaler(),
    ],
    "imputer": [
        pst,
    ],
    "reductor": [
        pst,
    ],
    "estimator": [LogisticRegression(), RandomForestClassifier()],
}

param_grid

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=CV,
    n_jobs=N_JOBS,
    verbose=1,
    return_train_score=True,
)
display(grid)

In [None]:
grid.fit(df.description.values, y)

In [None]:
resultize(grid)

### Add Reductor

In [None]:
# Same benchmark as before, now with a TruncatedSVD reduction step after the
# scaler.
param_grid = {
    "preprocessor": [TfidfVectorizer(), CountVectorizer()],  #
    "scaler": [
        pst,
        # The scaler runs before the reductor, so it still receives the
        # sparse vectorizer output: StandardScaler cannot centre sparse data
        # and would make these candidates fail. with_mean=False avoids that.
        StandardScaler(with_mean=False),
        QuantileTransformer(n_quantiles=100),
        Normalizer(),
    ],  # MinMaxScaler() RobustScaler()
    "imputer": [pst],
    "reductor": [TruncatedSVD(n_components=100)],
    "estimator": [
        # KNeighborsClassifier(),
        # XGBRFClassifier(),
        # LGBMClassifier(),
        # XGBClassifier(),
        LogisticRegression(),
        RandomForestClassifier(),
    ],
}

param_grid

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=CV,
    n_jobs=N_JOBS,
    verbose=1,
    return_train_score=True,
)
display(grid)

In [None]:
grid.fit(df.description, y)

In [None]:
display(grid.best_estimator_)

In [None]:
resultize(grid)

### n_components

In [None]:
# Same benchmark, additionally searching over the number of SVD components
# (10 evenly spaced integer values between 10 and 1000).
param_grid = {
    "preprocessor": [TfidfVectorizer(), CountVectorizer()],  #
    "scaler": [
        pst,
        # The scaler runs before the reductor, so it still receives the
        # sparse vectorizer output: StandardScaler cannot centre sparse data
        # and would make these candidates fail. with_mean=False avoids that.
        StandardScaler(with_mean=False),
        QuantileTransformer(n_quantiles=100),
        Normalizer(),
    ],  # MinMaxScaler() RobustScaler()
    "imputer": [pst],
    "reductor": [TruncatedSVD()],
    "reductor__n_components": np.linspace(10, 1_000, 10).astype(int),
    "estimator": [
        # KNeighborsClassifier(),
        # # XGBRFClassifier(),
        # LGBMClassifier(),
        # XGBClassifier(),
        LogisticRegression(),
        RandomForestClassifier(),
    ],
}

param_grid

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=CV,
    n_jobs=N_JOBS,
    verbose=1,
    return_train_score=True,
)
display(grid)

In [None]:
grid.fit(df.description, y)

In [None]:
display(grid.best_estimator_)

In [None]:
resultize(grid)

### Using Advanced Pipelines

In [None]:
class NltkTokenizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans each document with ``nltk_tokenizer``.

    The constructor arguments are stored unchanged so they can be tuned in a
    grid search (e.g. ``tokenizer__len_min_word``) and are forwarded to
    ``nltk_tokenizer`` for every document.

    Note: ``remove_digit_token`` is accepted and stored but is not forwarded,
    because ``nltk_tokenizer`` has no parameter of that name.
    """

    def __init__(
        self,
        len_min_word=3,
        force_lower=True,
        remove_stop_words=True,
        remove_punct=True,
        remove_digit_token=True,
        remove_all_digit=True,
        list_dict_word=None,
        list_extra_stop_word=None,
    ):
        self.len_min_word = len_min_word
        self.force_lower = force_lower
        self.remove_stop_words = remove_stop_words
        self.remove_punct = remove_punct
        self.remove_digit_token = remove_digit_token
        self.remove_all_digit = remove_all_digit
        self.list_dict_word = list_dict_word
        self.list_extra_stop_word = list_extra_stop_word

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return ``self``."""
        return self

    def _clean(self, doc):
        """Clean a single document with the stored settings."""
        return nltk_tokenizer(
            doc,
            len_min_word=self.len_min_word,
            force_lower=self.force_lower,
            remove_stop_words=self.remove_stop_words,
            remove_punct=self.remove_punct,
            # remove_digit_token=self.remove_digit_token,
            remove_all_digit=self.remove_all_digit,
            list_dict_word=self.list_dict_word,
            list_extra_stop_word=self.list_extra_stop_word,
        )

    def transform(self, X, y=None):
        """Return the cleaned version of every document in ``X``.

        ``X`` is usually a pandas Series of strings. Inputs without an
        ``apply`` method (lists, NumPy arrays) are wrapped in a Series first,
        so the transformer also works when a pipeline is fitted on
        ``df.description.values``.
        """
        if not hasattr(X, "apply"):
            X = pd.Series(X)

        return X.apply(self._clean)

In [None]:
X = NltkTokenizer().fit_transform(df.description)
len(X)

In [None]:
pipeline = Pipeline(
    [
        ("tokenizer", NltkTokenizer()),
        ("preprocessor", TfidfVectorizer()),
        ("estimator", LogisticRegression()),
    ]
)

In [None]:
param_grid = {
    "tokenizer__force_lower": [True, False],
    "tokenizer__len_min_word": [1, 2, 3, 4, 5],
}

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=CV,
    n_jobs=N_JOBS,
    verbose=1,
    return_train_score=True,
)
display(grid)

In [None]:
grid.fit(df.description, y)

In [None]:
display(grid.best_estimator_)

In [None]:
resultize(grid)

In [None]:
class SpacyTokenizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans each document with ``spacy_tokenizer``.

    The constructor arguments are stored unchanged so they can be tuned in a
    grid search (e.g. ``tokenizer__len_min_word``) and are forwarded to
    ``spacy_tokenizer`` for every document.
    """

    def __init__(
        self,
        len_min_word=3,
        force_lower=True,
        remove_stop_words=True,
        remove_punct=True,
        remove_digit_token=True,
        remove_all_digit=True,
        list_dict_word=None,
        list_extra_stop_word=None,
    ):
        self.len_min_word = len_min_word
        self.force_lower = force_lower
        self.remove_stop_words = remove_stop_words
        self.remove_punct = remove_punct
        self.remove_digit_token = remove_digit_token
        self.remove_all_digit = remove_all_digit
        self.list_dict_word = list_dict_word
        self.list_extra_stop_word = list_extra_stop_word

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return ``self``."""
        return self

    def _clean(self, doc):
        """Clean a single document with the stored settings."""
        return spacy_tokenizer(
            doc,
            len_min_word=self.len_min_word,
            force_lower=self.force_lower,
            remove_stop_words=self.remove_stop_words,
            remove_punct=self.remove_punct,
            remove_digit_token=self.remove_digit_token,
            remove_all_digit=self.remove_all_digit,
            list_dict_word=self.list_dict_word,
            list_extra_stop_word=self.list_extra_stop_word,
        )

    def transform(self, X, y=None):
        """Return the cleaned version of every document in ``X``.

        ``X`` is usually a pandas Series of strings. Inputs without an
        ``apply`` method (lists, NumPy arrays) are wrapped in a Series first,
        so the transformer also works when a pipeline is fitted on
        ``df.description.values``.
        """
        if not hasattr(X, "apply"):
            X = pd.Series(X)

        return X.apply(self._clean)

In [None]:
X = SpacyTokenizer().fit_transform(df.description)
len(X)

In [None]:
pipeline = Pipeline(
    [
        ("tokenizer", SpacyTokenizer()),
        ("preprocessor", TfidfVectorizer()),
        ("estimator", LogisticRegression()),
    ]
)

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=CV,
    n_jobs=N_JOBS,
    verbose=1,
    return_train_score=True,
)
display(grid)

In [None]:
# grid.fit(df.description, y)

In [None]:
# display(grid.best_estimator_)

In [None]:
# resultize(grid)