# 1 Preliminary

In [None]:
cd ..

In [None]:
pwd

In [None]:
ls

## 1.1 Install

Install third-party tools:

In [None]:
# !sudo apt install unzip tree htop

Install various libraries:

In [None]:
# !pip install -r requirements.txt >> pip.log
# !pip freeze >> pip.freeze

In [None]:
!python -m spacy download en_core_web_sm >> spacy.log
!python -m spacy download en_core_web_md >> spacy.log
# !python -m spacy download en_core_web_lg >> spacy.log

## 1.2 Import 

In [None]:
import os, sys, warnings
import pickle
from IPython.display import display

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import missingno as msno

from sklearn.base import *
from sklearn.preprocessing import *
from sklearn.impute import *
from sklearn.model_selection import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.model_selection import *
from sklearn.pipeline import *
from sklearn.feature_extraction import *
from sklearn.dummy import *
from sklearn.feature_extraction.text import *

from lightgbm import *
from xgboost import *
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.neighbors import *


import nltk
import wordcloud

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import transformers

import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import wordpunct_tokenize
import string

In [None]:
# spaCy pipeline used later to tokenize and lemmatize the descriptions.
# en_core_web_md is one of the models downloaded in the install cell; the
# en_core_web_lg download is commented out there, so it is not loaded here.
nlp = spacy.load("en_core_web_md")

## 1.3 Graphs and Settings

In [None]:
sns.set()

In [None]:
# warnings.filterwarnings('ignore')
warnings.filterwarnings(action="once")

In [None]:
DISPLAY = True

## 1.4 Data

In [None]:
source = "./data/cleaned/"

In [None]:
df = pd.read_csv(source + "df2.csv")
DF = df.copy()
df.head(2)

# 2 First tour 

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.dtypes.value_counts()

In [None]:
tmp = df.isna().mean(axis=0)
tmp

In [None]:
df.duplicated().sum()

In [None]:
df.describe()

In [None]:
df.describe(include=object)

# 3 Prepare Data

In [None]:
y = df.cat_1
y

In [None]:
X = df.description

In [None]:
def cv(n_splits=5, test_size=0.25, random_state=42):
    """Build the cross-validation splitter used by the grid searches.

    Parameters
    ----------
    n_splits : int, default=5
        Number of re-shuffling and splitting iterations.
    test_size : float, default=0.25
        Proportion of the samples held out in each test split.
    random_state : int or None, default=42
        Seed for the shuffling. A fixed seed gives every grid search the
        same splits, so their scores are comparable and reproducible across
        kernel restarts. Pass ``None`` for unseeded splits.

    Returns
    -------
    StratifiedShuffleSplit
        A new, unfitted splitter instance.
    """
    return StratifiedShuffleSplit(
        n_splits=n_splits, test_size=test_size, random_state=random_state
    )


cv()

# 4 Using Gensim

## 4.1 By Hand

In [None]:
documents = df.description
documents[:10]

In [None]:
# Tokenize and preprocess the documents using spaCy: lower-case each text,
# then keep the lemma of every token that is neither a stop word nor
# punctuation.
# nlp.pipe processes the texts as a stream instead of calling nlp() once per
# document; the resulting tokens are the same.
tokenized_docs = [
    [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    for doc in nlp.pipe(text.lower() for text in documents)
]
tokenized_docs[:10]

In [None]:
# Create tagged documents
tagged_docs = [
    TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(tokenized_docs)
]
tagged_docs[:10]

In [None]:
# Train the Doc2Vec model
model = Doc2Vec(
    tagged_docs, vector_size=50, window=2, min_count=1, workers=4, epochs=100
)
model

In [None]:
# Get the document vectors: infer one vector per tokenized document.
doc_vectors = [model.infer_vector(doc) for doc in tokenized_docs]
# Preview only the first few vectors; printing the whole list floods the output.
doc_vectors[:3]

In [None]:
type(doc_vectors)

In [None]:
len(doc_vectors)

In [None]:
len(df)

In [None]:
X = pd.DataFrame(doc_vectors)
X

In [None]:
X.shape

In [None]:
grid = GridSearchCV(
    LogisticRegression(), {}, cv=cv(), n_jobs=-1, return_train_score=True, verbose=1
)

grid.fit(X, y)

In [None]:
display(grid.best_estimator_)

res = pd.DataFrame(grid.cv_results_)
cols = [i for i in res.columns if "split" not in i]
res = res.loc[:, cols]
res.round(2).sort_values("mean_test_score", ascending=False).head(10)

In [None]:
pst = "passthrough"

# Skeleton pipeline: every step before the estimator is a no-op placeholder
# ("passthrough") that a parameter grid can replace with a real transformer.
pipeline = Pipeline(
    [
        ("preprocessor", pst),
        ("scaler", pst),
        ("reductor", pst),
        # Must be an estimator instance. The previous `LogisticRegression()()`
        # called the instance itself and raised a TypeError.
        ("estimator", LogisticRegression()),
    ]
)

pipeline

In [None]:
param_grid = {
    "scaler": [
        "passthrough",
        StandardScaler(),
        QuantileTransformer(n_quantiles=100),
        MinMaxScaler(),
        Normalizer(),
    ],
    "reductor": [PCA()],
    "reductor__n_components": [0.7, 0.85, 0.9, 0.95, 0.99],
    "estimator": [RandomForestClassifier(), LGBMClassifier(), LogisticRegression()],
}
param_grid

In [None]:
grid = GridSearchCV(
    pipeline, param_grid, cv=cv(), n_jobs=-1, return_train_score=True, verbose=1
)

grid.fit(X, y)

In [None]:
display(grid.best_estimator_)

res = pd.DataFrame(grid.cv_results_)
cols = [i for i in res.columns if "split" not in i]
res = res.loc[:, cols]
res.round(2).sort_values("mean_test_score", ascending=False).head(10)

## 4.2 Using Pipeline

In [None]:
pst = "passthrough"

pipeline = Pipeline(
    [
        ("preprocessor", pst),
        ("scaler", pst),
        ("reductor", pst),
        ("estimator", RandomForestClassifier()),
    ]
)

pipeline

In [None]:
class Doc2VecTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer wrapping a gensim ``Doc2Vec`` model.

    ``fit`` trains a ``Doc2Vec`` model on the given documents and
    ``transform`` infers one vector per document with that model.

    Each document may be either a raw string (split on whitespace) or an
    already tokenized sequence of strings, such as the spaCy lemma lists
    built earlier in this notebook.

    Parameters
    ----------
    vector_size : int, default=100
        Passed to ``Doc2Vec`` as ``vector_size``.
    window : int, default=5
        Passed to ``Doc2Vec`` as ``window``.
    min_count : int, default=5
        Passed to ``Doc2Vec`` as ``min_count``.
    epochs : int, default=10
        Passed to ``Doc2Vec`` as ``epochs``.

    Attributes
    ----------
    model : Doc2Vec or None
        The trained model; ``None`` until ``fit`` has been called.
    """

    def __init__(self, vector_size=100, window=5, min_count=5, epochs=10):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.epochs = epochs
        self.model = None

    @staticmethod
    def _as_tokens(doc):
        """Return ``doc`` as a list of tokens.

        Strings are split on whitespace (the original behaviour); any other
        iterable is assumed to already be a sequence of tokens and is copied
        into a list.
        """
        if isinstance(doc, str):
            return doc.split()
        return list(doc)

    def fit(self, X, y=None):
        """Train the Doc2Vec model on ``X`` and return ``self``.

        ``X`` is an iterable of documents (strings or token sequences).
        Each document is tagged with its position in ``X``. ``y`` is ignored.
        """
        tagged_data = [
            TaggedDocument(words=self._as_tokens(doc), tags=[i])
            for i, doc in enumerate(X)
        ]
        self.model = Doc2Vec(
            tagged_data,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            epochs=self.epochs,
        )
        return self

    def transform(self, X, y=None):
        """Infer a vector for every document in ``X`` with the fitted model.

        Returns a list with one inferred vector per document, in input order.
        """
        return [self.model.infer_vector(self._as_tokens(doc)) for doc in X]

In [None]:
param_grid = {"preprocessor": [Doc2VecTransformer()]}

In [None]:
grid = GridSearchCV(
    pipeline, param_grid, cv=cv(), n_jobs=-1, return_train_score=True, verbose=1
)

grid.fit(tokenized_docs, y)

grid.best_estimator_

In [None]:
display(grid.best_estimator_)

res = pd.DataFrame(grid.cv_results_)
cols = [i for i in res.columns if "split" not in i]
res = res.loc[:, cols]
res.round(2).sort_values("mean_test_score", ascending=False).head(10)

In [None]:
# doc2vec_transformer = Doc2VecTransformer(vector_size=100, window=5, min_count=5, epochs=10)
# tfidf_vectorizer = TfidfVectorizer()
# logistic_regression = LogisticRegression()

In [None]:
param_grid = {
    "preprocessor": [Doc2VecTransformer()],
    "preprocessor__vector_size": [100, 200, 300],
    "preprocessor__window": [5, 10, 15],
}

In [None]:
grid = GridSearchCV(
    pipeline, param_grid, cv=cv(), n_jobs=-1, return_train_score=True, verbose=1
)

grid.fit(tokenized_docs, y)

grid.best_estimator_

In [None]:
display(grid.best_estimator_)

res = pd.DataFrame(grid.cv_results_)
cols = [i for i in res.columns if "split" not in i]
res = res.loc[:, cols]
res.round(2).sort_values("mean_test_score", ascending=False).head(10)