# Projet 5 : Catégorisez automatiquement des questions

L'objectif de ce projet est d'appliquer des tags à des questions issues du site stackoverflow. Pour cela on va utiliser l'outil StackExchange qui permet de lancer des requêtes SQL sur la base de données de stackoverflow.

## Notebook 3 : Modèle final
* Chargement des données
* Split des données
* Preprocess du texte
* Entrainement du modèle final

## Librairies  utilisées

In [1]:
import pandas as pd
import numpy as np
import pickle

# text preprocessing
import re
from string import punctuation
import nltk
from bs4 import BeautifulSoup
import spacy

# sklearn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score
from sklearn.pipeline import Pipeline

nlp = spacy.load("en_core_web_sm")
#nltk.download()
n_jobs = -1 # enable multiprocessing

## Chargement des données

In [2]:
df = pd.read_csv('df_process.csv', sep=';')

In [3]:
df['Tags_process'] = df['Tags_process'].apply(lambda x: [text[1:-1] for text in x.strip('[]').split(', ')])
df['Tags'] = df['Tags'].apply(lambda x: [text[1:-1] for text in x.strip('[]').split(', ')])

## Split des données

In [4]:
from skmultilearn.model_selection import IterativeStratification

def iterative_train_test_split(X, y, train_size):
    """Custom iterative train test split which
    'maintains balanced representation with respect
    to order-th label combinations.'
    """
    stratifier = IterativeStratification(
        n_splits=2, order=1, sample_distribution_per_fold=[1.0-train_size, train_size, ])
    train_indices, test_indices = next(stratifier.split(X, y))
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    return X_train, X_test, y_train, y_test

In [5]:
X = df['Body'].values
y = df['Tags_process'].values

In [6]:
mlb = MultiLabelBinarizer()
y_mlb = mlb.fit_transform(y)

In [7]:
X_train, X_test, y_train, y_test = iterative_train_test_split(X, y_mlb, train_size = 0.2)

## Preprocess du texte

In [8]:
sw = nltk.corpus.stopwords.words('english')
sw.extend(['error', 'code', 'program', 'question', 'result'])
stemmer = nltk.stem.snowball.SnowballStemmer("english")

In [9]:
def removeTag(text):
    tag_list = ['code','a','img','kbd','del','strike','s']
    soup = BeautifulSoup(text, "html.parser")

    for tag in tag_list:
        for tagless in soup.find_all(tag):
            tagless.decompose()
            
    # to get lowercase text
    return soup.get_text().lower()

In [10]:
def removePunctuation(text):
    cleaned = re.sub('\n',r' ',text)
    # It is prefereable to replace punctuation char by white space to avoid creating new words
    translate_table = dict((ord(char), ' ') for char in punctuation)   
    cleaned = cleaned.translate(translate_table)
    cleaned = re.sub(r'\s+', ' ',cleaned)
    
    return cleaned

In [11]:
def textPreprocessingString(text, allowed_postags=['NOUN']):
    doc = nlp(text)
    cleaned = " ".join([token.lemma_ for token in doc if ((token.pos_ in allowed_postags) and (token.text not in sw))])
    
    return cleaned

In [12]:
def textPreprocessing(text):
    text_notag = removeTag(text)
    text_nopunct = removePunctuation(text_notag)
    
    return textPreprocessingString(text_nopunct)

## Entrainement du modèle final

In [13]:
preprocessor_best = Pipeline(steps=[('transformer', TfidfVectorizer(lowercase=False, preprocessor=textPreprocessing, max_df=0.11, min_df=0))])

In [14]:
base_gb = GradientBoostingClassifier(random_state=0,
                                     max_depth=5,
                                     max_features='auto',
                                     min_samples_leaf=5,
                                     min_samples_split=2,
                                     n_estimators=150
                                    )
ovr_gb = OneVsRestClassifier(base_gb, n_jobs=n_jobs)

In [15]:
best_pipeline = Pipeline(steps=[('preprocessor', preprocessor_best),('model', ovr_gb)])

In [16]:
best_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('transformer',
                                  TfidfVectorizer(lowercase=False, max_df=0.11,
                                                  min_df=0,
                                                  preprocessor=<function textPreprocessing at 0x7f18742555e0>))])),
                ('model',
                 OneVsRestClassifier(estimator=GradientBoostingClassifier(max_depth=5,
                                                                          max_features='auto',
                                                                          min_samples_leaf=5,
                                                                          n_estimators=150,
                                                                          random_state=0),
                                     n_jobs=-1))])

### Score train

In [17]:
y_pred = best_pipeline.predict(X_train)
score = jaccard_score(y_train, y_pred, average='macro')
print(f'score : {score}')

score : 0.8893137401399684


### Score test

In [18]:
y_pred = best_pipeline.predict(X_test)
score = jaccard_score(y_test, y_pred, average='macro')
print(f'score : {score}')

score : 0.2618721122631574


## Sauvegarde

### Binarizer

In [19]:
# Dump the multilabel binarizer with Pickle
mlb_pkl_filename = 'mlb.pkl'
# Open the file to save as pkl file
mlb_pkl = open(mlb_pkl_filename, 'wb')
pickle.dump(mlb, mlb_pkl)
# Close the pickle instances
mlb_pkl.close()

### Pipeline

In [20]:
# Dump the pipeline with Pickle
best_pipeline_pkl_filename = 'best_pipeline.pkl'
# Open the file to save as pkl file
best_pipeline_pkl = open(best_pipeline_pkl_filename, 'wb')
pickle.dump(best_pipeline, best_pipeline_pkl)
# Close the pickle instances
best_pipeline_pkl.close()