In [23]:
### -------- Chargement des libraries ------- 
import os
#import tensorflow as tf

# Ce dont nous avons besoin depuis tensorflow.keras

# from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
#from tensorflow.keras import Model
#from tensorflow.keras.optimizers import Adam
#from tensorflow.keras.callbacks import EarlyStopping
#from tensorflow.keras.initializers import TruncatedNormal
#from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy
#from tensorflow.keras.metrics import CategoricalAccuracy
#from tensorflow.keras.utils import to_categorical

#from transformers import TFDistilBertModel, DistilBertConfig

# Et pandas pour l'importation de donnees + sklearn pour le decoupage des donnees.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm 

In [2]:
import re
import unidecode
def cleanText(string: str, punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''', stop_words=['the', 'a', 'and', 'is', 'be', 'will','on'])->str:
    """ A method to clean text """
    string=unidecode.unidecode(string)
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)
    
    # Nettoyage email
    #string = re.sub(r'[at]', '@', string)
    #string = re.sub(r'[dot]', '.', string)
        
    # Cleaning the urls
    string = re.sub(r'\[.*\]', '', string)
    
    string = re.sub(r'\]', '', string)
    string = re.sub(r'\[', '', string)
    
    string = re.sub(r'\([A-Z]+\)', '', string)
    string = re.sub(r'\([0-9]+\)', '', string)
    string = re.sub(r'[0-9]+', '#number', string)
    string = re.sub(r'\.+', '.', string)
    
    string = re.sub(r'Dr\.', 'Doctor', string)
    

    # Removing the punctuations
    #for x in string.lower(): 
    #    if x in punctuations: 
    #        string = string.replace(x, "") 

    # Converting the text to lower
    #string = string.lower()

    # Removing stop words
    #string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    return string  

In [16]:
### --------- Importation et pretraitement des donées --------- 
## Importation des données

dirname = '../data'
df_X = pd.read_json(os.path.join(dirname,'train.json'))
df_label=pd.read_csv(os.path.join(dirname,'train_label.csv'))

data=pd.merge(df_X, df_label).drop(['Id'], axis = 1)

data['gender'] = pd.Categorical(data['gender'].replace({'M': 0, 'F': 1}))
data['Category'] = pd.Categorical(data['Category'])

data['description']=data.description.apply(lambda x: cleanText(x))
# Découpage Train - Validation
train_data, valid_data = train_test_split(data,shuffle=True,test_size=0.3)

In [8]:
names=pd.read_csv(os.path.join(dirname,'categories_string.csv'))['0'].to_dict()

In [9]:
def compute_classe2augment(df):
    """renvoie la liste des classes à augmenter"""
    # group by Category
    gouped=df.groupby(["Category"]).count().sort_values(by="description")#
    gouped["name"]=gouped.index.map(names)
    return(gouped[gouped.description<=gouped.description.quantile(0.12)])

In [43]:
df_count=compute_classe2augment(data)
print(df_count)

          description  gender               name
Category                                        
21                783     783             rapper
4                 807     807   personal_trainer
10                831     831                 dj
7                 858     858  interior_designer


In [44]:
def split_desc(txt):
    """Découpe le texte en phrase"""
    txt=str(txt)
    #print(re.compile("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s").split(txt))
    nptxt=np.array([[txt.strip()] for txt in re.compile("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s").split(txt)])
    n=len(nptxt)
    nptxt=nptxt.reshape(1, n)
    return(nptxt[0])

In [45]:
X_train.shape

(152968,)

In [46]:
def augment_swap(txt):
    # Découpage en phrase
    array_txt=split_desc(txt)
    # Assemblage random
    np.random.shuffle(array_txt)
    new_txt=''.join(array_txt)
    return(new_txt)

In [14]:
#### Swap Sentences
def swap_sentences(dataset,k_list,frac_new=0.5):
    new_text=[]
    for k in k_list:        
        ## Selection des exemples à augmenter
        df_n=dataset[dataset.Category==k].reset_index(drop=True).sample(frac=frac_new,replace=False)
        ## data augmentation loop
        for i in tqdm(range(0,len(df_n))):
            single_desc = df_n.iloc[i]['description']
            new_phrase = augment_swap(single_desc)
            new_text.append({'description':new_phrase,'Category':k})
    
    return new_text

In [37]:
new_desc_swap=swap_sentences(train_data,k_list=df_count.index,frac_new=0.4)
new_train_data=pd.concat([train_data,pd.DataFrame(new_desc_swap)])

100%|██████████| 230/230 [00:00<00:00, 3057.13it/s]
100%|██████████| 230/230 [00:00<00:00, 3051.83it/s]
100%|██████████| 230/230 [00:00<00:00, 3012.23it/s]
100%|██████████| 241/241 [00:00<00:00, 3069.07it/s]


In [47]:
def augment_crossover(desc_list,k=None):
    """Mélange des decription de la même classe"""
    new_desc=''
    for desc in desc_list:
        desc= split_desc(desc)
        desc=np.random.choice(desc,size=int(0.1*len(desc))+1, replace=False)
        ## Selection aléatoire de la moitié de chaque  phrase 
        new_desc+=''.join(desc)    
    return(new_desc)

In [48]:
def cross_over(dataset,k_list,num_new=10,frac_crossover=0.5):
    new_text=[]
    
    for k in k_list:        
        ## Selection des exemples à augmenter
        df=dataset[dataset.Category==k].reset_index(drop=True)
        
        for num_new in tqdm(range(0,num_new)):
            df_n=df.sample(frac=frac_crossover,replace=False)
            new_phrase = augment_crossover(df_n.description)
            new_text.append({'description':new_phrase,'Category':k})

    return new_text

In [61]:
## Augment_CrossOver
new_desc_cross_over=cross_over(train_data,df_count.index,num_new=200,frac_crossover=0.9)
new_train_data=pd.concat([train_data,pd.DataFrame(new_desc_cross_over)])

100%|██████████| 200/200 [00:04<00:00, 49.57it/s]
100%|██████████| 199/199 [00:03<00:00, 52.28it/s]
100%|██████████| 198/198 [00:03<00:00, 50.80it/s]
100%|██████████| 197/197 [00:03<00:00, 50.24it/s]


In [62]:
X_train=new_train_data.description
y_train=new_train_data.Category
X_valid=valid_data.description
y_valid=valid_data.Category

In [63]:
X_train.shape

(152831,)

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier

# When use pipeline class
tfidf_vect=TfidfVectorizer(min_df=10,ngram_range=(1, 2))

estimators = [
    ('rf', RandomForestClassifier(n_estimators=100,n_jobs=-1)),
    ('svr', make_pipeline(StandardScaler(),LinearSVC())),
    ('SGD', SGDClassifier(loss="modified_huber",early_stopping=True,n_jobs=-1))
]


clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
pipeline = Pipeline([('vector',tfidf_vect),("clf",clf)])

In [74]:
pipeline.fit(X_train,y_train)

KeyboardInterrupt: 

In [75]:
from sklearn.metrics import f1_score
pred=pipeline.predict(X_valid)
f1_score(pred,y_valid,average="macro")

AttributeError: 'StackingClassifier' object has no attribute 'estimators_'