# <strong> A - Importation des modules </strong>
---

In [1]:
# Notebook display setup and global warning suppression.
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas / scikit-learn — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Third-party libraries: tabular data, NLP model loading, scikit-learn pipelines.
import pandas as pd
import spacy
from sklearn.pipeline import Pipeline

# Project helpers live in a sibling notebook, imported through import_ipynb.
# NOTE(review): the star import hides where names come from; presumably it
# provides get_train_test_index and the text transformers used below — confirm.
import import_ipynb
from Brice_KENGNI_ZANGUIM_2_1_script_functions_17_08_2022 import *


importing Jupyter notebook from Brice_KENGNI_ZANGUIM_2_1_script_functions_17_08_2022.ipynb


# <strong> B - Importation des données </strong>

In [2]:
# Load the raw tweet dataset, decoding the file as ISO-8859-1.
data = pd.read_csv(
    "emotion140.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Display the loaded frame (pandas truncates the rendering of large frames).
data

Unnamed: 0,labels,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
# Recode the positive class from 4 to 1 so the labels are binary (0 / 1).
# A single .loc assignment writes into `data` itself. The chained form
# `data.labels[mask] = 1` assigns through an intermediate Series, which
# triggers SettingWithCopyWarning and is a no-op under pandas Copy-on-Write.
data.loc[data.labels == 4, "labels"] = 1

In [5]:
# Number of tweets per class.
data["labels"].value_counts()

1    800000
0    799999
Name: labels, dtype: int64

In [6]:
# Fraction of missing values in each column.
data.isna().mean()

labels    0.0
text      0.0
dtype: float64

In [7]:
# Fraction of rows that exactly repeat an earlier row.
data.duplicated().mean()

0.010193131370707107

- ### <strong> Suppression des doublons </strong>

In [8]:
# Drop exact duplicate rows, keeping the first occurrence. The surviving rows
# keep their original index labels (no reset).
data = data.drop_duplicates()
# Sanity check: the duplicate ratio is expected to be 0 now.
data.duplicated().mean()

0.0

In [9]:
# Number of tweets per class.
data["labels"].value_counts()

1    793506
0    790184
Name: labels, dtype: int64

# <strong>  C - TRAIN, VALIDATION et TEST datas </strong> 

* ### <strong> Index d'extraction des datas  </strong>

In [10]:
# Index collections used to extract the train / validation / test subsets.
# NOTE(review): get_train_test_index is defined in the helper notebook; the
# tuple presumably holds the (train, validation, test) sizes — confirm, and
# check whether the draw is seeded for reproducibility.
tr_val_test_index = get_train_test_index(
    data,
    "labels",
    (50_000, 5_000, 5_000),
)

In [11]:
# Materialise the three subsets from the index collections
# (position 0 -> train, 1 -> validation, 2 -> test).
train_data, validation_data, test_data = (
    data.loc[tr_val_test_index[position], :] for position in range(3)
)

# <strong> D - Traitement et nettoyage des tweets </strong> 

In [12]:
# Load the spaCy "en_core_web_sm" model; `nlp` is used below to count the
# tokens of each cleaned tweet.
nlp = spacy.load("en_core_web_sm")

* ### <strong> Pipeline de pré-processing  </strong>

In [13]:
def _build_preprocessing_pipeline(normalizer_name, normalizer_cls):
    """Assemble the tweet-cleaning pipeline around one word-normalisation step.

    The two pipelines differ only in their fourth step (lemmatisation or
    stemming); every other step is shared. Each call creates fresh transformer
    instances, in pipeline order.

    NOTE(review): URL removal runs last, after punctuation removal. If
    Ponctuation strips ':' '/' '.', RemoveURL may no longer recognise URLs —
    confirm against the helper notebook.
    """
    return Pipeline([
        ("RemoveUserName", RemoveUserName()),
        ("LowerCase", ToLowerCase()),
        ("Ponctuation", Ponctuation()),
        (normalizer_name, normalizer_cls()),
        ("StopWords", StopWord()),
        ("RemoveSpace", RemoveSpace()),
        ("RemoveURL", RemoveURL()),
    ])


# Lemmatisation-based variant.
pipe_processing_lemma = _build_preprocessing_pipeline("Lemmatization", Lemmatization)
# Stemming-based variant.
pipe_processing_stem = _build_preprocessing_pipeline("Stemmatization", Stemmatization)

* ### <strong>  pré-processing  </strong>

In [14]:
# Switches for the expensive preprocessing runs: True recomputes the cleaned
# splits and writes them to CSV, False reloads the previously saved CSV files.
preprocess = dict(lemma=False, stem=False)

* * ### <strong>  Lemmatisation  </strong>

In [15]:
# Lemmatised splits: recompute and save them to CSV when the switch is on,
# otherwise reload the saved files.
# NOTE(review): the recompute branch keeps the original row index, while the
# reload branch gets a fresh default index from read_csv — confirm downstream
# code does not rely on the index.
if preprocess["lemma"]:
    lemma_splits = {}
    # Processing order: validation, test, then train.
    for split_name, raw_split in (
        ("validation", validation_data),
        ("test", test_data),
        ("train", train_data),
    ):
        cleaned_split = raw_split.copy()
        cleaned_split["text"] = pipe_processing_lemma.transform(raw_split.text)
        # Keep only tweets that still hold more than one spaCy token after cleaning.
        has_several_tokens = cleaned_split["text"].apply(lambda tweet: len(nlp(tweet)) > 1)
        cleaned_split = cleaned_split[has_several_tokens]
        cleaned_split.to_csv(f"{split_name}_data_lemma.csv", index=False)
        lemma_splits[split_name] = cleaned_split
else:
    lemma_splits = {
        split_name: pd.read_csv(f"{split_name}_data_lemma.csv")
        for split_name in ("train", "validation", "test")
    }

train_data_lemma = lemma_splits["train"]
validation_data_lemma = lemma_splits["validation"]
test_data_lemma = lemma_splits["test"]


* * ### <strong>  Stemmatisation  </strong>

In [16]:
# Stemmed splits: recompute and save them to CSV when the switch is on,
# otherwise reload the saved files.
# NOTE(review): the recompute branch keeps the original row index, while the
# reload branch gets a fresh default index from read_csv — confirm downstream
# code does not rely on the index.
if preprocess["stem"]:
    stem_splits = {}
    # Processing order: validation, test, then train.
    for split_name, raw_split in (
        ("validation", validation_data),
        ("test", test_data),
        ("train", train_data),
    ):
        cleaned_split = raw_split.copy()
        cleaned_split["text"] = pipe_processing_stem.transform(raw_split.text)
        # Keep only tweets that still hold more than one spaCy token after cleaning.
        has_several_tokens = cleaned_split["text"].apply(lambda tweet: len(nlp(tweet)) > 1)
        cleaned_split = cleaned_split[has_several_tokens]
        cleaned_split.to_csv(f"{split_name}_data_stem.csv", index=False)
        stem_splits[split_name] = cleaned_split
else:
    stem_splits = {
        split_name: pd.read_csv(f"{split_name}_data_stem.csv")
        for split_name in ("train", "validation", "test")
    }

train_data_stem = stem_splits["train"]
validation_data_stem = stem_splits["validation"]
test_data_stem = stem_splits["test"]