# Pretraitement des données avant de passer au NLP

## Import des bibliothèques

In [11]:
import pandas as pd 
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize  import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy

nlp = spacy.load('en_core_web_sm')

## Charger les resources

In [4]:
# Tokenizer => vectorizer => transformer en vecteurs numerique
nltk.download('punkt')
#Stopwords => mots à supprimer
nltk.download('stopwords')
#Lemmatizer => se remettre au radical
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rymkm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rymkm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rymkm\AppData\Roaming\nltk_data...


True

## Charger le dataset

In [8]:
df = pd.read_csv('Youtube05-Shakira.csv')
df

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z13lgffb5w3ddx1ul22qy1wxspy5cpkz504,dharma pal,2015-05-29T02:30:18.971000,Nice song﻿,0
1,z123dbgb0mqjfxbtz22ucjc5jvzcv3ykj,Tiza Arellano,2015-05-29T00:14:48.748000,I love song ﻿,0
2,z12quxxp2vutflkxv04cihggzt2azl34pms0k,Prìñçeśś Âliś Łøvê Dømíñø Mâđiś™ ﻿,2015-05-28T21:00:08.607000,I love song ﻿,0
3,z12icv3ysqvlwth2c23eddlykyqut5z1h,Eric Gonzalez,2015-05-28T20:47:12.193000,"860,000,000 lets make it first female to reach...",0
4,z133stly3kete3tly22petvwdpmghrlli,Analena López,2015-05-28T17:08:29.827000,shakira is best for worldcup﻿,0
...,...,...,...,...,...
365,_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0
366,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0
367,_2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs,jeffrey jules,2013-07-13T12:09:31.188000,wow,0
368,_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0,Aishlin Maciel,2013-07-13T11:17:52.308000,Shakira u are so wiredo,0


## Netooyer le dataset

In [28]:
#Supprimer les lignes 
df =df.dropna(subset=['CONTENT'])
#Supprimer les doublons
df = df.drop_duplicates(subset=['CONTENT'])

## Fonction pour tokenizer, enlever les stopwords, lemmatizer ...etc

In [12]:
#fonction pour nettoyer, tokenizer, lemmatiser  et stemmer
def process_text(text):
    #convertir en minuscules
    text = text.lower()
    #Supprimer les caractères non alphabétiques
    text=re.sub(r'[^a-zA-Z\s]','',text)
    #Enlever html
    text=re.sub(r'\d+','',text)
    #supprimer la ponctuation
    text=re.sub(r'[^\w\s]', '', text)
    #Tokenisation avec NLTK
    tokens= word_tokenize(text)
    #supprimer les stopwords
    stop_words = set(stopwords.words('english'))
    tokens=[word for word in tokens if word not in stop_words]
    #lemmatisation avec spacy
    doc = nlp(" ".join(tokens))
    lemmatized_tokens=[token.lemma_ for token in doc]
    #stemming 
    stemmer = PorterStemmer()
    stemmer_tokens = [stemmer.stem(token) for token in lemmatized_tokens]
    #Rejoindre les tokens en une chaine
    processed_text = ''.join(stemmer_tokens)
    return  processed_text

**Cerner la colonne à cleaner

In [14]:
columns_to_clean = ['CONTENT']
#Appliquer la fonction à cette colonne
for column in columns_to_clean:
    df[column + '_processed']= df[column].apply(process_text)

#Exporter les données clean en csv sans les indices
df.to_csv("SHAKIRA1.csv",index=False)

In [15]:
df2 =pd.read_csv("SHAKIRA1.csv")

In [17]:
df3 = df2[df2['CLASS']==1]
df3

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,CONTENT_processed
6,z12uujnj2sifvzvav04chpypvofvexpoggg,Sudheer Yadav,2015-05-28T10:28:25.133000,SEE SOME MORE SONG OPEN GOOGLE AND TYPE Shakir...,1,seesongopengoogltypeshakiraguruofmovi
21,z13zjlpo2nbehxwf322gelhzwmqwgn1mt,Raafat saeed,2015-05-27T04:19:29.178000,Check out this playlist on YouTube:﻿,1,checkplaylistyoutub
29,z13uhhxp5nvig15yc04citszvtagwtmpqcc,Terry Short,2015-05-26T14:33:52.496000,Support the fight for your 4th amendment right...,1,supportfightthamendrightprivacihomestopnsaspia...
34,z13gv1bxbuytgjl3o23fdr5r3kaadbbm1,‫حلم الشباب‬‎,2015-05-25T23:42:49.533000,Check out this video on YouTube:﻿,1,checkvideoyoutub
49,z12bfraboyajftgbz04ccbkr3xjxfxyxsew,Abdullah Fawzi,2015-05-25T06:25:22.319000,"coby this USL and past :<br /><a href=""http://...",1,cobiuslpastbrhrefhttpadflyhttpadflyahmvtxbrdel...
...,...,...,...,...,...,...
316,z12xc3ly4x3uttmci22xff24nqqxwb0je04,Lisa Matthews,2013-07-17T13:56:03.233000,Check out this video on YouTube:<br />&quot;Th...,1,checkvideoyoutubebrquotthitimeafricaquotonetra...
318,_2viQ_Qnc69GH3FQl348HonbRxpbmtsR5CUei0zkJog,Riley Rollins,2013-07-16T00:30:46.660000,"O peoples of the earth, I have seen how you pe...",1,peoplearthseeperformeveriformevilleisurceasrev...
321,_2viQ_Qnc6-qHJ_u9Yv84vj4yOAPLUL3ZibCc7b-vBI,FAHAD KHAN,2013-07-14T22:06:57.712000,I WILL NEVER FORGET THIS SONG IN MY LIFE LIKE ...,1,neverforgetsonglifelikecommenthearsonglikeyear
322,_2viQ_Qnc6_HU65mTzCmXnjA-WLt7XqxqPj7EwAtlO0,ricky swaggz,2013-07-14T20:40:00.331000,********OMG Facebook is OLD! Check out ------...,1,omgfacebookoldcheckgtswagfriendcommakethousand...


## Comparaison des deux datasets (ancien et le clean)

In [18]:
shape_original = df.shape
print(shape_original)


(331, 6)


In [21]:
shape_clean = df3.shape
print(shape_clean)

(150, 6)


**On remarque le nombre de lignes a diminué (sans les doublons et les nan)**

### **On peut calculer la distance pour avoir des information sur la similarité entre les data clean et brutes**

In [26]:
import gensim
from gensim.models import Word2Vec
import sklearn
from  sklearn.manifold import TSNE


## Declaration du model 

In [27]:
model = Word2Vec(df2["CONTENT"],vector_size=100,)
model2 = Word2Vec(df2["CONTENT_processed"])

TypeError: 'float' object is not iterable