## les bibliothèques

In [19]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import pandas as pd
import spacy 

## Import des données 

In [24]:
df = pd.read_csv('data_new_fake_theonion.csv')

In [25]:

df

Unnamed: 0.1,Unnamed: 0,articles
0,0,"DAYTON, OH—While greeting the crowd at a campa..."
1,1,Presidential candidate Donald Trump recently r...
2,2,And the RNC is going to pay for it. It probabl...
3,3,"VANDALIA, OH—Drawing criticism for what many c..."
4,4,WASHINGTON—Her mind spinning as she poured ove...
...,...,...
1705,1705,WASHINGTON—Claiming it felt queasy just thinki...
1706,1706,"WALDPORT, OR—A team of anthropologists announc..."
1707,1707,Donald Trump has stated publicly multiple time...
1708,1708,WASHINGTON—Election boards across the country ...


## Nettoyage des données 

In [26]:
# Tokenizer => vectorizer => transformer en vecteurs numerique
nltk.download('punkt')
#Stopwords => mots à supprimer
nltk.download('stopwords')
#Lemmatizer => se remettre au radical
nltk.download('wordnet')

nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rymkm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rymkm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rymkm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Nettoyage des données**
reduire la dim enleve les prefix et suffixes => modele plus performant 

In [31]:
#fonction pour nettoyer, tokenizer, lemmatiser  et stemmer
def process_text(text):
    #convertir en minuscules
    text = text.lower()
    #Supprimer les caractères non alphabétiques
    text=re.sub(r'[^a-zA-Z\s]','',text)
    #Enlever html
    text=re.sub(r'\d+','',text)
    #supprimer la ponctuation
    text=re.sub(r'[^\w\s]', '', text)
    #Tokenisation avec NLTK
    tokens= word_tokenize(text)
    #supprimer les stopwords
    stop_words = set(stopwords.words('english'))
    tokens=[word for word in tokens if word not in stop_words]
    #lemmatisation avec spacy
    doc = nlp(" ".join(tokens))
    lemmatized_tokens=[token.lemma_ for token in doc]
    #stemming 
    stemmer = PorterStemmer()
    stemmer_tokens = [stemmer.stem(token) for token in lemmatized_tokens]
    #Rejoindre les tokens en une chaine
    processed_text = ' '.join(stemmer_tokens)
    return  processed_text

In [32]:
df['processed_text'] = df['articles'].apply(process_text)
df['processed_text'][0]

'dayton ohwhil greet crowd campaign ralli thursday former presid donald trump see kiss support burger accord sourc attend well who s juici littl guy ask gop presidenti candid reportedli lift fulli load flamebroil beef hamburg hand press lip sesam seed bun emit audibl muah ounc wish could vote you re ador littl tasti burger be not be not preciou could take big bite right remind sandwich back home sourc confirm trump handler eventu interven would releas burger grip let ralli attende finish eat'

## Word2Vect

In [35]:
import gensim
x_tokenized= [sentence.split() for sentence in df['processed_text']]
model = gensim.models.Word2Vec(x_tokenized,min_count=100,window=5,vector_size=100)
len(model.wv["effort"])


100