In [203]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras import Input
from keras.models import Sequential, Model
from keras.layers import Conv2D, Conv1D, Dropout, Dense, Embedding, Flatten, Reshape, Multiply, Lambda, UpSampling1D, MaxPooling1D
from keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

<h1>Traitement des données</h1>

In [204]:
# chargement des deux tables
real_news = pd.read_csv("./dataset1/True.csv")
fake_news = pd.read_csv("./dataset1/Fake.csv")

In [205]:
# vrai = 1, faux = 0
real_news["label"] = 1
fake_news["label"] = 0

In [206]:
# création du dataset complet
dataframe = pd.concat([real_news, fake_news])
dataframe.sample(10)

Unnamed: 0,title,text,subject,date,label
10950,Vice President Biden to go to Mexico City Feb....,WASHINGTON (Reuters) - U.S. Vice President Joe...,politicsNews,"February 5, 2016",1
8771,Clinton says Trump is most divisive candidate ...,In a speech weighted with America’s complicate...,politicsNews,"July 13, 2016",1
5897,Sudan summons U.S. charge d'affaires over Trum...,CAIRO (Reuters) - Sudan summoned the U.S. char...,politicsNews,"January 29, 2017",1
188,FBI officials said Clinton 'has to win' race t...,(Reuters) - Senior FBI officials who helped pr...,politicsNews,"December 13, 2017",1
569,Trump Lashes Out At Black CEO For Resigning F...,Donald Trump s stunning neglect to disavow the...,News,"August 14, 2017",0
18711,"With Budapest closer to Moscow, Orban grants m...","ZALAVAR, Hungary (Reuters) - Twice a month a f...",worldnews,"September 29, 2017",1
592,Trump Comes For McConnell AGAIN With Latest S...,Donald Trump seems to think that he can bully ...,News,"August 10, 2017",0
395,A Florida Pizza Hut To Irma-Fleeing Employees...,Residents in Florida who work at a Pizza Hut w...,News,"September 11, 2017",0
19128,Qatar Foreign Minister: blockade pushing it cl...,PARIS (Reuters) - An economic blockade on Qata...,worldnews,"September 25, 2017",1
4861,Trump Adviser Confirms: Trump’s Immigration P...,It s been a hell of a week for the Trump campa...,News,"August 28, 2016",0


In [207]:
print(f"Nombre de références : {dataframe.title.count()}")
print(f"Nombre de fake news : {fake_news.title.count()}")
print(f"Nombre de vraies news : {real_news.title.count()}")

Nombre de références : 44898
Nombre de fake news : 23481
Nombre de vraies news : 21417


In [208]:
# ici on ne s'intéresse qu'au titre et au label
del dataframe["text"]
del dataframe["subject"]
del dataframe["date"]

In [209]:
dataframe.sample(10)

Unnamed: 0,title,label
258,House Intelligence chairman cleared of disclos...,1
5760,Trump's EPA pick vote delayed in boycott by Se...,1
11182,Immigration case could hurt Republican outreac...,1
10364,"WOMAN HOSPITALIZED, UNABLE TO FEEL HER LEGS Af...",0
15425,Two children killed as car crashes into Austra...,1
21163,Muslim pilgrims converge on Jamarat for symbol...,1
10978,"RADICAL “TOLERANT” FEMALE Black Bloc, Antifa L...",0
18407,UNHINGED MIKA Called President Trump “Not Well...,0
7522,Trump Gets Snippy After Being Asked To Dial D...,0
2724,Threatening note left at senator's office amid...,1


In [210]:
# nettoyage des données (ponctuations)

#stopWords = set(stopwords.words("english"))

def cleanText(text):
    forbidden = {",","@",";","/","-",":",".","!","?", "#","\"","(",")","\'","’","‘","–",".","&"}
    res = str(text)
    if res != None:
        for elm in forbidden:
            res = res.replace(elm, "")
    if len(res.split()) >= 30:
        res = " ".join(res.split()[0:30])
    if res != None:
        for elm in forbidden:
            res = res.replace("  ", " ")
    return res

dataframe["title"] = dataframe["title"].apply(cleanText)

In [211]:
dataframe["title"]

0        As US budget fight looms Republicans flip thei...
1        US military to accept transgender recruits on ...
2        Senior US Republican senator Let Mr Mueller do...
3        FBI Russia probe helped by Australian diplomat...
4        Trump wants Postal Service to charge much more...
                               ...                        
23476    McPain John McCain Furious That Iran Treated U...
23477    JUSTICE Yahoo Settles Email Privacy Classactio...
23478    Sunnistan US and Allied Safe Zone Plan to Take...
23479    How to Blow $700 Million Al Jazeera America Fi...
23480    10 US Navy Sailors Held by Iranian Military Si...
Name: title, Length: 44898, dtype: object

In [212]:
# on sépare les données en données d'entraînement et données de test (80% et 20%)
x_train, x_test, y_train, y_test = train_test_split(dataframe["title"], dataframe["label"], test_size=0.10, random_state = 42)
print(f"Données d'entrainement : {len(x_train)}")
print(f"Données de test : {len(x_test)}")

Données d'entrainement : 40408
Données de test : 4490


In [213]:
max_features = 10000 # taille max du vocab
maxlen = 15 # taille max de séquence

In [214]:
# vectorisation naïve en "one-hot"
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(dataframe["title"])

In [215]:
# vectorisation des données d'entraînement
x_train = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(x_train, maxlen=maxlen)
y_train = np.array(y_train)

In [216]:
# vectorisation des données de test
x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=maxlen)
y_test = np.array(y_test)

In [217]:
batch_size = 256
nb_epochs = 20
embedded_dim = 100
latent_dim = 100
kernel_size = 10

In [218]:
def generate_latent_points(latent_dim, nbpoints):
    return np.random.uniform(0, 1, size=[nbpoints, latent_dim])

def getLatentSamples(latent_dim, n):
    labels = np.zeros(shape=n)
    samples = generate_latent_points(latent_dim, n)
    
    return samples, labels

def getFakeSamples(generator, latent_dim, n):
    labels = np.ones(shape=n)
    
    latent_points = generate_latent_points(latent_dim, n)
    samples = generator.predict(latent_points)
    
    return samples, labels

def getRealSamples(X, Y, n):
    random_indices = np.random.randint(0, X.shape[0], n)

    samples = X[random_indices]
    labels = Y[random_indices]
    
    return samples, labels

def generateFakeNews(model, n):
    few_points = generate_latent_points(latent_dim, n)
    predictions = generator.predict(few_points)
    fake_news = tokenizer.sequences_to_texts(np.round(predictions))
    
    return fake_news

In [219]:
def create_generator(dim):
    
    input_layer = Input(shape=[dim])
    
    x = Dense(maxlen, input_shape=[dim])(input_layer)
    
    x = Reshape((maxlen, 1))(x)
    x = Conv1D(128, kernel_size, padding="same")(x)
    x = UpSampling1D()(x)
    x = Conv1D(64, kernel_size, padding="same")(x)
    x = UpSampling1D()(x)
    x = Conv1D(32, kernel_size, padding="same")(x)
    
    x = Flatten()(x)
    x = Dense(maxlen, activation="sigmoid")(x)
    
    output_layer = Lambda(lambda x: x * float(max_features))(x)

    model = Model(input_layer, output_layer)
    model.compile(loss="mse", optimizer=Adam(lr=0.0002, beta_1=.5))
    
    model.summary()
    
    return model

def create_discriminator():

    input_layer = Input(shape=[maxlen])
    
    #x = Embedding(max_features, output_dim=embedded_dim, input_length=maxlen, trainable=True, input_shape=[maxlen])(input_layer)
    x = Dense(embedded_dim * maxlen)(input_layer)
    x = Reshape((maxlen, embedded_dim))(x)
    x = Conv1D(128, kernel_size, padding="same")(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(64, kernel_size, padding="same")(x)
    x = MaxPooling1D(2)(x)
    x = Dropout(0.1)(x)
    x = Conv1D(32, kernel_size, padding="same")(x)
    x = MaxPooling1D(2)(x)
    x = Flatten()(x)
    
    output_layer = Dense(1, activation="softmax")(x)
    
    model = Model(input_layer, output_layer)
    
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=.5))
    model.summary()
    
    return model

def create_gan(generator, discriminator, latent_dim):
    
    input_layer = Input(shape=[latent_dim])
    
    x = generator(input_layer)
    
    output_layer = discriminator(x)
    
    model = Model(input_layer, output_layer)

    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=.5))
    model.summary()
    
    return model

In [220]:
discriminator = create_discriminator()
generator = create_generator(latent_dim)
gan = create_gan(generator, discriminator, latent_dim)

Model: "model_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_28 (InputLayer)        (None, 15)                0         
_________________________________________________________________
dense_37 (Dense)             (None, 1500)              24000     
_________________________________________________________________
reshape_19 (Reshape)         (None, 15, 100)           0         
_________________________________________________________________
conv1d_54 (Conv1D)           (None, 15, 128)           128128    
_________________________________________________________________
max_pooling1d_25 (MaxPooling (None, 7, 128)            0         
_________________________________________________________________
conv1d_55 (Conv1D)           (None, 7, 64)             81984     
_________________________________________________________________
max_pooling1d_26 (MaxPooling (None, 3, 64)             0  

In [221]:
def train(generator_model, discriminator_model, gan_model, nb_epochs, batch_size):
    discriminator_losses = []
    
    generator_losses = []
    
    for i in range(nb_epochs):

        x_fake, y_fake = getFakeSamples(generator_model, latent_dim, batch_size)
        x_real, y_real = getRealSamples(x_train, y_train, batch_size)
        
        discriminator_model.trainable = True

        loss1 = discriminator_model.train_on_batch(x_real, y_real)
        loss2 = discriminator_model.train_on_batch(x_fake, y_fake)
        loss3 = (loss1 + loss2) / 2
        
        discriminator_model.trainable = False
        
        x_gan, y_gan = getLatentSamples(latent_dim, batch_size)        
        
        loss_gan = gan_model.train_on_batch(x_gan, y_gan)

        discriminator_losses.append(loss3)
        
        generator_losses.append(loss_gan)
        
        print(f"Epoch {i}  ;  Discriminator loss : {loss3}   ;    Generator loss : {loss_gan}", end="\r")
        if i % 100 == 0:
            print("\n")
            print(generateFakeNews(generator_model, 1))
        
    return discriminator_losses, generator_losses

In [None]:
DL, GL = train(generator, discriminator, gan, 50000, batch_size)

  'Discrepancy between trainable weights and collected trainable'


Epoch 0  ;  Discriminator loss : 35.32588577270508   ;    Generator loss : 1154.1905517578125

['tribal learn toddler cultural tillersons dissidents isolated chiles awarded newest squad petty fireworks struggling advocate']
Epoch 100  ;  Discriminator loss : 8.874604225158691   ;    Generator loss : 1.2377797365188599

['room antitrump time she border ted cnn their turkey lawmakers nyc makes backs brexit hell']
Epoch 200  ;  Discriminator loss : 4.3781633377075195   ;    Generator loss : 1.1954911947250366

['into south army attack show plan military who gop america chief republican more wont wont']
Epoch 300  ;  Discriminator loss : 5.002150535583496   ;    Generator loss : 1.20067250728607185

['gop as for as trump the court trump us after at video to on in']
Epoch 400  ;  Discriminator loss : 3.226269245147705   ;    Generator loss : 1.04040384292602546

['they house is minister as with a and for with fbi video in on trump']
Epoch 500  ;  Discriminator loss : 4.329286098480225   ;  

In [None]:
few_points = generate_latent_points(latent_dim, 10)
predictions = generator.predict(few_points)
#print(predictions)
print(tokenizer.sequences_to_texts(np.round(predictions)))

In [None]:
# art plastique du turfu featuring le poto matplotlib
from matplotlib import pyplot as plt

In [None]:
figure, ax = plt.subplots(1, 2)
figure.set_size_inches(20,10)

ax[0].plot(DL)

ax[1].plot(GL)

plt.show()

In [None]:
to_predict = dataframe.sample(5)

In [None]:
X_test = to_predict["title"]
X_test

In [None]:
Y_test = to_predict["label"]
Y_test

In [None]:
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=maxlen)
X_test[0]

In [None]:
model.predict(X_test)

In [None]:
X_test

In [None]:
generator.save("generator.h5")
discriminator.save("discriminator.h5")
tokenizerJSON = tokenizer.to_json()

with open("gan_tokenizer.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(tokenizerJSON, ensure_ascii=False))