In [75]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras import Input
from keras.models import Sequential, Model
from keras.layers import Conv2D, Conv1D, Dropout, Dense, Embedding, Flatten, Reshape, Multiply, Lambda, UpSampling1D, MaxPooling1D, LeakyReLU
from keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

<h1>Traitement des données</h1>

In [76]:
# chargement des deux tables
real_news = pd.read_csv("./dataset1/True.csv")
fake_news = pd.read_csv("./dataset1/Fake.csv")

In [77]:
# vrai = 1, faux = 0
real_news["label"] = 1
fake_news["label"] = 0

In [78]:
# création du dataset complet
dataframe = pd.concat([real_news, fake_news])
dataframe.sample(10)

Unnamed: 0,title,text,subject,date,label
20526,"New Kosovo PM pledges dialogue with Serbia, gr...",PRISTINA (Reuters) - Kosovo s newly-elected pr...,worldnews,"September 9, 2017",1
5125,Trump administration to propose 'dramatic redu...,WASHINGTON (Reuters) - The White House budget ...,politicsNews,"March 4, 2017",1
9178,IT JUST GOT REAL! GOP Rep. Jim Jordan Tells Ju...,One of the big players in trying to get to the...,politics,"Dec 17, 2017",0
15242,WOW! DONALD TRUMP HAMMERS OBAMA ON HIS “Terrib...,BRAVO and spot on!Click on picture below to wa...,politics,"Sep 2, 2015",0
7328,Michele Bachmann Believes God Was Punishing O...,While most of us probably wish former Congress...,News,"March 23, 2016",0
14364,Zimbabwe broadcaster on stand-by for address b...,HARARE (Reuters) - Zimbabwe s state broadcaste...,worldnews,"November 20, 2017",1
10959,LAS VEGAS: ILLEGAL ALIEN ARRESTED For Filming ...,Did you know that in the state of Texas alone ...,politics,"May 6, 2017",0
20947,CONGRESS JUST DEALT A BIG BLOW To Obama And Hi...,Obama has shown favoritism towards the Muslim ...,left-news,"Feb 26, 2016",0
8507,"Icahn praises Trump economic plan, says candid...",NEW YORK (Reuters) - Billionaire activist inve...,politicsNews,"August 9, 2016",1
7642,Clinton leads Trump 42 to 36 percent as he los...,NEW YORK (Reuters) - Democrat Hillary Clinton’...,politicsNews,"October 28, 2016",1


In [79]:
print(f"Nombre de références : {dataframe.title.count()}")
print(f"Nombre de fake news : {fake_news.title.count()}")
print(f"Nombre de vraies news : {real_news.title.count()}")

Nombre de références : 44898
Nombre de fake news : 23481
Nombre de vraies news : 21417


In [80]:
# ici on ne s'intéresse qu'au titre et au label
del dataframe["text"]
del dataframe["subject"]
del dataframe["date"]

In [81]:
dataframe.sample(10)

Unnamed: 0,title,label
9883,WATCH: RACIST RAPPER WHO HUNG WHITE KID In Lat...,0
1774,Conservatives ADMIT They Incited Violence At ...,0
17518,WATCH: “IT’S THE MOST WONDERFUL TIME OF THE YE...,0
19860,SARAH JESSICA PARKER FEARS She’ll Be Attacked ...,0
19707,CNN FIRES BLACK DEM Party Chair: New Wikileaks...,0
19391,China calls for restraint when asked about Nor...,1
8592,"U.S., Cuba hold 'substantive' second round tal...",1
893,Presidential Lawyer Comes Forward; Says Trump...,0
1401,Russian Investigation Landing Very Close To T...,0
8757,U.S. lawmakers introduce bill to criminalize ‘...,1


In [82]:
# nettoyage des données (ponctuations)

#stopWords = set(stopwords.words("english"))

def cleanText(text):
    forbidden = {",","@",";","/","-",":",".","!","?", "#","\"","(",")","\'","’","‘","–",".","&"}
    
    if res is None:
        return ""
    res = str(text)
    
    for elm in forbidden:
        res = res.replace(elm, "")
    if len(res.split()) >= 30:
        res = " ".join(res.split()[0:30])
    for elm in forbidden:
        res = res.replace("  ", " ")
    return res

#dataframe["title"] = dataframe["title"].apply(cleanText)

In [83]:
dataframe["title"]

0        As U.S. budget fight looms, Republicans flip t...
1        U.S. military to accept transgender recruits o...
2        Senior U.S. Republican senator: 'Let Mr. Muell...
3        FBI Russia probe helped by Australian diplomat...
4        Trump wants Postal Service to charge 'much mor...
                               ...                        
23476    McPain: John McCain Furious That Iran Treated ...
23477    JUSTICE? Yahoo Settles E-mail Privacy Class-ac...
23478    Sunnistan: US and Allied ‘Safe Zone’ Plan to T...
23479    How to Blow $700 Million: Al Jazeera America F...
23480    10 U.S. Navy Sailors Held by Iranian Military ...
Name: title, Length: 44898, dtype: object

In [84]:
# on sépare les données en données d'entraînement et données de test (80% et 20%)
x_train, x_test, y_train, y_test = train_test_split(dataframe["title"], dataframe["label"], test_size=0.10, random_state = 42)
print(f"Données d'entrainement : {len(x_train)}")
print(f"Données de test : {len(x_test)}")

Données d'entrainement : 40408
Données de test : 4490


In [85]:
max_features = 2000 # taille max du vocab
maxlen = 16 # taille max de séquence

In [86]:
# vectorisation naïve en "one-hot"
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(dataframe["title"])

In [87]:
# vectorisation des données d'entraînement
x_train = tokenizer.texts_to_sequences(x_train)
#x_train = tokenizer.sequences_to_matrix(x_train)
x_train = pad_sequences(x_train, maxlen=maxlen)
y_train = np.array(y_train)

In [88]:
def to_oneHotSentence(sentence, maxfeatures):
    oneHotSentence = []
    for word in sentence:
        oneHotWord = [0 for _ in range(maxfeatures)]
        oneHotWord[word] = 1
        oneHotSentence.append(oneHotWord)
    return oneHotSentence

def to_oneHotCorpus(corpus, maxfeatures):
    oneHotCorpus = []
    for sentence in corpus:
        oneHotCorpus.append(to_oneHotSentence(sentence, maxfeatures))
    return np.array(oneHotCorpus)

def from_oneHotWord(oneHotWord):
    for i in range(len(oneHotWord)):
        if oneHotWord[i] > 0:
            return i
    return 0

def from_oneHotSentence(oneHotSentence):
    sentence = []
    for oneHotWord in oneHotSentence:
        sentence.append(from_oneHotWord(oneHotWord))
    return sentence
        
def from_oneHotCorpus(oneHotCorpus):
    return np.argmax(oneHotCorpus, axis=2)


#x_train = to_oneHotCorpus(x_train, max_features)
#x_train = from_oneHotCorpus(x_train)

In [89]:
# vectorisation des données de test
x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=maxlen)
#x_test = to_oneHotCorpus(x_test, max_features)
y_test = np.array(y_test)

In [90]:
batch_size = 256
nb_epochs = 20
embedded_dim = 100
latent_dim = 100
kernel_size = 5

In [91]:
def generate_latent_points(latent_dim, nbpoints):
    return np.random.uniform(0, 1, size=[nbpoints, latent_dim])

def getLatentSamples(latent_dim, n):
    labels = np.zeros(shape=n)
    samples = generate_latent_points(latent_dim, n)
    
    return samples, labels

def getFakeSamples(generator, latent_dim, n):
    labels = np.ones(shape=n)
    
    latent_points = generate_latent_points(latent_dim, n)
    samples = generator.predict(latent_points)
    
    return samples, labels

def getRealSamples(X, Y, n):
    random_indices = np.random.randint(0, X.shape[0], n)

    samples = X[random_indices]
    labels = Y[random_indices]
    
    return samples, labels

def generateFakeNews(model, n):
    few_points = generate_latent_points(latent_dim, n)
    predictions = generator.predict(few_points)
    fake_news = np.round(predictions)#np.argmax(predictions, axis=2)
    fake_news = tokenizer.sequences_to_texts(fake_news)
    
    return fake_news

In [92]:
def create_generator(dim):
    
    input_layer = Input(shape=[dim])
    
    x = Dense(4 * 64, input_shape=[dim])(input_layer)
    
    x = Reshape((4, 64))(x)
    
    x = Conv1D(64, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(64, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(64, kernel_size, padding="same", activation="softmax")(x)
    #x = LeakyReLU(alpha=0.2)(x)
    x = UpSampling1D()(x)
    
    x = Conv1D(128, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(128, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(128, kernel_size, padding="same", activation="softmax")(x)
    #x = LeakyReLU(alpha=0.2)(x)
    x = UpSampling1D()(x)
    
    x = Conv1D(256, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(256, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(256, kernel_size, padding="same", activation="softmax")(x)
    #x = LeakyReLU(alpha=0.2)(x)
    x = UpSampling1D()(x)
    
    x = Conv1D(512, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(512, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(512, kernel_size, padding="same", activation="softmax")(x)
    #x = LeakyReLU(alpha=0.2)(x)
    
    x = Conv1D(1024, kernel_size, padding="same", activation="softmax")(x)
    x = Conv1D(1024, kernel_size, padding="same", activation="softmax")(x)
    #x = LeakyReLU(alpha=0.2)(x)
    x = Flatten()(x)
    
    x = Dense(1024)(x)
    
    x = Dense(maxlen, activation="softmax")(x)
    
    output_layer = Lambda(lambda x: x * max_features)(x)
    
    model = Model(input_layer, output_layer)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0005, beta_1=.5))
    
    model.summary()
    
    return model

def create_discriminator():

    input_layer = Input(shape=[maxlen])
    
    x = Lambda(lambda x: x / max_features)(input_layer)
    
    x = Dense(256)(x)
    x = Reshape((256, 1))(x)
    x = Conv1D(128, kernel_size, padding="same")(x)
    x = MaxPooling1D()(x)
    x = Conv1D(64, kernel_size, padding="same")(x)
    x = MaxPooling1D()(x)
    #x = Dropout(0.1)(x)
    x = Conv1D(32, kernel_size, padding="same")(x)
    
    x = Flatten()(x)
    
    output_layer = Dense(1, activation="sigmoid")(x)
    
    model = Model(input_layer, output_layer)
    
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=.5))
    model.summary()
    
    return model

def create_gan(generator, discriminator, latent_dim):
    
    input_layer = Input(shape=[latent_dim])
    
    x = generator(input_layer)
    
    output_layer = discriminator(x)
    
    model = Model(input_layer, output_layer)

    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=.5))
    
    return model

In [93]:
discriminator = create_discriminator()
generator = create_generator(latent_dim)
gan = create_gan(generator, discriminator, latent_dim)

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 16)                0         
_________________________________________________________________
lambda_7 (Lambda)            (None, 16)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 256)               4352      
_________________________________________________________________
reshape_7 (Reshape)          (None, 256, 1)            0         
_________________________________________________________________
conv1d_38 (Conv1D)           (None, 256, 128)          768       
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 128, 128)          0         
_________________________________________________________________
conv1d_39 (Conv1D)           (None, 128, 64)           410

In [94]:
def train(generator_model, discriminator_model, gan_model, nb_epochs, batch_size):
    discriminator_losses = []
    
    generator_losses = []
    
    for i in range(nb_epochs):

        x_fake, y_fake = getFakeSamples(generator_model, latent_dim, batch_size)
        x_real, y_real = getRealSamples(x_train, y_train, batch_size)
        
        discriminator_model.trainable = True

        loss1 = discriminator_model.train_on_batch(x_real, y_real)
        loss2 = discriminator_model.train_on_batch(x_fake, y_fake)
        loss3 = (loss1 + loss2) / 2
        
        discriminator_model.trainable = False
        
        x_gan, y_gan = getLatentSamples(latent_dim, batch_size)        
        
        loss_gan = gan_model.train_on_batch(x_gan, y_gan)

        discriminator_losses.append(loss3)
        
        generator_losses.append(loss_gan)
        
        print(f"Epoch {i}  ;  Discriminator loss : {loss3}   ;    Generator loss : {loss_gan}", end="\r")
        if i % 100 == 0:
            print("\n")
            print(generateFakeNews(generator_model, 1))
        
    return discriminator_losses, generator_losses

In [None]:
DL, GL = train(generator, discriminator, gan, 15000, batch_size)

  'Discrepancy between trainable weights and collected trainable'


Epoch 0  ;  Discriminator loss : 0.6967480182647705   ;    Generator loss : 0.6891824007034302

['all all all all all all all all all all all speech all democrat all all']
Epoch 100  ;  Discriminator loss : 0.5360958576202393   ;    Generator loss : 1.0797394514083862

["to to to u no u he iran's video to to to to to to"]
Epoch 200  ;  Discriminator loss : 0.5230628252029419   ;    Generator loss : 1.1512376070022583

['just with to quake trump to']
Epoch 300  ;  Discriminator loss : 0.522312343120575   ;    Generator loss : 1.13069736957550057

['to of huge video on to fails to to']
Epoch 400  ;  Discriminator loss : 0.5228368043899536   ;    Generator loss : 1.1469784975051884

['she its turkey of for to after boiler warns in of to israel']
Epoch 500  ;  Discriminator loss : 0.5402370691299438   ;    Generator loss : 1.1176868677139282

['it hillary an u being brilliant obama says president shows time case goes u obama trump']
Epoch 600  ;  Discriminator loss : 0.5293115377426147   ;

Epoch 4800  ;  Discriminator loss : 0.42816436290740967   ;    Generator loss : 1.5953807830810547

['to to u in over his parliament rules healthcare against you bernie voters obama world']
Epoch 4900  ;  Discriminator loss : 0.4689466953277588   ;    Generator loss : 1.61586081981658943

['video trump hillary says news secretary election up this crazy fox watch black ahead you video']
Epoch 5000  ;  Discriminator loss : 0.4447019100189209   ;    Generator loss : 1.74902367591857984

['this trump funds watch her video video in s a over trump u s to tucker']
Epoch 5100  ;  Discriminator loss : 0.44322994351387024   ;    Generator loss : 1.5858870744705202

['to to a video and is but twitter ted says one gop migrant about who trump']
Epoch 5200  ;  Discriminator loss : 0.4558199942111969   ;    Generator loss : 1.51682257652282714

['to to for on obama about black israel pro black german former gop he four to']
Epoch 5300  ;  Discriminator loss : 0.45707112550735474   ;    Generator loss

Epoch 9444  ;  Discriminator loss : 0.3643667995929718   ;    Generator loss : 2.40490221977233997

In [None]:
few_points = generate_latent_points(latent_dim, 10)
predictions = generator.predict(few_points)

print(predictions[3])

#predictions = np.argmax(predictions, axis=2)

print(predictions[3])

print(tokenizer.sequences_to_texts(predictions))

In [None]:
# art plastique du turfu featuring le poto matplotlib
from matplotlib import pyplot as plt

In [None]:
figure, ax = plt.subplots(1, 2)
figure.set_size_inches(20,10)

ax[0].plot(DL)

ax[1].plot(GL)

plt.show()

In [None]:
to_predict = dataframe.sample(5)

In [None]:
X_test = to_predict["title"]
X_test

In [None]:
Y_test = to_predict["label"]
Y_test

In [None]:
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=maxlen)
X_test[0]

In [None]:
model.predict(X_test)

In [None]:
X_test

In [None]:
generator.save("generator.h5")
discriminator.save("discriminator.h5")
tokenizerJSON = tokenizer.to_json()

with open("gan_tokenizer.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(tokenizerJSON, ensure_ascii=False))