In [193]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras import Input
from keras.models import Sequential, Model
from keras.layers import Conv2D, Conv1D, Dropout, Dense, Embedding, Flatten, Reshape, Multiply, Lambda, UpSampling1D
from keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

<h1>Traitement des données</h1>

In [224]:
# chargement des deux tables
real_news = pd.read_csv("./dataset1/True.csv")
fake_news = pd.read_csv("./dataset1/Fake.csv")

In [225]:
# vrai = 1, faux = 0
real_news["label"] = 1
fake_news["label"] = 0

In [226]:
# création du dataset complet
dataframe = pd.concat([real_news, fake_news])
dataframe.sample(10)

Unnamed: 0,title,text,subject,date,label
18089,WATCH SOLAR ECLIPSE LIVE HERE,If you don t have the proper glasses to watch ...,left-news,"Aug 21, 2017",0
7242,Over 60% of Republican Voters Are Embarrassed...,"In an article aptly titled, Why some Republic...",News,"March 28, 2016",0
3833,Russia election hacking a top global threat: D...,WASHINGTON (Reuters) - Russia’s attempts to in...,politicsNews,"May 11, 2017",1
20524,BLACK CONSERVATIVE Student DESTROYS Black Live...,"If you have the time, you should watch every m...",left-news,"May 25, 2016",0
2577,"As U.S. weighs Afghan strategy, hopes set on f...",KABUL (Reuters) - (This version of the July 23...,politicsNews,"July 23, 2017",1
9459,PRESIDENT TRUMP Makes Room Erupt in Laughter w...,President Trump spoke at the State Banquet dur...,politics,"Nov 11, 2017",0
17861,WHY ANTI-TRUMP BILLIONAIRE MARK CUBAN Couldn’t...,"During the 2016 presidential election, Mark Cu...",left-news,"Oct 4, 2017",0
10960,WHY LET THE ATHEISTS Run All Over You? City Re...,A city in Pennsylvania is removing a park benc...,politics,"May 6, 2017",0
16748,YPG fighters credit Ocalan with Syria victory,BEIRUT (Reuters) - Fighters with the Syrian Ku...,worldnews,"October 23, 2017",1
11241,BOOM! SEAN SPICER: “Trump Sold Hotels In Russi...,,politics,"Mar 31, 2017",0


In [227]:
print(f"Nombre de références : {dataframe.title.count()}")
print(f"Nombre de fake news : {fake_news.title.count()}")
print(f"Nombre de vraies news : {real_news.title.count()}")

Nombre de références : 44898
Nombre de fake news : 23481
Nombre de vraies news : 21417


In [228]:
# ici on ne s'intéresse qu'au titre et au label
del dataframe["title"]
del dataframe["subject"]
del dataframe["date"]

In [229]:
dataframe.sample(10)

Unnamed: 0,text,label
6156,When Donald Trump recently said he would love ...,0
11595,PARIS (Reuters) - France will on Thursday anno...,1
17338,LONDON (Reuters) - The probability that Britai...,1
3983,"WASHINGTON (Reuters) - An extra 2,500 visas fo...",1
11585,WASHINGTON (Reuters) - Russia is likely to mai...,1
16036,Are there any Trump donors on the team? GREAT...,0
2920,"What happens when the most vile, sexist chauvi...",0
18609,BARCELONA (Reuters) - Spanish national police ...,1
3526,Ever since Matt Lauer threw Donald Trump some ...,0
7829,(Reuters) - Republican presidential nominee Do...,1


In [234]:
# nettoyage des données (ponctuations)

#stopWords = set(stopwords.words("english"))

def cleanText(text):
    forbidden = {",","@",";","/","-",":",".","!","?", "#","\"","(",")","\'","’","‘","–","...","&"}
    res = text
    if res != None:
        for elm in forbidden:
            res = res.replace(elm, "")
    if len(res.split()) >= 30:
        res = " ".join(res.split()[0:30])
    if res != None:
        for elm in forbidden:
            res = res.replace("  ", " ")
    return res

dataframe["text"] = dataframe["text"].apply(cleanText)

#dataframe.sample(10)
cleanText("test...")

'test'

In [235]:
set(dataframe["text"].to_string(index=False).split())

{'milita...',
 'Vassil...',
 'larger',
 'mouthpiece',
 'Easter',
 'Prospects',
 'East...',
 'Xi...',
 'bridg...',
 'AB',
 'Sat...',
 '$120',
 'felon...',
 'Democratic',
 'wat...',
 'tv',
 'Kwambana',
 'chi...',
 'economically',
 'armoured',
 'Comma...',
 'exaggerate',
 'Dakotas',
 'DMI',
 'fed...',
 'ABORTION',
 'oneApparentl...',
 'aggressive...',
 'regi...',
 'Pastafarianism',
 'maj...',
 'shortly',
 'DIEGO',
 'forfeiture',
 'PRESIDENT',
 'Illin...',
 'Nusra',
 'lobby...',
 'Hassett',
 'minor',
 'DELHITOKYO',
 'Venezuelan',
 'sucks',
 'whack',
 'Handwriting',
 'Fina...',
 'Bol...',
 'Kentuck...',
 'logic',
 'Mi8',
 'ravings',
 'Fordham',
 'fed',
 'sole',
 'Twice',
 'globe',
 'Centu...',
 'buildin...',
 'subtl...',
 'proposa...',
 'COLBERT',
 'qualification',
 'Job...',
 'documentary',
 'babble',
 'shamefully',
 'establish...',
 'Jone...',
 'harassed',
 'embar...',
 'hereDuring',
 'explorer',
 'Pepe',
 'smack...',
 'Lemonis',
 'emphasis',
 'July',
 'privilege',
 'grateful',
 'suspend.

In [236]:
# on sépare les données en données d'entraînement et données de test (80% et 20%)
x_train, x_test, y_train, y_test = train_test_split(dataframe["text"], dataframe["label"], test_size=0.10, random_state = 42)
print(f"Données d'entrainement : {len(x_train)}")
print(f"Données de test : {len(x_test)}")

Données d'entrainement : 40408
Données de test : 4490


In [237]:
max_features = 20000 # taille max du vocab
maxlen = 30 # taille max de séquence

In [238]:
# vectorisation naïve en "one-hot"
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)

In [239]:
# vectorisation des données d'entraînement
x_train = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(x_train, maxlen=maxlen)
y_train = np.array(y_train)

In [240]:
# vectorisation des données de test
x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=maxlen)
y_test = np.array(y_test)

In [241]:
batch_size = 256
nb_epochs = 20
embedded_dim = 100
latent_dim = 100

In [242]:
def generate_latent_points(latent_dim, nbpoints):
    return np.random.uniform(0, 1, size=[nbpoints, latent_dim])

def getLatentSamples(latent_dim, n):
    labels = np.zeros(shape=n)
    samples = generate_latent_points(latent_dim, n)
    
    return samples, labels

def getFakeSamples(generator, latent_dim, n):
    labels = np.ones(shape=n)
    
    latent_points = generate_latent_points(latent_dim, n)
    samples = generator.predict(latent_points)
    
    return samples, labels

def getRealSamples(X, Y, n):
    random_indices = np.random.randint(0, X.shape[0], n)

    samples = X[random_indices]
    labels = Y[random_indices]
    
    return samples, labels

def generateFakeNews(model, n):
    few_points = generate_latent_points(latent_dim, n)
    predictions = generator.predict(few_points)
    fake_news = tokenizer.sequences_to_texts(np.round(predictions))
    
    return fake_news

In [243]:
def create_generator(dim):
    
    input_layer = Input(shape=[dim])
    
    x = Dense(maxlen, input_shape=[dim])(input_layer)
    
    x = Reshape((maxlen, 1))(x)
    x = Conv1D(128, 3, padding="same")(x)
    x = UpSampling1D()(x)
    x = Conv1D(64, 3, padding="same")(x)
    x = UpSampling1D()(x)
    x = Conv1D(32, 3, padding="same")(x)
    
    x = Flatten()(x)
    x = Dense(maxlen, activation="sigmoid")(x)
    
    output_layer = Lambda(lambda x: x * float(max_features))(x)

    model = Model(input_layer, output_layer)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=.5))
    
    model.summary()
    
    return model

def create_discriminator():

    input_layer = Input(shape=[maxlen])
    
    #x = Embedding(max_features, output_dim=embedded_dim, input_length=maxlen, trainable=True, input_shape=[maxlen])(input_layer)
    x = Dense(embedded_dim * maxlen)(input_layer)
    x = Reshape((maxlen, embedded_dim))(x)
    x = Conv1D(64, 3, padding="same")(x)
    x = Dropout(0.1)(x)
    x = Conv1D(32, 3, padding="same")(x)
    x = Flatten()(x)
    
    output_layer = Dense(1, activation="sigmoid")(x)
    
    model = Model(input_layer, output_layer)
    
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=.5))
    model.summary()
    
    return model

def create_gan(generator, discriminator, latent_dim):
    
    input_layer = Input(shape=[latent_dim])
    
    x = generator(input_layer)
    
    output_layer = discriminator(x)
    
    model = Model(input_layer, output_layer)

    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=0.0002, beta_1=.5))
    model.summary()
    
    return model

In [244]:
discriminator = create_discriminator()
generator = create_generator(latent_dim)
gan = create_gan(generator, discriminator, latent_dim)

Model: "model_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_34 (InputLayer)        (None, 30)                0         
_________________________________________________________________
dense_45 (Dense)             (None, 3000)              93000     
_________________________________________________________________
reshape_23 (Reshape)         (None, 30, 100)           0         
_________________________________________________________________
conv1d_54 (Conv1D)           (None, 30, 64)            19264     
_________________________________________________________________
dropout_12 (Dropout)         (None, 30, 64)            0         
_________________________________________________________________
conv1d_55 (Conv1D)           (None, 30, 32)            6176      
_________________________________________________________________
flatten_21 (Flatten)         (None, 960)               0  

In [245]:
def train(generator_model, discriminator_model, gan_model, nb_epochs, batch_size):
    discriminator_losses = []
    
    generator_losses = []
    
    for i in range(nb_epochs):

        x_fake, y_fake = getFakeSamples(generator_model, latent_dim, batch_size)
        x_real, y_real = getRealSamples(x_train, y_train, batch_size)
        
        discriminator_model.trainable = True

        loss1 = discriminator_model.train_on_batch(x_real, y_real)
        loss2 = discriminator_model.train_on_batch(x_fake, y_fake)
        loss3 = (loss1 + loss2) / 2
        
        discriminator_model.trainable = False
        
        x_gan, y_gan = getLatentSamples(latent_dim, batch_size)        
        
        loss_gan = gan_model.train_on_batch(x_gan, y_gan)

        discriminator_losses.append(loss3)
        
        generator_losses.append(loss_gan)
        
        print(f"Discriminator loss : {loss3}   ;    Generator loss : {loss_gan}", end="\r")
        if i % 1000 == 0:
            print(generateFakeNews(generator_model, 1))
        
    return discriminator_losses, generator_losses

In [246]:
DL, GL = train(generator, discriminator, gan, 50000, batch_size)

  'Discrepancy between trainable weights and collected trainable'


['fjsparentnodeinsertbeforejs deadlines streisand hiroshima firefighters emphasize pile bells patience forfeiture scotus glowing refinery friendship excuses ridehailing erupt swear reorganization chained vendors alberta blockbuster miner scenarios controlling cheney beholden blanket secondlargest']
['the decided many could week house reuters on as is us for of in for on trump that he reuters in trump that in and reuters as s that reuters']
['the investigation left century him be to to his it with in of reuters reuters a donald of donald on on that on on of of washington in of trump']
['would should president been a to the on on to of the the a the to to of a the to a to to the of to a a']
['first announced washington trump on to to of trump on of to a in a of of of in to a in to on a on a of of']
['during donald of tax to to the of a to to the the to to to a to to the to to a to to the to to the']
['s before on to on the the the the the the the the the the the the the the the the the t

In [None]:
few_points = generate_latent_points(latent_dim, 10)
predictions = generator.predict(few_points)
#print(predictions)
print(tokenizer.sequences_to_texts(np.round(predictions)))

In [None]:
# art plastique du turfu featuring le poto matplotlib
from matplotlib import pyplot as plt

In [None]:
figure, ax = plt.subplots(1, 2)
figure.set_size_inches(20,10)

ax[0].plot(DL)

ax[1].plot(GL)

plt.show()

In [None]:
to_predict = dataframe.sample(5)

In [None]:
X_test = to_predict["title"]
X_test

In [None]:
Y_test = to_predict["label"]
Y_test

In [None]:
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=maxlen)
X_test[0]

In [None]:
model.predict(X_test)

In [None]:
X_test

In [None]:
generator.save("generator.h5")
discriminator.save("discriminator.h5")
tokenizerJSON = tokenizer.to_json()

with open("gan_tokenizer.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(tokenizerJSON, ensure_ascii=False))