In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# load all necessary libraries
import pandas as pd
import numpy as np # linear algebra

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection  import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


from gensim.models.word2vec import Word2Vec
from gensim import models
from gensim.models import KeyedVectors


from sklearn.linear_model import LogisticRegression
from numpy import expand_dims
from numpy import zeros
from numpy import ones
from numpy.random import randn
from numpy.random import randint
from keras.datasets.fashion_mnist import load_data
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Reshape
from keras.layers import Flatten
from keras.layers import Conv2D,Lambda,BatchNormalization,Activation
from keras.layers import Conv1D
from keras.layers import Conv2DTranspose
from keras.layers import LeakyReLU
from keras.layers import Dropout
import keras.backend as K

import tensorflow as tf
from keras.preprocessing import sequence

from keras.models import load_model
from numpy.random import randn
from matplotlib import pyplot
from keras.layers import Input,MaxPool1D
from keras.models import Model

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
## initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

def preprocess(document, stem=True):
    'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)

    return document

In [7]:
seq_len = max_len = 1024 #512
b_size = 128
n_chan = 10
f_size = 3 # filter size


In [8]:

data = pd.read_csv("./../deceptive-opinion.csv")
data = data.loc[:,['text','deceptive']]

# stem messages
#messages = [preprocess(message, stem=True) for message in data.text]
data.text = data.text.apply(lambda message : preprocess(message, stem=False))
data['deceptive'] = data.deceptive.map({'truthful':1, 'deceptive':0})

X = data.text
y = data.deceptive
#X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size = 0.30,random_state=1)


df_train, df_test= train_test_split(data,  test_size = 0.20,random_state=1)
df_truthful = df_train.loc[df_train.deceptive == 1,:]
X_train = df_truthful.text
X_test=df_test.text
y_train= df_truthful.deceptive
y_test=df_test.deceptive

print('Training set size : ', (X_train.shape[0]))
print('Test set size : ', (X_test.shape[0]))




tfidf_model = TfidfVectorizer()
X_train_tfidf = tfidf_model.fit_transform(X_train)
X_test_tfidf = tfidf_model.transform(X_test)

Training set size :  649
Test set size :  320


In [9]:
X_train_tfidf.shape,X_test_tfidf.shape

((649, 5395), (320, 5395))

In [10]:
print(X_train_tfidf[0].toarray())


[[0. 0. 0. ... 0. 0. 0.]]


In [11]:
vocab_size = X_train_tfidf.shape[1]

In [12]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [13]:
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(X_train_tfidf.toarray(), maxlen=max_len)
x_test = sequence.pad_sequences(X_test_tfidf.toarray(), maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (649, 1024)
x_test shape: (320, 1024)


In [14]:
x_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [15]:

def define_discriminator(in_shape=(max_len,1)):

    D = Sequential()
    D.add(Conv1D(n_chan,f_size,activation='relu',input_shape = (seq_len,1)))
    D.add(Flatten())
    D.add(Dense(1, activation='sigmoid'))
    D.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])
    return D



In [16]:
# define the standalone generator model
def define_generator(latent_dim):
    def Conv1DTranspose(inp,nf,ks,s=2,p='same'):
        x1 = Lambda(lambda x : K.expand_dims(x,axis=2))(inp)
        x2 = Conv2DTranspose(filters=nf,kernel_size=(ks,1),strides=(s,1),padding=p)(x1)
        return Lambda(lambda x :K.squeeze(x,axis=2))(x2)

    G = Sequential()
    G.add(Dense(int(seq_len/8)*n_chan,input_shape=(latent_dim,)))
    G.add(Reshape((int(seq_len/8),n_chan)))
    G.add(BatchNormalization(momentum= 0.8,epsilon=1.e-5))
    for i in range(0,2):
        G.add(Lambda(lambda x : Conv1DTranspose(x,n_chan,f_size)))
        G.add(BatchNormalization(momentum= 0.8,epsilon=1.e-5))

    G.add(Lambda(lambda x : Conv1DTranspose(x,1,3)))
    G.add(Activation('sigmoid'))
    G.summary()
    return G



In [17]:
# define the combined generator and discriminator model, for updating the generator
def define_gan(generator, discriminator):
    # make weights in the discriminator not trainable
    discriminator.trainable = False
    # connect them
    model = Sequential()
    # add generator
    model.add(generator)
    # add the discriminator
    model.add(discriminator)
    # compile model
    opt = Adam(lr=0.0002, beta_1=0.5)
    model.compile(loss='binary_crossentropy', optimizer=opt)
    return model

In [18]:
# load Dara
def load_real_samples():
    # load dataset
    (trainX, _), (_, _) = (x_train,y_train),(x_test,y_test)#load_data()

    return trainX

In [19]:
# select real samples
def generate_real_samples(dataset, n_samples):
    # choose random instances
    ix = randint(0, dataset.shape[0], n_samples)
    # select data
    X = dataset[ix]
    # generate class labels
    y = ones((n_samples, 1))
    return X, y

In [20]:
# generate points in latent space as input for the generator
def generate_latent_points(latent_dim, n_samples):
    # generate points in the latent space
    x_input = randn(latent_dim * n_samples)
    # reshape into a batch of inputs for the network
    x_input = x_input.reshape(n_samples, latent_dim)
    return x_input

    # use the generator to generate n fake examples, with class labels
def generate_fake_samples(generator, latent_dim, n_samples):
    # generate points in latent space
    x_input = generate_latent_points(latent_dim, n_samples)
    # predict outputs
    X = generator.predict(x_input)
    # create class labels
    y = zeros((n_samples, 1))
    return X, y

In [21]:
def plot_history(d1_hist, d2_hist, g_hist):
    # plot history
    pyplot.title("GAN+TFIDF")
    pyplot.plot(d1_hist, label='disc_real')
    pyplot.plot(d2_hist, label='disc_fake')
    pyplot.plot(g_hist, label='gen')
    pyplot.legend()
    pyplot.savefig('GAN_TFIDF_line_plot_loss.png')
    pyplot.close()

In [22]:
# train the generator and discriminator
def train(g_model, d_model, gan_model, dataset, latent_dim, n_epochs=20, n_batch=128):
    bat_per_epo = int(dataset.shape[0] / n_batch)
    half_batch = int(n_batch / 2)
    # manually enumerate epochs
    c1_hist, c2_hist, g_hist = list(), list(), list()
    for i in range(n_epochs):
        # enumerate batches over the training set
        for j in range(bat_per_epo):
            # get randomly selected 'real' samples
            X_real, y_real = generate_real_samples(dataset, half_batch)
            # update discriminator model weights
            d_loss1, _ = d_model.train_on_batch(X_real, y_real)
            # generate 'fake' examples
            X_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
            
       
            # update discriminator model weights
            d_loss2, _ = d_model.train_on_batch(X_fake, y_fake)
            # prepare points in latent space as input for the generator
            X_gan = generate_latent_points(latent_dim, n_batch)
            # create inverted labels for the fake samples
            y_gan = ones((n_batch, 1))
            # update the generator via the discriminator's error
            g_loss = gan_model.train_on_batch(X_gan, y_gan)
            # summarize loss on this batch
            print('>%d, %d/%d, d1=%.3f, d2=%.3f g=%.3f' %
            (i+1, j+1, bat_per_epo, d_loss1, d_loss2, g_loss))
            c1_hist.append(d_loss1)
            c2_hist.append(d_loss2)
            g_hist.append(g_loss)
            
            #print(X_real.shape,X_fake.shape)
            
    # save the generator model
    #g_model.save('generator.h5')
    #d_model.save('dis_generator.h5')
    plot_history(c1_hist, c2_hist, g_hist)
    return g_model,d_model

In [23]:
# size of the latent space
latent_dim = 100
# create the discriminator
discriminator = define_discriminator()
# create the generator
generator = define_generator(latent_dim)
# create the gan
gan_model = define_gan(generator, discriminator)
# load data
dataset = load_real_samples()
#print("shape -->",dataset.shape)
dataset = np.reshape(dataset,(dataset.shape[0],dataset.shape[1],1))

# train model
model,d_model= train(generator, discriminator, gan_model, dataset, latent_dim)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 1280)              129280    
_________________________________________________________________
reshape_1 (Reshape)          (None, 128, 10)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 128, 10)           40        
_________________________________________________________________
lambda_1 (Lambda)            (None, 256, 10)           0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 256, 10)           40        
_________________________________________________________________
lambda_6 (Lambda)            (None, 512, 10)           0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 512, 10)           40        
__________

  'Discrepancy between trainable weights and collected trainable'


>1, 1/5, d1=0.693, d2=0.622 g=1.107
>1, 2/5, d1=0.694, d2=0.268 g=1.905
>1, 3/5, d1=0.693, d2=0.097 g=2.832
>1, 4/5, d1=0.693, d2=0.037 g=3.704
>1, 5/5, d1=0.695, d2=0.015 g=4.471
>2, 1/5, d1=0.692, d2=0.007 g=5.124
>2, 2/5, d1=0.691, d2=0.004 g=5.674
>2, 3/5, d1=0.690, d2=0.002 g=6.129
>2, 4/5, d1=0.690, d2=0.002 g=6.528
>2, 5/5, d1=0.689, d2=0.001 g=6.852
>3, 1/5, d1=0.689, d2=0.001 g=7.123
>3, 2/5, d1=0.688, d2=0.001 g=7.356
>3, 3/5, d1=0.687, d2=0.000 g=7.551
>3, 4/5, d1=0.687, d2=0.000 g=7.703
>3, 5/5, d1=0.686, d2=0.000 g=7.851
>4, 1/5, d1=0.685, d2=0.000 g=7.961
>4, 2/5, d1=0.685, d2=0.000 g=8.056
>4, 3/5, d1=0.684, d2=0.000 g=8.124
>4, 4/5, d1=0.683, d2=0.000 g=8.199
>4, 5/5, d1=0.683, d2=0.000 g=8.249
>5, 1/5, d1=0.682, d2=0.000 g=8.302
>5, 2/5, d1=0.681, d2=0.000 g=8.324
>5, 3/5, d1=0.681, d2=0.000 g=8.363
>5, 4/5, d1=0.680, d2=0.000 g=8.376
>5, 5/5, d1=0.679, d2=0.000 g=8.402
>6, 1/5, d1=0.679, d2=0.000 g=8.423
>6, 2/5, d1=0.678, d2=0.000 g=8.433
>6, 3/5, d1=0.677, d2=0.000 