In [17]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# load all necessary libraries
import pandas as pd
import numpy as np # linear algebra
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection  import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


from gensim.models.word2vec import Word2Vec
from gensim import models
from gensim.models import KeyedVectors


from sklearn.linear_model import LogisticRegression
from numpy import expand_dims
from numpy import zeros
from numpy import ones
from numpy.random import randn
from numpy.random import randint
from keras.datasets.fashion_mnist import load_data
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Reshape
from keras.layers import Flatten
from keras.layers import Conv2D,Lambda,BatchNormalization,Activation
from keras.layers import Conv1D
from keras.layers import Conv2DTranspose
from keras.layers import LeakyReLU
from keras.layers import Dropout
from keras.layers import multiply
from keras.models import Model
import keras.backend as K
from matplotlib import pyplot
import tensorflow as tf
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Concatenate

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
def evaluationStats(y_test,y_pred_class,y_pred_proba):
    """Print binary-classification metrics and draw the ROC curve.

    Parameters
    ----------
    y_test : array-like
        Ground-truth binary labels.
    y_pred_class : array-like
        Predicted hard class labels, compared against ``y_test``.
    y_pred_proba : array-like
        Predicted scores. A 2-D array with two or more columns is read as
        per-class probabilities and column 1 (the positive class) is used;
        a 1-D array or a single-column array is used as the positive-class
        score directly.

    Returns
    -------
    None
        Metrics are printed and the ROC curve is drawn on the current
        matplotlib figure.
    """
    print("ACCURACY        :",metrics.accuracy_score(y_test, y_pred_class))
    print("PRECISION SCORE :",metrics.precision_score(y_test, y_pred_class))
    print("RECALL SCORE    :", metrics.recall_score(y_test, y_pred_class))
    print("F1 SCORE        :",metrics.f1_score(y_test, y_pred_class))

    # Accept both (n, n_classes) probability matrices and plain score vectors.
    scores = np.asarray(y_pred_proba)
    if scores.ndim == 2 and scores.shape[1] > 1:
        scores = scores[:, 1]
    else:
        scores = scores.ravel()

    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, scores)
    # area under the curve
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print ("AUC SCORE      :",roc_auc)

    # The IPython magic that used to live here was removed: a line magic is
    # not valid Python inside a function body and the backend only needs to
    # be selected once per kernel, not on every call.
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title('ROC')
    plt.plot(false_positive_rate, true_positive_rate)

In [19]:
## initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Load the English stop-word list once. Calling stopwords.words("english")
# inside the filtering comprehension re-evaluated it for every token; a
# frozenset also gives constant-time membership tests.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

def preprocess(document, stem=True):
    """Lower-case, tokenize, drop English stop words, then stem or lemmatize.

    Parameters
    ----------
    document : str
        Raw text to normalise.
    stem : bool, default True
        If True, each remaining token is stemmed with the Porter stemmer;
        otherwise it is lemmatized as a verb (``pos='v'``) with WordNet.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # change sentence to lower case and tokenize into words
    words = word_tokenize(document.lower())

    # remove stop words
    words = [word for word in words if word not in _ENGLISH_STOP_WORDS]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    return " ".join(words)

In [20]:

# Shared hyper-parameters for the bag-of-words features and the GAN models.
max_len = 1024             # vocabulary cap / length of one feature vector
seq_len = max_len          # sequence length seen by the conv layers
img_shape = (max_len, 1)   # (length, channels) shape of one sample
z_dim = 1024               # latent dimension passed to the generator
b_size = 128               # presumably a batch size; not referenced in the visible cells
n_chan = 10                # number of convolution filters / channels
f_size = 3                 # filter size
num_classes = 2            # number of conditioning labels

In [21]:

# Load the corpus and keep only the review text and its label column.
data = pd.read_csv("./../deceptive-opinion.csv")
data = data.loc[:, ['text', 'deceptive']]

# Lemmatize (stem=False) every review and encode the label as an integer.
data['text'] = data['text'].apply(preprocess, stem=False)
data['deceptive'] = data['deceptive'].map({'truthful': 1, 'deceptive': 0})

X = data['text']
y = data['deceptive']

# Hold out 20% of the rows for testing. The training features/labels are
# taken only from the rows of the training split whose label is 1.
df_train, df_test = train_test_split(data, test_size=0.20, random_state=1)
df_truthful = df_train[df_train['deceptive'] == 1]
X_train, y_train = df_truthful['text'], df_truthful['deceptive']
X_test, y_test = df_test['text'], df_test['deceptive']

print('Training set size : ', (X_train.shape[0]))
print('Test set size : ', (X_test.shape[0]))

# Bag-of-words counts: the vocabulary is fitted on the training text only
# and capped at max_len features; the test text reuses that vocabulary.
bow_model = CountVectorizer(stop_words='english', max_features=max_len)
X_train_bow = bow_model.fit_transform(X_train)
X_test_bow = bow_model.transform(X_test)

Training set size :  649
Test set size :  320


In [22]:
# Sanity check: (rows, vocabulary size) of the train and test count matrices.
X_train_bow.shape,X_test_bow.shape

((649, 1024), (320, 1024))

In [23]:
# Peek at the first training document as a dense row of term counts.
print(X_train_bow[0].toarray())

[[0 0 0 ... 0 1 0]]


In [24]:
# Number of columns (vocabulary terms) in the fitted bag-of-words matrix.
vocab_size = X_train_bow.shape[1]

In [25]:
from keras.preprocessing import sequence



print('Pad sequences (samples x time)')
# Densify each count matrix and pad/truncate every row to max_len columns.
x_train, x_test = (
    sequence.pad_sequences(bow_matrix.toarray(), maxlen=max_len)
    for bow_matrix in (X_train_bow, X_test_bow)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (649, 1024)
x_test shape: (320, 1024)


In [26]:

def load_real_samples():
    """Return the prepared splits as ``((x_train, y_train), (x_test, y_test))``.

    The values are read from the notebook-level variables of the same names.
    """
    train_split = (x_train, y_train)
    test_split = (x_test, y_test)
    return train_split, test_split

## Discriminator

In [27]:
# define the standalone discriminator model
def define_discriminator(in_shape=(max_len,1), n_classes=2):
    """Build the label-conditioned discriminator.

    The integer label is embedded into a vector with as many elements as one
    sample, reshaped to ``in_shape`` and concatenated with the sample along
    the channel axis. A small Conv1D -> Flatten -> Dense(sigmoid) stack then
    scores the pair.

    Parameters
    ----------
    in_shape : tuple, default (max_len, 1)
        ``(length, channels)`` shape of one input sample.
    n_classes : int, default 2
        Number of distinct conditioning labels.

    Returns
    -------
    keras.models.Model
        Uncompiled model mapping ``[sample, label]`` to one sigmoid output.
        The caller is responsible for compiling it.
    """
    in_shape = tuple(in_shape)

    img = Input(shape=in_shape)
    label = Input(shape=(1,), dtype='int32')

    # embedding layer:
    # turns labels into dense vectors
    # produces 3D tensor
    label_embedding = Embedding(input_dim=n_classes, output_dim=int(np.prod(in_shape)), input_length=1)(label)
    # Flatten the embedding 3D tensor into 2D tensor
    label_embedding = Flatten()(label_embedding)
    # Reshape label embeddings to the sample shape. Use in_shape rather than
    # a module-level constant so a non-default in_shape is honoured.
    label_embedding = Reshape(in_shape)(label_embedding)

    # concatenate data with corresponding label embeddings
    concatenated = Concatenate(axis=-1)([img, label_embedding])
    print(concatenated.shape)

    # The concatenation doubles the channel count of the sample.
    conv_input_shape = in_shape[:-1] + (2 * in_shape[-1],)

    D = Sequential()
    D.add(Conv1D(n_chan,f_size,activation='relu',input_shape = conv_input_shape))
    D.add(Flatten())
    D.add(Dense(1, activation='sigmoid'))
    # D is only used as a sub-network of the returned Model, so it is not
    # compiled on its own.
    D.summary()

    model = D(concatenated)
    return Model([img, label], model)


 
 
 


## Generator

In [28]:
# define the standalone generator model
def define_generator(latent_dim, n_classes=2):
    """Build the label-conditioned generator.

    The label is embedded into a ``latent_dim`` vector and multiplied
    element-wise with the noise vector. The product is projected to
    ``(seq_len // 8, n_chan)`` and up-sampled by three stride-2 transposed
    convolutions, ending in a sigmoid.

    Parameters
    ----------
    latent_dim : int
        Size of the noise vector.
    n_classes : int, default 2
        Number of distinct conditioning labels.

    Returns
    -------
    keras.models.Model
        Model mapping ``[noise, label]`` to a single-channel generated sample.
    """

    def add_conv1d_transpose(model, nf, ks, s=2, p='same'):
        # Emulate a 1-D transposed convolution with Conv2DTranspose on a dummy
        # width-1 axis. The three steps are added as real layers of `model`:
        # creating the Conv2DTranspose inside a Lambda would hide its kernel
        # from Keras' weight tracking, so it would never be trained.
        model.add(Lambda(lambda x: K.expand_dims(x, axis=2)))
        model.add(Conv2DTranspose(filters=nf, kernel_size=(ks, 1), strides=(s, 1), padding=p))
        model.add(Lambda(lambda x: K.squeeze(x, axis=2)))

    z = Input(shape=(latent_dim, ))

    # Conditioning label
    label = Input(shape=(1,), dtype='int32')

    # embedding layer: one latent_dim-sized vector per label
    label_embedding = Embedding(n_classes, latent_dim, input_length=1)(label)

    # Flatten the embedding 3D tensor into 2D tensor with shape: (batch_size, latent_dim)
    label_embedding = Flatten()(label_embedding)

    # Element-wise product of the vectors z and the label embeddings
    joined_representation = multiply([z, label_embedding])

    # Three stride-2 up-sampling steps follow, so start at 1/8 of the length.
    base_len = seq_len // 8

    G = Sequential()
    G.add(Dense(base_len * n_chan, input_shape=(latent_dim,)))
    G.add(Reshape((base_len, n_chan)))
    G.add(BatchNormalization(momentum= 0.8,epsilon=1.e-5))
    for _ in range(2):
        add_conv1d_transpose(G, n_chan, f_size)
        G.add(BatchNormalization(momentum= 0.8,epsilon=1.e-5))

    # Final up-sampling step collapses to a single output channel.
    add_conv1d_transpose(G, 1, 3)
    G.add(Activation('sigmoid'))
    G.summary()

    model = G(joined_representation)

    return Model([z, label], model)

    

 


## Define GAN

In [29]:
# Build and compile the discriminator while it is still trainable.
disc = define_discriminator()
disc.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=Adam())

# build the generator
gen = define_generator(z_dim)

# the generator takes noise and the target label as input
# and generates a sample conditioned on that label
z = Input(shape=(z_dim,))
label = Input(shape=(1,))

img = gen([z, label])

# keep the discriminator's params constant for generator training
# NOTE(review): this must stay after disc.compile(...) above and before
# cgan.compile(...) below; it relies on Keras capturing the trainable flag at
# compile time — confirm for the Keras version in use.
disc.trainable = False

prediction = disc([img, label])

# Conditional GAN model with fixed discriminator to train the generator
cgan = Model([z, label], prediction)
cgan.compile(loss='binary_crossentropy', optimizer=Adam())

(?, 1024, 2)
(?, 1024, 2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_2 (Conv1D)            (None, 1022, 10)          70        
_________________________________________________________________
flatten_5 (Flatten)          (None, 10220)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 10221     
Total params: 10,291
Trainable params: 10,291
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 1280)              1312000   
_________________________________________________________________
reshape_4 (Reshape)          (None, 128, 10)           0         
______________________________________________

In [30]:
def plot_history(d1_hist, d2_hist, g_hist):
    """Plot the three loss histories on one figure and save it as a PNG.

    Parameters
    ----------
    d1_hist, d2_hist, g_hist : sequence of float
        Per-iteration discriminator loss on real samples, discriminator loss
        on generated samples, and generator loss.
    """
    pyplot.title("CGAN+BOW")
    curves = (
        (d1_hist, 'disc_real'),
        (d2_hist, 'disc_fake'),
        (g_hist, 'gen'),
    )
    for series, legend_name in curves:
        pyplot.plot(series, label=legend_name)
    pyplot.legend()
    # Write the figure to disk and release it.
    pyplot.savefig('CGAN_BOW_line_plot_loss.png')
    pyplot.close()

In [31]:
# Filled by train() every `sample_interval` iterations.
accuracies = []
losses = []

def train(iterations, batch_size, sample_interval):
    """Alternate discriminator and generator updates for the conditional GAN.

    Each iteration trains the discriminator on one batch of real samples and
    one batch of generated samples, then trains the generator through the
    combined model with freshly drawn noise and random labels.

    Parameters
    ----------
    iterations : int
        Number of training iterations.
    batch_size : int
        Number of samples per batch (any positive value is accepted).
    sample_interval : int
        Log progress and record into ``losses`` / ``accuracies`` every this
        many iterations.

    Returns
    -------
    None
        The loss curves are written to disk by ``plot_history``.
    """
    (X_train, y_train), _ = load_real_samples()
    # Convert once so labels can be selected by integer position in the loop.
    y_train = np.array(y_train)

    real = np.ones(shape=(batch_size, 1))
    fake = np.zeros(shape=(batch_size, 1))
    c1_hist, c2_hist, g_hist = [], [], []
    for iteration in range(iterations):

        # Random batch of real samples with their labels.
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        # Reshape with batch_size (previously hard-coded to 128) so other
        # batch sizes work, and give labels the same (batch, 1) shape as the
        # randomly drawn labels used for the generator step.
        imgs = X_train[idx].reshape(batch_size, max_len, 1)
        labels = y_train[idx].reshape(-1, 1)

        # Generate a batch of fake samples for the same labels.
        z = np.random.normal(0, 1, size=(batch_size, z_dim))
        gen_imgs = gen.predict([z, labels])

        # Discriminator step: real -> 1, generated -> 0.
        d_loss_real = disc.train_on_batch([imgs, labels], real)
        d_loss_fake = disc.train_on_batch([gen_imgs, labels], fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Generator step: new noise and random labels, targets set to "real".
        z = np.random.normal(0, 1, size=(batch_size, z_dim))
        labels = np.random.randint(0, num_classes, batch_size).reshape(-1, 1)

        g_loss = cgan.train_on_batch([z, labels], real)

        c1_hist.append(d_loss_real[0])
        c2_hist.append(d_loss_fake[0])
        g_hist.append(g_loss)

        if iteration % sample_interval == 0:
            print('{} [D loss: {}, accuracy: {:.2f}] [D1 loss: {}][D2 loss: {}][G loss: {}]'.format(iteration, d_loss[0], 100 * d_loss[1],d_loss_real,d_loss_fake, g_loss))

            losses.append((d_loss[0], g_loss))
            accuracies.append(d_loss[1])

    plot_history(c1_hist, c2_hist, g_hist)
    

In [32]:
# Training run configuration.
iterations = 20
batch_size = 128
# Log on every iteration; the trailing comment keeps an alternative value of 1000.
sample_interval = 1#1000

# Run the adversarial training loop; progress is printed below.
train(iterations, batch_size, sample_interval)

  'Discrepancy between trainable weights and collected trainable'


0 [D loss: 0.9367610812187195, accuracy: 40.23] [D1 loss: [0.6604016, 0.8046875]][D2 loss: [1.2131206, 0.0]][G loss: 0.5239591002464294]
1 [D loss: 0.6606132984161377, accuracy: 70.31] [D1 loss: [0.61895496, 0.984375]][D2 loss: [0.7022717, 0.421875]][G loss: 0.9475960731506348]
2 [D loss: 0.49063193798065186, accuracy: 97.27] [D1 loss: [0.6383525, 0.9453125]][D2 loss: [0.34291136, 1.0]][G loss: 1.5195965766906738]
3 [D loss: 0.4054676592350006, accuracy: 93.36] [D1 loss: [0.6480713, 0.8671875]][D2 loss: [0.16286403, 1.0]][G loss: 2.13370418548584]
4 [D loss: 0.3647836148738861, accuracy: 92.97] [D1 loss: [0.647835, 0.859375]][D2 loss: [0.08173223, 1.0]][G loss: 2.659886121749878]
5 [D loss: 0.33868199586868286, accuracy: 94.92] [D1 loss: [0.6317525, 0.8984375]][D2 loss: [0.045611504, 1.0]][G loss: 3.0893685817718506]
6 [D loss: 0.3197418451309204, accuracy: 94.92] [D1 loss: [0.61070216, 0.8984375]][D2 loss: [0.02878156, 1.0]][G loss: 3.442274808883667]
7 [D loss: 0.28733816742897034, a