**Import libraries**

In [2]:
import math
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

**Default parameters**

In [3]:
MAX_SIZE = 399
NUM_WORDS = 1000
EMBEDDING_DIM = 16
EPOCHS = 10
BATCH_SIZE = 255
OOV = 1
SARCASM_TRAINING_SIZE = 20000

**Creating DataFrames**

In [4]:
goodreads_train = pd.read_csv("C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/initial/goodreads_train.csv")

In [5]:
goodreads_train.sample(5)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
358954,b104c4c0c9e1fbef82b4f94fa156524e,345627,3d9bef781340b06e4439a73d7c371def,3,3.5 stars \n Super fun adolescent vampire dram...,Sat May 26 13:34:24 -0700 2012,Fri Mar 11 18:27:48 -0800 2016,Thu Mar 10 00:00:00 -0800 2016,Mon Mar 07 00:00:00 -0800 2016,36,6
305386,414854ccbe367d9cb5a97d159b40cf2f,19501,7f11cc0f099f5488ce24d812a1c94076,0,"After reading the reviews for this book, it ju...",Thu Mar 25 19:48:34 -0700 2010,Thu Mar 25 19:48:58 -0700 2010,,,0,0
179053,c96ef78de7028e9ed7002356fb800a52,17245,415cfa19e48d2cbb6f08e7096487ec88,2,-Original Review- \n I bet those who read this...,Sun Sep 15 13:08:59 -0700 2013,Mon Jan 09 22:42:12 -0800 2017,,,0,0
214749,a143ad96df0ebd75024fc42c5a4c41f1,30194656,f0e740f43f1e82ea269ab39d935c3868,5,"5 "" Beyond Labels"" Stars \n It's official. Aly...",Wed Oct 14 17:56:07 -0700 2015,Fri May 27 02:23:36 -0700 2016,Thu May 19 00:00:00 -0700 2016,Thu May 19 00:00:00 -0700 2016,22,16
430736,6057d7eac9ffd0006c289e86a1f70236,28599180,d63e62b3223b403afe1ce52d2d3105cf,5,Absolutely awesome conclusion to this series. ...,Mon Feb 15 21:35:48 -0800 2016,Wed Apr 20 18:29:43 -0700 2016,Tue Apr 19 00:00:00 -0700 2016,Mon Apr 18 00:00:00 -0700 2016,0,0


**Cleaning Data**

In [6]:
train_df = goodreads_train.drop(columns=['user_id', 'book_id', 'date_added', 'date_updated', 'read_at', 'started_at'],
                                axis=0)
train_df.head()

Unnamed: 0,review_id,rating,review_text,n_votes,n_comments
0,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,28,1
1,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,1,0
2,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",22,0
3,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,5,1
4,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",9,1


In [7]:
train_df.dtypes

review_id      object
rating          int64
review_text    object
n_votes         int64
n_comments      int64
dtype: object

In [8]:
y_train = train_df['rating']
x_train = train_df.drop('rating', axis=1)

In [9]:
x_train.head()

Unnamed: 0,review_id,review_text,n_votes,n_comments
0,dfdbb7b0eb5a7e4c26d59a937e2e5feb,This is a special book. It started slow for ab...,28,1
1,a5d2c3628987712d0e05c4f90798eb67,Recommended by Don Katz. Avail for free in Dec...,1,0
2,2ede853b14dc4583f96cf5d120af636f,"A fun, fast paced science fiction thriller. I ...",22,0
3,ced5675e55cd9d38a524743f5c40996e,Recommended reading to understand what is goin...,5,1
4,332732725863131279a8e345b63ac33e,"I really enjoyed this book, and there is a lot...",9,1


In [10]:
y_train.head()

0    5
1    3
2    3
3    0
4    4
Name: rating, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [12]:
x_train['review_id'] = le.fit_transform(x_train['review_id'])

In [13]:
x_train.head()

Unnamed: 0,review_id,review_text,n_votes,n_comments
0,786842,This is a special book. It started slow for ab...,28,1
1,583423,Recommended by Don Katz. Avail for free in Dec...,1,0
2,165147,"A fun, fast paced science fiction thriller. I ...",22,0
3,727692,Recommended reading to understand what is goin...,5,1
4,179941,"I really enjoyed this book, and there is a lot...",9,1


**NLP**

In [14]:
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token=OOV)

In [15]:
def get_sequences(tokenizer, review):
    sequences = tokenizer.texts_to_sequences(review)
    padded_sequences = pad_sequences(sequences, truncating='post', maxlen=MAX_SIZE, padding='post')
    return padded_sequences

In [16]:
def tokenizer_func(data_rating, data_review):
    tokenizer.fit_on_texts(data_review)

    train_labels = data_rating.iloc[math.floor(int(len(data_rating) / 8)):]
    train_examples = data_review.iloc[math.floor(int(len(data_review) / 8)):]
    test_examples = data_review.iloc[:math.floor(int(len(data_review) / 8))]
    test_labels = data_rating.iloc[:math.floor(int(len(data_rating) / 8))]

    padded_train = get_sequences(tokenizer, train_examples)
    padded_test = get_sequences(tokenizer, test_examples)

    return np.array(padded_train), np.array(padded_test), np.array(train_labels), np.array(test_labels)

In [17]:
padded_train, padded_test, train_labels, test_labels = tokenizer_func(y_train, x_train['review_text'])

In [18]:
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/tokenizer_func/padded_train.npy', padded_train)
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/tokenizer_func/padded_test.npy', padded_test)
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/tokenizer_func/train_labels.npy', train_labels)
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/tokenizer_func/test_labels.npy', test_labels)

In [19]:
# padded_train = np.load('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/tokenizer_func/padded_train.npy')
# padded_test = np.load('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/tokenizer_func/padded_test.npy')
# train_labels = np.load('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/tokenizer_func/train_labels.npy')
# test_labels = np.load('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/tokenizer_func/test_labels.npy')

In [20]:
padded_train[0]

array([  4,  12, 294,  16,   6,   1, 566,  31,   3,   9,  10, 456,  42,
         4, 151,  88, 597,   1,  59,  10,   6,   1,   7,   6,   1,   1,
        21,  10, 635,  16,   2,   1,   1, 459,  45,  34, 185,   1,  27,
         6, 428,   5,   1,   2, 223,   7,   2, 888,   1,   1, 388,  42,
        80,   4, 134,  47,   1, 362,   3, 566,  32,  42,   4,  12, 294,
        16,   4,  91,  37, 291,  13,  14,  15,   9, 167, 209, 124, 207,
        25, 132,  48, 156, 290,   3,  84,   1, 255, 111, 383, 242, 124,
         3,  42, 180,   7, 460,  10,   1,   1,  24,   2, 434,  70, 914,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

**Sarcasm detection**

In [21]:
data = pd.read_json('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/sarcasm/Sarcasm_Headlines_Dataset.json', lines=True)

In [22]:
# Iterating through the json data and loading the requisite values into our python lists
sentences = data['headline']
labels = data['is_sarcastic']

In [23]:
training_sentences = sentences[0:SARCASM_TRAINING_SIZE]
testing_sentences = sentences[SARCASM_TRAINING_SIZE:]

In [24]:
training_labels = labels[0:SARCASM_TRAINING_SIZE]
testing_labels = labels[SARCASM_TRAINING_SIZE:]

In [25]:
tokenizer.fit_on_texts(training_sentences)

In [26]:
# Creating training sequences and padding them
training_padded = get_sequences(tokenizer, training_sentences)
testing_padded = get_sequences(tokenizer, testing_sentences)

In [27]:
# Converting all variables to numpy arrays, to be able to work with tf version 2
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [28]:
# Creating a model for sentiment analysis
sarcasm_model = tf.keras.Sequential([
    # Adding an Embedding layer for Neural Network to learn the vectors
    tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SIZE),
    # Global Average pooling is similar to adding up vectors in this case
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [29]:
sarcasm_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

In [30]:
sarcasm_model.fit(training_padded, training_labels, epochs=EPOCHS,
                    validation_data=(testing_padded, testing_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1bc9cee7b50>

In [31]:
sarcasm_prediction_train = sarcasm_model.predict(padded_train)
sarcasm_prediction_test = sarcasm_model.predict(padded_test)



In [32]:
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/sarcasm_model/sarcasm_prediction_train.npy', sarcasm_prediction_train)
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/sarcasm_model/sarcasm_prediction_test.npy', sarcasm_prediction_test)

In [33]:
# sarcasm_prediction_train = np.load('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/sarcasm_model/sarcasm_prediction_train.npy')
# sarcasm_prediction_test = np.load('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/sarcasm_model/sarcasm_prediction_test.npy')

**Reshaping data**


In [34]:
sarcasm_prediction_train.shape

(787500, 1)

In [35]:
padded_train.shape

(787500, 399)

In [36]:
padded_train = np.concatenate((padded_train, np.array(sarcasm_prediction_train.flatten())[:, None]), axis=1)
padded_test = np.concatenate((padded_test, np.array(sarcasm_prediction_test.flatten())[:, None]), axis=1)

In [38]:
padded_train.shape

(787500, 400)

In [39]:
padded_train = np.reshape(padded_train, (1 - math.floor(len(y_train) / 8),int(math.sqrt(MAX_SIZE + 1)), int(math.sqrt(MAX_SIZE + 1))))
padded_test = np.reshape(padded_test, (math.floor(int(len(x_train['review_text']) / 8)), int(math.sqrt(MAX_SIZE + 1)), int(math.sqrt(MAX_SIZE + 1))))

In [None]:
# padded_train = np.reshape(padded_train, (1 - math.floor(int(len(label) / 8)), int(math.sqrt(MAX_SIZE + 1)), int(math.sqrt(MAX_SIZE + 1))))
# padded_test = np.reshape(padded_test, (math.floor(int(len(label) / 8)), int(math.sqrt(MAX_SIZE + 1)), int(math.sqrt(MAX_SIZE + 1))))

In [40]:
# padded_train[0]
padded_train.shape

(787500, 20, 20)

In [41]:
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/test0.npy', padded_train)
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/test1.npy', padded_test)
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/test2.npy', train_labels)
np.save('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/test3.npy', test_labels)

In [None]:
# padded_train = np.load('C:/Users/tomcareghi/Documents/ESGI/4IABD/S1/Deep_Learning/DeepLearning4IABD/src/data/test.npy')

In [42]:
# CNN
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Conv2D(8, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
model.add(tf.keras.layers.Conv2D(8, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
model.add(tf.keras.layers.MaxPool2D())

model.add(tf.keras.layers.Conv2D(16, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
model.add(tf.keras.layers.Conv2D(16, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
model.add(tf.keras.layers.MaxPool2D())

model.add(tf.keras.layers.Conv2D(32, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
model.add(tf.keras.layers.Conv2D(32, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
model.add(tf.keras.layers.MaxPool2D())

# model.add(tf.keras.layers.Conv2D(64, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
# model.add(tf.keras.layers.Conv2D(64, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
# model.add(tf.keras.layers.MaxPool2D())


model.add(tf.keras.layers.Flatten())
# model.add(tf.keras.layers.Dense(64, activation=tf.keras.activations.relu))  # tf.keras.activations.tanh
model.add(tf.keras.layers.Dense(32, activation=tf.keras.activations.relu))  # tf.keras.activations.tanh
model.add(tf.keras.layers.Dense(16, activation=tf.keras.activations.relu))  # tf.keras.activations.tanh
model.add(tf.keras.layers.Dense(6,
                                activation=tf.keras.activations.softmax))  # model.add(tf.keras.layers.Dense(1, activation=tf.keras.activations.softmax))

model.compile(optimizer=tf.keras.optimizers.SGD(0.1, momentum=0.1),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=[tf.keras.metrics.categorical_accuracy])


In [45]:
padded_train = padded_train / NUM_WORDS
padded_test = padded_test / NUM_WORDS
# print(padded_train.shape)
train_labels = tf.keras.utils.to_categorical(train_labels, 6)
# print(padded_train.shape)
test_labels = tf.keras.utils.to_categorical(test_labels, 6)

padded_train = np.expand_dims(padded_train, -1)
padded_test = np.expand_dims(padded_test, -1)
model.fit(
    # padded_train, donner un dataset avec .batch_size tfrecord
    # train_labels,
    epochs=10,
    callbacks=[tf.keras.callbacks.TensorBoard("tensorboard" + "/trash3/")],
    validation_data=(padded_test, test_labels),
    verbose=1,
    # batch_size=1024
)
# keras tuner

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.