**Import libraries**

In [1]:
import math
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

**Default parameters**

In [2]:
MAX_SIZE = 783
NUM_WORDS = 1000
EMBEDDING_DIM = 16
EPOCHS = 10
BATCH_SIZE = 512
OOV = 0
SARCASM_TRAINING_SIZE = 20000

**Creating DataFrames**

In [3]:
goodreads_train = pd.read_csv("kaggle/input/goodreads-books-reviews-290312/goodreads_train.csv")
goodreads_test = pd.read_csv("kaggle/input/goodreads-books-reviews-290312/goodreads_test.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'kaggle/input/goodreads-books-reviews-290312/goodreads_train.csv'

In [None]:
goodreads_train.sample(5)

In [None]:
goodreads_test.sample(5)

**Cleaning Data**

In [None]:
train_df = goodreads_train.drop(columns=['user_id', 'book_id', 'date_added', 'date_updated', 'read_at', 'started_at'],
                                axis=0)
train_df.head()

In [None]:
train_df.dtypes

In [None]:
y_train = train_df['rating']
x_train = train_df.drop('rating', axis=1)

In [None]:
x_train.head()

In [None]:
y_train.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [None]:
x_train['review_id'] = le.fit_transform(x_train['review_id'])

In [13]:
# x_train.head()
x_train[0]

Unnamed: 0,review_id,review_text,n_votes,n_comments
0,786842,This is a special book. It started slow for ab...,28,1
1,583423,Recommended by Don Katz. Avail for free in Dec...,1,0
2,165147,"A fun, fast paced science fiction thriller. I ...",22,0
3,727692,Recommended reading to understand what is goin...,5,1
4,179941,"I really enjoyed this book, and there is a lot...",9,1


**NLP**

In [14]:
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token=OOV)

In [15]:
def get_sequences(tokenizer, review):
    sequences = tokenizer.texts_to_sequences(review)
    padded_sequences = pad_sequences(sequences, truncating='post', maxlen=MAX_SIZE, padding='post')
    return padded_sequences

In [16]:
def tokenizer_func(data_rating, data_review):
    tokenizer.fit_on_texts(data_review)

    train_labels = data_rating.iloc[math.floor(int(len(data_rating) / 8)):]
    train_examples = data_review.iloc[math.floor(int(len(data_review) / 8)):]
    test_examples = data_review.iloc[:math.floor(int(len(data_review) / 8))]
    test_labels = data_rating.iloc[:math.floor(int(len(data_rating) / 8))]

    padded_train = get_sequences(tokenizer, train_examples)
    padded_test = get_sequences(tokenizer, test_examples)

    return np.array(padded_train), np.array(padded_test), np.array(train_labels), np.array(test_labels)

In [17]:
padded_train, padded_test, train_labels, test_labels = tokenizer_func(y_train, x_train['review_text'])

In [18]:
padded_train[0]

array([  4,  12, 294,  16,   6,   1, 566,  31,   3,   9,  10, 456,  42,
         4, 151,  88, 597,   1,  59,  10,   6,   1,   7,   6,   1,   1,
        21,  10, 635,  16,   2,   1,   1, 459,  45,  34, 185,   1,  27,
         6, 428,   5,   1,   2, 223,   7,   2, 888,   1,   1, 388,  42,
        80,   4, 134,  47,   1, 362,   3, 566,  32,  42,   4,  12, 294,
        16,   4,  91,  37, 291,  13,  14,  15,   9, 167, 209, 124, 207,
        25, 132,  48, 156, 290,   3,  84,   1, 255, 111, 383, 242, 124,
         3,  42, 180,   7, 460,  10,   1,   1,  24,   2, 434,  70, 914,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

**Sarcasm detection**

In [19]:
data = pd.read_json('kaggle/input/sarcasmjson/sarcasm.json', lines=True)

In [20]:
# Iterating through the json data and loading the requisite values into our python lists
sentences = data['headline']
labels = data['is_sarcastic']

In [21]:
training_sentences = sentences[0:SARCASM_TRAINING_SIZE]
testing_sentences = sentences[SARCASM_TRAINING_SIZE:]

In [22]:
training_labels = labels[0:SARCASM_TRAINING_SIZE]
testing_labels = labels[SARCASM_TRAINING_SIZE:]

In [23]:
tokenizer.fit_on_texts(training_sentences)

In [24]:
# Creating training sequences and padding them
training_padded = get_sequences(tokenizer, training_sentences)
testing_padded = get_sequences(tokenizer, testing_sentences)

In [25]:
# Converting all variables to numpy arrays, to be able to work with tf version 2
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [26]:
# Creating a model for sentiment analysis
sarcasm_model = tf.keras.Sequential([
    # Adding an Embedding layer for Neural Network to learn the vectors
    tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SIZE),
    # Global Average pooling is similar to adding up vectors in this case
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [27]:
sarcasm_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

In [28]:
sarcasm_model.fit(training_padded, training_labels, epochs=EPOCHS,
                    validation_data=(testing_padded, testing_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ce9e90ff10>

In [29]:
sarcasm_prediction_train = sarcasm_model.predict(padded_train)
sarcasm_prediction_test = sarcasm_model.predict(padded_test)



**Reshaping data**


In [30]:
sarcasm_prediction_train.shape

(787500, 1)

In [31]:
padded_train.shape

(787500, 783)

In [32]:
padded_train = np.concatenate((padded_train, np.array(sarcasm_prediction_train.flatten())[:, None]), axis=1)
padded_test = np.concatenate((padded_test, np.array(sarcasm_prediction_test.flatten())[:, None]), axis=1)

In [33]:
padded_train.shape

(787500, 784)

In [34]:
padded_train = np.reshape(padded_train, (1 - math.floor(len(y_train) / 8), 28, 28))
padded_test = np.reshape(padded_test, (math.floor(int(len(x_train['review_text']) / 8)), 28, 28))

In [35]:
padded_train[0]

array([[4.0000000e+00, 1.2000000e+01, 2.9400000e+02, 1.6000000e+01,
        6.0000000e+00, 1.0000000e+00, 5.6600000e+02, 3.1000000e+01,
        3.0000000e+00, 9.0000000e+00, 1.0000000e+01, 4.5600000e+02,
        4.2000000e+01, 4.0000000e+00, 1.5100000e+02, 8.8000000e+01,
        5.9700000e+02, 1.0000000e+00, 5.9000000e+01, 1.0000000e+01,
        6.0000000e+00, 1.0000000e+00, 7.0000000e+00, 6.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 2.1000000e+01, 1.0000000e+01],
       [6.3500000e+02, 1.6000000e+01, 2.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 4.5900000e+02, 4.5000000e+01, 3.4000000e+01,
        1.8500000e+02, 1.0000000e+00, 2.7000000e+01, 6.0000000e+00,
        4.2800000e+02, 5.0000000e+00, 1.0000000e+00, 2.0000000e+00,
        2.2300000e+02, 7.0000000e+00, 2.0000000e+00, 8.8800000e+02,
        1.0000000e+00, 1.0000000e+00, 3.8800000e+02, 4.2000000e+01,
        8.0000000e+01, 4.0000000e+00, 1.3400000e+02, 4.7000000e+01],
       [1.0000000e+00, 3.6200000e+02, 3.000000

**CNN Training**

In [36]:
cnn_model = tf.keras.models.Sequential()

In [37]:
cnn_model.add(tf.keras.layers.Conv2D(8, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
cnn_model.add(tf.keras.layers.Conv2D(8, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
cnn_model.add(tf.keras.layers.MaxPool2D())

cnn_model.add(tf.keras.layers.Conv2D(16, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
cnn_model.add(tf.keras.layers.Conv2D(16, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
cnn_model.add(tf.keras.layers.MaxPool2D())

cnn_model.add(tf.keras.layers.Conv2D(32, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
cnn_model.add(tf.keras.layers.Conv2D(32, (3, 3), activation=tf.keras.activations.tanh, padding='same'))
cnn_model.add(tf.keras.layers.MaxPool2D())

cnn_model.add(tf.keras.layers.Flatten())
cnn_model.add(tf.keras.layers.Dense(32, activation=tf.keras.activations.relu))  # tf.keras.activations.tanh
cnn_model.add(tf.keras.layers.Dense(16, activation=tf.keras.activations.relu))  # tf.keras.activations.tanh
cnn_model.add(tf.keras.layers.Dense(6,
                                    activation=tf.keras.activations.softmax))  # model.add(tf.keras.layers.Dense(1, activation=tf.keras.activations.softmax))

In [38]:
cnn_model.compile(optimizer=tf.keras.optimizers.SGD(0.1, momentum=0.9),
                  loss=tf.keras.losses.categorical_crossentropy,
                  metrics=[tf.keras.metrics.categorical_accuracy])

In [39]:
cnn_model.fit(padded_train,
              train_labels,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS
              )

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

**Testing model**

In [None]:
padded_train = padded_train / 255.0
padded_test = padded_test / 255.0

train_labels = tf.keras.utils.to_categorical(train_labels, 6)
test_labels = tf.keras.utils.to_categorical(test_labels, 6)

padded_train = np.expand_dims(padded_train, -1)
padded_test = np.expand_dims(padded_test, -1)

cnn_predict = cnn_model.predict(padded_train)

In [None]:
cnn_model.summary()

**Submission**

In [None]:
sample_submission = pd.read_csv("../input/goodreads-books-reviews-290312/goodreads_sample_submission.csv")
preds = np.argmax(cnn_predict, axis=1)
sample_submission.rating = preds
sample_submission.to_csv("submission.csv", index=False)