**Import libraries**

In [None]:
import math
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

**Default parameters**

In [None]:
# Side length of the square grid each review is reshaped into before the CNN.
IMAGE_SIDE = 28
# Tokens kept per review. One slot of the grid is reserved for the sarcasm score
# that is appended as an extra column later on, hence "- 1" (28 * 28 - 1 = 783).
MAX_SIZE = IMAGE_SIDE * IMAGE_SIDE - 1
# Vocabulary cap shared by the tokenizer and the sarcasm model's Embedding layer.
NUM_WORDS = 1000
# Width of the learned word vectors in the sarcasm model.
EMBEDDING_DIM = 16
# Number of training epochs (used for both the sarcasm model and the CNN).
EPOCHS = 10
# Mini-batch size used when fitting the CNN.
BATCH_SIZE = 512
# Value handed to the tokenizer as its out-of-vocabulary token.
# NOTE(review): tokenizer OOV tokens are conventionally strings such as "<OOV>" - confirm 0 is intended.
OOV = 0
# Number of leading sarcasm headlines used for training; the rest are used for validation.
SARCASM_TRAINING_SIZE = 20000

**Creating DataFrames**

In [None]:
# Folder holding the competition CSV files.
# NOTE(review): the submission cell reads the sample submission from "../input/..." while
# this cell uses "kaggle/input/..." - confirm which prefix matches the runtime environment.
goodreads_dir = "kaggle/input/goodreads-books-reviews-290312"

# Labelled reviews (training) and unlabelled reviews (competition test set)
goodreads_train = pd.read_csv(f"{goodreads_dir}/goodreads_train.csv")
goodreads_test = pd.read_csv(f"{goodreads_dir}/goodreads_test.csv")

In [None]:
goodreads_train.sample(5)

In [None]:
goodreads_test.sample(5)

**Cleaning Data**

In [None]:
# Identifier and timestamp columns that are not used as model inputs
unused_columns = ['user_id', 'book_id', 'date_added', 'date_updated', 'read_at', 'started_at']

# drop() returns a new frame, so goodreads_train itself is left untouched
train_df = goodreads_train.drop(columns=unused_columns)
train_df.head()

In [None]:
train_df.dtypes

In [None]:
# Features: every remaining column except the target
x_train = train_df.drop(columns=['rating'])
# Target: the rating column
y_train = train_df['rating']

In [None]:
x_train.head()

In [None]:
y_train.head()

In [None]:
# NOTE(review): this import sits in the middle of the notebook; consider moving it
# to the import cell at the top so all dependencies are visible in one place.
from sklearn.preprocessing import LabelEncoder

# Encoder applied to the review_id column in the next cell
le = LabelEncoder()

In [None]:
# Replace the raw review ids with the integer codes produced by the label encoder
encoded_review_ids = le.fit_transform(x_train['review_id'])
x_train['review_id'] = encoded_review_ids

In [None]:
x_train.head()

**NLP**

In [None]:
# Tokenizer shared by the cells below: vocabulary capped via NUM_WORDS,
# with OOV passed as the out-of-vocabulary token
tokenizer = Tokenizer(oov_token=OOV, num_words=NUM_WORDS)

In [None]:
def get_sequences(tokenizer, review):
    """Encode texts as fixed-length integer sequences.

    Args:
        tokenizer: tokenizer whose ``texts_to_sequences`` is used for the encoding.
        review: the texts to encode.

    Returns:
        The encoded sequences, padded and truncated at the end ('post')
        so that each one has length ``MAX_SIZE``.
    """
    encoded = tokenizer.texts_to_sequences(review)
    return pad_sequences(encoded, maxlen=MAX_SIZE, padding='post', truncating='post')

In [None]:
def tokenizer_func(data_rating, data_review):
    """Fit the module-level tokenizer and split the data into train / hold-out parts.

    The tokenizer is fitted on all of ``data_review``. The first eighth of the
    rows (rounded down) becomes the hold-out ("test") part, the rest the
    training part. Reviews are encoded with ``get_sequences``.

    Args:
        data_rating: ratings, positionally aligned with ``data_review``.
        data_review: review texts.

    Returns:
        Tuple ``(padded_train, padded_test, train_labels, test_labels)`` of NumPy arrays.
    """
    tokenizer.fit_on_texts(data_review)

    # Split positions: one eighth of each input, rounded down
    rating_cut = len(data_rating) // 8
    review_cut = len(data_review) // 8

    test_examples, train_examples = data_review.iloc[:review_cut], data_review.iloc[review_cut:]
    test_labels, train_labels = data_rating.iloc[:rating_cut], data_rating.iloc[rating_cut:]

    encoded_train = get_sequences(tokenizer, train_examples)
    encoded_test = get_sequences(tokenizer, test_examples)

    return (
        np.array(encoded_train),
        np.array(encoded_test),
        np.array(train_labels),
        np.array(test_labels),
    )

In [None]:
# Fit the tokenizer on the review texts and build the encoded train / hold-out splits
padded_train, padded_test, train_labels, test_labels = tokenizer_func(
    data_rating=y_train,
    data_review=x_train['review_text'],
)

In [None]:
padded_train[0]

**Sarcasm detection**

In [None]:
# Load the sarcasm dataset; lines=True reads the file as one JSON record per line
data = pd.read_json('kaggle/input/sarcasmjson/sarcasm.json', lines=True)

In [None]:
# Select the headline text and the sarcasm flag from the frame (as two pandas Series)
sentences, labels = data['headline'], data['is_sarcastic']

In [None]:
# First SARCASM_TRAINING_SIZE headlines train the sarcasm model, the remainder validates it
training_sentences = sentences.iloc[:SARCASM_TRAINING_SIZE]
testing_sentences = sentences.iloc[SARCASM_TRAINING_SIZE:]

In [None]:
# Split the labels at the same position as the headlines
training_labels = labels.iloc[:SARCASM_TRAINING_SIZE]
testing_labels = labels.iloc[SARCASM_TRAINING_SIZE:]

In [None]:
# Intentionally do NOT call tokenizer.fit_on_texts(training_sentences) here.
# The review sequences (padded_train / padded_test) were already encoded with this
# tokenizer. Fitting it again on the headlines updates its word counts and can
# re-rank the vocabulary, so the sarcasm model would be trained on a different
# word -> index mapping than the review sequences it is later asked to score.
# The headlines are therefore encoded with the vocabulary learned from the reviews.
assert tokenizer.word_index, "tokenizer must already be fitted on the review texts"

In [None]:
# Encode both headline splits as padded integer sequences of length MAX_SIZE
training_padded, testing_padded = (
    get_sequences(tokenizer, training_sentences),
    get_sequences(tokenizer, testing_sentences),
)

In [None]:
# Keras expects NumPy arrays, so convert the inputs and labels of both splits
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [None]:
# Small binary classifier used for sarcasm detection
sarcasm_model = tf.keras.Sequential()
# Embedding layer: learns an EMBEDDING_DIM-wide vector for each of the NUM_WORDS token ids
sarcasm_model.add(tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SIZE))
# Average the word vectors over the sequence axis to get one vector per text
sarcasm_model.add(tf.keras.layers.GlobalAveragePooling1D())
sarcasm_model.add(tf.keras.layers.Dense(24, activation='relu'))
# Single sigmoid unit: output is a value between 0 and 1
sarcasm_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary target -> binary cross-entropy; accuracy is reported during training
sarcasm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the first headline split and validate on the remainder after every epoch
sarcasm_validation = (testing_padded, testing_labels)
sarcasm_model.fit(
    x=training_padded,
    y=training_labels,
    validation_data=sarcasm_validation,
    epochs=EPOCHS,
)

In [None]:
# Score every encoded review with the sarcasm model (training split first, then hold-out)
sarcasm_prediction_train, sarcasm_prediction_test = (
    sarcasm_model.predict(review_split) for review_split in (padded_train, padded_test)
)

**Reshaping data**


In [None]:
sarcasm_prediction_train.shape

In [None]:
padded_train.shape

In [None]:
# Append the sarcasm score as one extra feature column per review
# (MAX_SIZE token ids + 1 score = 784 values = one 28 x 28 grid).
#
# Both parts are cast to float32 before concatenating: mixing integer token ids with
# float predictions would otherwise promote the whole (very large) matrix to float64
# and double its memory footprint. Token ids stay below NUM_WORDS, so float32 holds
# them exactly.
# NOTE(review): this cell rebinds padded_train / padded_test, so running it twice
# appends a second column.
padded_train = np.concatenate(
    (padded_train.astype(np.float32), sarcasm_prediction_train.reshape(-1, 1).astype(np.float32)),
    axis=1,
)
padded_test = np.concatenate(
    (padded_test.astype(np.float32), sarcasm_prediction_test.reshape(-1, 1).astype(np.float32)),
    axis=1,
)

In [None]:
padded_train.shape

In [None]:
# Turn every 784-value row into a 28 x 28 grid for the CNN.
# The sample count is inferred with -1 instead of being recomputed by hand: the previous
# expression for the training split (1 - floor(len / 8)) evaluated to a negative number
# of samples, which np.reshape rejects.
padded_train = np.reshape(padded_train, (-1, 28, 28))
padded_test = np.reshape(padded_test, (-1, 28, 28))

In [None]:
padded_train[0]

**CNN Training**

In [None]:
# Empty sequential container; the layers are added in the next cell.
# NOTE(review): re-running the next cell without re-running this one stacks the layers twice.
cnn_model = tf.keras.models.Sequential()

In [None]:
# Three convolutional stages with 8, 16 and 32 filters.
# Each stage: two 3x3 tanh convolutions ('same' padding) followed by max pooling.
for stage_filters in (8, 16, 32):
    for conv_index in range(2):
        cnn_model.add(
            tf.keras.layers.Conv2D(
                stage_filters,
                (3, 3),
                activation=tf.keras.activations.tanh,
                padding='same',
            )
        )
    cnn_model.add(tf.keras.layers.MaxPool2D())

# Classifier head: flatten, two ReLU dense layers, then a 6-way softmax output
cnn_model.add(tf.keras.layers.Flatten())
for dense_units in (32, 16):
    cnn_model.add(tf.keras.layers.Dense(dense_units, activation=tf.keras.activations.relu))
cnn_model.add(tf.keras.layers.Dense(6, activation=tf.keras.activations.softmax))

In [None]:
# SGD with momentum; categorical cross-entropy matches the 6-way softmax output
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
cnn_model.compile(
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=[tf.keras.metrics.categorical_accuracy],
    optimizer=sgd_optimizer,
)

In [None]:
# Build the training tensors in the form the network and its loss require:
#   * Conv2D needs a trailing channel axis: (samples, 28, 28) -> (samples, 28, 28, 1)
#   * categorical cross-entropy with a 6-unit softmax needs one-hot labels
#   * the inputs are scaled by 255.0, the same factor the later "Testing model" cell
#     applies before calling predict, so training and inference see the same value range
# Local names are used so that padded_train / train_labels are not modified here.
cnn_fit_inputs = np.expand_dims(padded_train, -1) / 255.0
cnn_fit_targets = tf.keras.utils.to_categorical(train_labels, 6)

cnn_model.fit(cnn_fit_inputs,
              cnn_fit_targets,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS
              )

**Testing model**

In [None]:
# Scale the grids by a fixed factor.
# NOTE(review): 255.0 is the usual pixel range; the values here are token ids
# (bounded by NUM_WORDS) plus one sarcasm score - confirm this factor is intended.
# NOTE(review): this cell rebinds its inputs, so it must be run exactly once per kernel session.
padded_train = padded_train / 255.0
padded_test = padded_test / 255.0

# One-hot encode the ratings to match the 6-way softmax output
train_labels = tf.keras.utils.to_categorical(train_labels, 6)
test_labels = tf.keras.utils.to_categorical(test_labels, 6)

# Add the trailing channel axis expected by Conv2D
padded_train = np.expand_dims(padded_train, -1)
padded_test = np.expand_dims(padded_test, -1)

# Class probabilities for the training split
cnn_predict = cnn_model.predict(padded_train)

# Measure performance on the hold-out split, which was previously prepared but never used;
# the [loss, categorical_accuracy] result is displayed as the cell output
cnn_model.evaluate(padded_test, test_labels, batch_size=BATCH_SIZE)

In [None]:
cnn_model.summary()

**Submission**

In [None]:
# NOTE(review): this path uses "../input/..." while the data-loading cell uses
# "kaggle/input/..." - confirm which prefix matches the runtime environment.
sample_submission = pd.read_csv("../input/goodreads-books-reviews-290312/goodreads_sample_submission.csv")

# The submission must contain predictions for the competition test reviews, not for the
# training rows, so goodreads_test is pushed through the same feature pipeline:
# token ids -> sarcasm score appended -> 28 x 28 x 1 grid -> scaled by 255.0.
# Assumes goodreads_test has a 'review_text' column like the training frame - TODO confirm.
submission_tokens = get_sequences(tokenizer, goodreads_test['review_text'])
submission_sarcasm = sarcasm_model.predict(submission_tokens)
submission_features = np.concatenate(
    (submission_tokens.astype(np.float32), submission_sarcasm.reshape(-1, 1).astype(np.float32)),
    axis=1,
)
submission_inputs = np.expand_dims(submission_features.reshape(-1, 28, 28), -1) / 255.0

# Predicted rating = index of the most probable class
preds = np.argmax(cnn_model.predict(submission_inputs), axis=1)

# Presumably the sample submission lists the reviews in the same order as goodreads_test;
# verify before submitting. The length check catches an outright mismatch.
assert len(preds) == len(sample_submission), "prediction count does not match the submission template"
sample_submission['rating'] = preds
sample_submission.to_csv("submission.csv", index=False)