# <b>Sentiment Analysis - Sarcasm</b>

In [1]:
#-----------------------------------------------------
# Libraries
#-----------------------------------------------------

# Python 
import pandas as pd
import numpy as np
import json
from src import DataProcessing

# Machine Learning 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# NLP
import nltk 
import spacy
nlp = spacy.load("en_core_web_sm")

### <b> Downloading data directly from Kaggle 

### <b> Loading a json file

In [2]:
# lines=True: the file is read as one JSON record per line.
dataset_path = "data/Sarcasm_Headlines_Dataset_v2.json"
df = pd.read_json(dataset_path, lines=True)

In [3]:
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


# <b> Clean Text

In [None]:
data_cleaning = DataProcessing.DataCleaning()

In [None]:
# Run every headline through data_cleaning.text_cleaning and store the result
# next to the raw text in a new 'clean text' column.
cleaned_headlines = df['headline'].apply(data_cleaning.text_cleaning)
df.loc[:, 'clean text'] = cleaned_headlines

In [None]:
df.head()

# <b> Exploratory Data Analysis

In [None]:
df.isnull().sum()

## <b> Label Distribution

In [None]:
# Horizontal bar chart of how many headlines carry each is_sarcastic value.
ax = df['is_sarcastic'].value_counts().plot(kind='barh')
# Title and axis labels so the figure can be read on its own.
ax.set(
    title='Label distribution',
    xlabel='Number of headlines',
    ylabel='is_sarcastic',
);

## <b> Text Visualization

In [None]:
visualization = DataProcessing.Visualization(df, 'clean text')

### Words Frequency

In [None]:
# words frequency in a dataframe
df_frequency = visualization.words_frequency()

In [None]:
df_frequency[['words', 'freq']].head()

In [None]:
df_frequency[['words', 'freq']].tail()

###  Word Cloud

In [None]:
visualization.word_cloud()

### <b>TF-IDF</b>

In [None]:
tf_idf_df = visualization.tf_idf_weights()

tf_idf_df.head()

In [None]:
tf_idf_df.tail()

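Note: the corpus contains many misspelled words. Because each misspelling is rare, it receives a high TF-IDF weight, so misspellings dominate the top of the weight table.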

### <b>POS Tags</b>

In [None]:
# Parse each cleaned headline with spaCy, looking only at its first 100 characters.
truncated_headlines = (headline[:100] for headline in df['clean text'])
doc = [nlp(snippet) for snippet in truncated_headlines]

In [None]:
# Collect token texts per coarse part-of-speech tag; tokens with any other tag are ignored.
noun_pos_tags = []
prop_pos_tags = []
verb_pos_tags = []

# spaCy coarse tag -> list that receives the matching token texts
pos_buckets = {
    'NOUN': noun_pos_tags,
    'PROPN': prop_pos_tags,
    'VERB': verb_pos_tags,
}

for parsed_headline in doc:
    for token in parsed_headline:
        bucket = pos_buckets.get(token.pos_)
        if bucket is not None:
            bucket.append(token.text)

In [None]:
len(noun_pos_tags), len(prop_pos_tags), len(verb_pos_tags)

In [None]:
visualization.word_cloud(noun_pos_tags) 

In [None]:
visualization.word_cloud(prop_pos_tags)

In [None]:
visualization.word_cloud(verb_pos_tags)

# <b> Train a Neural Network

### <b> Configuring parameters

In [None]:
# Hyper-parameters shared by the tokenizer and the network defined further down.
vocab_size = 1000       # passed to Tokenizer(num_words=...) and Embedding(input_dim=...)
embedding_dim = 50      # size of each learned word vector (Embedding output_dim)
max_length = 300        # every encoded sentence is padded/truncated to this many tokens
padding_type='post'     # intended padding side for pad_sequences
oov_tok = "<OOV>"       # token the Tokenizer substitutes for out-of-vocabulary words

# Model inputs (cleaned headlines) and targets (is_sarcastic column).
sentences = df['clean text']
labels = df['is_sarcastic']

In [None]:
# First split: 70% train / 30% holdout.
sentence_train, sentence_holdout, label_train, label_holdout = train_test_split(
    sentences,
    labels,
    test_size=0.3,
    random_state=42,
)

# Second split: the holdout is divided 70% validation / 30% test.
sentence_valid, sentence_test, label_valid, label_test = train_test_split(
    sentence_holdout,
    label_holdout,
    test_size=0.3,
    random_state=42,
)

In [None]:
len(sentence_train), len(sentence_valid), len(sentence_test)

In [None]:
len(label_train),  len(label_valid),  len(label_test)

### <b> Encoding

In [None]:
# The vocabulary is fitted on the training sentences only; validation and test
# text is encoded later with this same tokenizer.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(sentence_train)

# word -> integer index mapping learned from the training sentences
vocabulary = tokenizer.word_index

In [None]:
# Features
train_sequences = tokenizer.texts_to_sequences(sentence_train)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_length, padding='post')

valid_sequences = tokenizer.texts_to_sequences(sentence_valid)
valid_padded_sequences = pad_sequences(valid_sequences, maxlen=max_length, padding='post')

test_sequences = tokenizer.texts_to_sequences(sentence_test)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length, padding='post')

In [None]:
# sentence_train keeps the shuffled original index after train_test_split, so
# sentence_train[0] would look up *label* 0 (possibly absent from the training
# split, and in any case not the first row). .iloc[0] selects the first row by
# position, which is the one that train_sequences[0] was built from.
print("Original sentence = ", sentence_train.iloc[0])
print("Tokenized sentence = ", train_sequences[0])
print("Padded sentence = ", train_padded_sequences[0])

In [None]:
len(train_padded_sequences), len(valid_padded_sequences), len(test_padded_sequences)

### <b> Convolutional Neural Network
  

In [None]:
from tensorflow.keras import regularizers

# Input: integer-encoded sentences padded to max_length tokens.
sentence_input = tf.keras.Input(shape=(max_length,))

# input_length is omitted: the Input layer above already fixes the sequence
# length, and the argument is deprecated in recent Keras releases.
embeddings_layer = tf.keras.layers.Embedding(input_dim=vocab_size,
                                             output_dim=embedding_dim,
                                             name='embedding')
x = embeddings_layer(sentence_input)
# 128 filters over windows of 3 consecutive tokens, then max over the sequence.
x = tf.keras.layers.Conv1D(128, 3, activation="relu")(x)
x = tf.keras.layers.GlobalMaxPool1D()(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
# Two softmax units (one per class); paired below with the sparse categorical
# loss, which takes integer class ids as targets.
output = tf.keras.layers.Dense(2, activation='softmax')(x)

cnn_model = tf.keras.Model(sentence_input, output, name="CNN_classifier")
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras rejects.
cnn_model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                  metrics=['accuracy'])
cnn_model.summary()

In [None]:
# Recompile with the default Adam optimizer. The loss must stay
# sparse_categorical_crossentropy: the network ends in a 2-unit softmax and the
# targets are integer class ids. binary_crossentropy expects predictions and
# targets of the same shape, so here it would either fail on the shape mismatch
# or silently train against a broadcast target.
cnn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the CNN and keep the per-epoch training/validation history.
num_epochs = 10
history = cnn_model.fit(
    x=train_padded_sequences,
    y=label_train,
    batch_size=2048,
    epochs=num_epochs,
    validation_data=(valid_padded_sequences, label_valid),
    verbose=1,
)

### <b> Metrics

In [None]:
from sklearn.metrics import plot_confusion_matrix, classification_report, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
import seaborn as sea

In [None]:
# The trained network is `cnn_model`; no variable named `model` is defined in
# this notebook, so the original call raised NameError on a fresh kernel.
y_prob = cnn_model.predict(test_padded_sequences)
# Predicted class id = index of the largest class probability in each row.
y_pred = np.argmax(y_prob, axis=1)
y_test = label_test

In [None]:
len(y_pred), len(y_test)

In [None]:
# Display names in class-id order: is_sarcastic == 0 -> 'Not Sarcastic',
# is_sarcastic == 1 -> 'Sarcastic'. scikit-learn orders report rows and
# confusion-matrix rows/columns by sorted label, so the names must follow that
# order; `labels=class_ids` pins it explicitly.
class_ids = [0, 1]
x_y_labels = ['Not Sarcastic', 'Sarcastic']

# average='binary' scores the positive class (1 = sarcastic). With
# average='micro' on a single-label problem, precision, recall and F1 all
# collapse to the accuracy value and add no information.
model_metrics = {
    'accuracy': round(accuracy_score(y_test, y_pred), 3),
    'f1': round(f1_score(y_test, y_pred, average='binary', zero_division=0), 3),
    'recall': round(recall_score(y_test, y_pred, average='binary', zero_division=0), 3),
    'precision': round(precision_score(y_test, y_pred, average='binary', zero_division=0), 3),
}

report = classification_report(y_test, y_pred, labels=class_ids,
                               target_names=x_y_labels, zero_division=0)
print(report)

# Rows are true classes, columns are predicted classes.
cfm = confusion_matrix(y_test, y_pred, labels=class_ids)
cf = sea.heatmap(cfm, annot=True, cmap='Greens', fmt='', cbar=False,
                 xticklabels=x_y_labels, yticklabels=x_y_labels)
cf.set(xlabel='Predicted label', ylabel='True label');