# Classification with CNNs and word embeddings

Code courtesy of [this guy](https://github.com/dipanjanS/nlp_workshop_odsc19/blob/master/Module05%20-%20NLP%20Applications/Project07B%20-%20Text%20Classification%20Deep%20Learning%20CNN%20Models.ipynb)

In [None]:
# setup script wasn't working in class - uncomment if you get errors loading modules
#!pip install nltk beautifulsoup4 contractions tensorflow scikit-learn

In [None]:
# simple text processing tools
import re
import tqdm
import unicodedata
import contractions
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')

# data wrangling
import pandas as pd
import numpy as np

# tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, 
                                    Flatten,
                                    Conv1D, 
                                    MaxPooling1D, 
                                    Embedding)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.regularizers import L2

# scikit-learn
from sklearn.metrics import (confusion_matrix, 
                            classification_report)
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# visualisations 
import matplotlib.pyplot as plt
%matplotlib inline


# Fix the random seeds for reproducibility. Seeding NumPy alone is not enough:
# Keras weight initialisation draws from TensorFlow's own generator, so that
# generator is seeded with the same value.
import tensorflow as tf

seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

## Helper functions for text processing

In [None]:
def strip_html_tags(text):
    """Return the text content of an HTML fragment.

    ``<iframe>`` and ``<script>`` elements are removed together with their
    contents, the remaining markup is dropped, and every run of carriage
    returns / line feeds is collapsed into a single newline.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Detach embedded frames and scripts so their contents do not end up in
    # the extracted text.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # '|' is deliberately absent from the class: inside [...] it is a literal
    # pipe, not alternation, and would turn pipes in the text into newlines.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    """Return *text* reduced to plain ASCII.

    The string is decomposed with NFKD so that accented letters split into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops the marks (and any other non-ASCII character).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
    """Normalise an iterable of raw text documents.

    Each document is stripped of HTML, lower-cased, reduced to ASCII, has its
    contractions expanded, loses every character that is not a letter, digit
    or whitespace, and finally has runs of spaces collapsed and surrounding
    whitespace trimmed. A tqdm progress bar is shown while iterating.

    Returns a list of normalised strings in the same order as *docs*.
    """
    # Build the newline/tab -> space table once instead of once per document.
    whitespace_table = str.maketrans("\n\t\r", "   ")

    norm_docs = []
    for doc in tqdm.tqdm(docs):
        doc = strip_html_tags(doc)
        doc = doc.translate(whitespace_table)
        doc = doc.lower()
        doc = remove_accented_chars(doc)
        doc = contractions.fix(doc)
        # Remove special characters. The flags must be passed by keyword:
        # the fourth positional argument of re.sub is `count`, so passing
        # them positionally would cap the number of substitutions instead.
        doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
        # collapse repeated spaces and trim the ends
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()
        norm_docs.append(doc)

    return norm_docs

## Import data

In [None]:
# Download the bz2-compressed movie review CSV and print a column summary.
dataset = pd.read_csv(
    r'https://github.com/dipanjanS/nlp_workshop_dhs18/raw/master/'
    r'Unit%2011%20-%20Sentiment%20Analysis%20-%20Unsupervised%20Learning/'
    r'movie_reviews.csv.bz2',
    compression='bz2',
)
dataset.info()

In [None]:
# preview the first rows of the loaded DataFrame
dataset.head()

## Manually split data

In [None]:
# Positional split: the first 35,000 rows are used for training and every
# remaining row for testing (no shuffling).
reviews, sentiments = dataset['review'].values, dataset['sentiment'].values

X_train, X_test = reviews[:35000], reviews[35000:]
y_train, y_test = sentiments[:35000], sentiments[35000:]

This could also be done with scikit-learn's `train_test_split` function.

## Clean and normalize data

In [None]:
# Run the same cleaning pipeline over the training and then the test reviews.
X_train_norm, X_test_norm = map(pre_process_corpus, (X_train, X_test))

## Preprocessing

In [None]:
# Build the vocabulary from the training documents only; words that were not
# seen while fitting are mapped to the '<UNK>' out-of-vocabulary token.
t = Tokenizer(oov_token='<UNK>')
t.fit_on_texts(X_train_norm)

# register the padding token under index 0
t.word_index.update({"<PAD>": 0})

### Tokenize sequences

In [None]:
# Convert each normalised document into its list of vocabulary indices,
# training split first, then the test split.
X_train_seqs, X_test_seqs = [
    t.texts_to_sequences(docs) for docs in (X_train_norm, X_test_norm)
]

In [None]:
# report how many index entries and how many fitted documents the tokenizer holds
print(
    f"Vocabulary size={len(t.word_index)}",
    f"Number of Documents={t.document_count}",
    sep="\n",
)

In [None]:
# Plot the distribution of sequence lengths (tokens per document) for the
# training split on the left and the test split on the right.
train_lens = list(map(len, X_train_seqs))
test_lens = list(map(len, X_test_seqs))

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
h1 = ax[0].hist(train_lens)
h2 = ax[1].hist(test_lens)

## Sequence normalization

In [None]:
# length (in tokens) passed as `maxlen` when padding the sequences and as the
# embedding layer's input length
MAX_SEQUENCE_LENGTH = 1000

In [None]:
# Bring every sequence to MAX_SEQUENCE_LENGTH, adding the padding value at
# the end ("post") of sequences that are too short.
X_train_pad, X_test_pad = (
    sequence.pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
    for seqs in (X_train_seqs, X_test_seqs)
)

In [None]:
# sanity check: display the shapes of the padded training and test arrays
X_train_pad.shape, X_test_pad.shape

## Encoding labels

In [None]:
# Encode the string labels as a single 0/1 column. With two classes
# LabelBinarizer produces one column rather than a one-hot matrix, which
# matches the single sigmoid output unit of the model.
lb = LabelBinarizer()
y_train_lb = lb.fit_transform(y_train)
# Reuse the mapping learned from the training labels; refitting on the test
# labels could yield a different label-to-integer assignment.
y_test_lb = lb.transform(y_test)

## Create and compile model

In [None]:
# define parameters for the model
# overall vocabulary size (number of entries in the tokenizer's word index,
# including the '<UNK>' and '<PAD>' entries)
VOCAB_SIZE = len(t.word_index)
# number of dimensions for embeddings
EMBED_SIZE = 300
# number of epochs to train for
EPOCHS = 2
# batch size for training
BATCH_SIZE = 128

In [None]:
# Build the CNN as a single layer list: an embedding layer, three
# convolution + max-pooling stages with a decreasing number of filters, and a
# fully-connected head ending in one sigmoid unit for binary classification.
model = Sequential([
    # embedding layer
    Embedding(VOCAB_SIZE, EMBED_SIZE, input_length=MAX_SEQUENCE_LENGTH),

    # first convolution layer and pooling
    Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),

    # second convolution layer and pooling
    Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),

    # third convolution layer and pooling
    Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),

    # fully-connected classification layers
    Flatten(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# print model summary
model.summary()

## Train

In [None]:
# Train on the padded sequences and binarized labels, holding out 10% of the
# training data for validation.
model.fit(
    x=X_train_pad,
    y=y_train_lb,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=True,
    validation_split=0.1,
)

## Evaluate

In [None]:
# Score the trained model on the test set. The model was compiled with a
# single metric ('accuracy'), which evaluate() returns after the loss.
scores = model.evaluate(x=X_test_pad, y=y_test_lb, verbose=1)
print(f"Accuracy: {scores[1]}")

In [None]:
# Apply the 0.5 decision boundary to the predicted probabilities ...
predictions = model.predict(X_test_pad) > 0.5
# ... and translate the boolean column into a flat list of label strings.
predictions = np.where(predictions, 'positive', 'negative').ravel().tolist()
predictions[:10]

In [None]:
# confusion matrix and classification report
labels = ['negative', 'positive']
print(classification_report(y_test, predictions))
# Pass the label order explicitly so the matrix rows/columns are guaranteed
# to line up with the hand-written DataFrame index and column headers.
pd.DataFrame(confusion_matrix(y_test, predictions, labels=labels),
             index=labels, columns=labels)