# Text Classification

In [None]:
# Imports

from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

## The Problem: Large Movie Review Dataset
### Classify movie reviews from IMDB into positive or negative sentiment.
### Download the dataset [here](https://drive.google.com/drive/u/0/folders/1hYw0TQbwcM2YWEiKN-2c_kHCPtHO_TMC)

In [None]:
# Importing & preprocessing the dataset

# One fixed seed drives both the directory shuffle and the validation split,
# so the row shown by `dfTrain.loc[0, ...]` and the held-out subset are the
# same on every Restart & Run All.
SEED = 42

# Labels are inferred from the sub-directory names under each folder.
# NOTE(review): the stock aclImdb archive also ships an `unsup` folder under
# train/ -- confirm it has been removed, otherwise a third class would appear.
train_ds = text_dataset_from_directory('aclImdb/train', seed=SEED)
test_ds = text_dataset_from_directory('aclImdb/test', seed=SEED)

# Flatten the batched datasets into one (text, label) row per review.
dfTrain = pd.DataFrame(train_ds.unbatch().as_numpy_iterator(), columns=['text', 'label'])
dfTest = pd.DataFrame(test_ds.unbatch().as_numpy_iterator(), columns=['text', 'label'])

# Keep a stratified 25% slice of the test reviews (`xts`) for validation;
# the remaining 75% is discarded.
_, xts = train_test_split(
    dfTest,
    stratify=dfTest['label'],
    test_size=0.25,
    random_state=SEED,
)

In [None]:
# Look at a sample movie review

sample_review = dfTrain.loc[0, 'text']
# Reviews read through the TF dataset iterator arrive as raw bytes; decode
# them so the text is printed as-is rather than as a b'...' repr with escapes.
if isinstance(sample_review, bytes):
    sample_review = sample_review.decode('utf-8', errors='replace')
print(sample_review)

In [None]:
# Label that belongs to the sample review (row 0 of the training frame)

dfTrain.at[0, 'label']

In [None]:
# Truncate long review text to 100 characters per cell when frames are displayed,
# then preview the first training rows.
pd.set_option('display.max_colwidth', 100)
dfTrain.head()

In [None]:
# Preview the first five rows of the test frame.
dfTest.head(5)

In [None]:
# Feature Extraction - Text to TFIDF

# English stop words are dropped. float32 is requested because `.toarray()`
# below materialises a dense (n_reviews x vocabulary_size) matrix; float32
# halves its memory footprint compared with the float64 default.
vect = TfidfVectorizer(stop_words='english', dtype=np.float32)

# The vocabulary and IDF weights are learned from the training reviews only;
# the validation reviews are merely projected onto that vocabulary.
XTrain = vect.fit_transform(dfTrain['text']).toarray()
XTest = vect.transform(xts['text']).toarray()

In [None]:
# (number of training reviews, number of TF-IDF features)
np.shape(XTrain)

In [None]:
# Assemble and compile the neural network

# Fully connected stack: TF-IDF vector -> 128 -> 128 -> 64 -> 1.
# The single sigmoid unit outputs a value in (0, 1), matching the
# binary cross-entropy loss used below.
model = Sequential([
    Dense(128, input_shape=(XTrain.shape[1],), activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(
    loss='binary_crossentropy',
    optimizer=SGD(learning_rate=1e-3),
    metrics=['accuracy'],
)

In [None]:
# Fit the network for 50 epochs in mini-batches of 64, scoring the held-out
# `xts` reviews after every epoch; the per-epoch metrics are kept in `history`.

history = model.fit(
    x=XTrain,
    y=dfTrain['label'],
    validation_data=(XTest, xts['label']),
    epochs=50,
    batch_size=64,
)

In [None]:
# Visualize the learning curve

# Explicit figure/axes so the plot carries its own title and axis labels and
# can be read without the surrounding cells.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Train Accuracy')
# 'val_accuracy' is measured on the held-out slice passed as validation_data.
ax.plot(history.history['val_accuracy'], label='Test Accuracy')
ax.set(title='Learning Curve', xlabel='Epoch', ylabel='Accuracy')
ax.legend()
# Ending with show() keeps the Legend object's repr out of the cell output.
plt.show()