In [None]:
from tensorflow.python.client import device_lib
import tensorflow as tf

# Show the GPU devices TensorFlow can see; an empty list means none were detected.
print(tf.config.list_physical_devices('GPU'))

In [None]:
from sklearn.linear_model import LogisticRegression
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import re
import pandas as pd
import pickle
import nltk
import numpy as np


In [None]:

# Fetch the NLTK stop-word corpus (read later through nltk.corpus.stopwords).
nltk.download('stopwords')

# English Snowball stemmer, shared by stem_tweets().
stemmer = SnowballStemmer("english")


In [None]:

# importing the dataset
# Exactly one dataset is active at a time; the commented lines are the
# alternative datasets this notebook has been run against.
DATASET_ENCODING = "ISO-8859-1"
# DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "tweet"]
# df = pd.read_csv('./training.1600000.processed.noemoticon.csv', delimiter=',', encoding=DATASET_ENCODING , names=DATASET_COLUMNS)

df = pd.read_csv('./IMDB Dataset.csv', delimiter=',',
                 encoding=DATASET_ENCODING)
# df = pd.read_csv('./Corona_NLP_train.csv',
# delimiter=',', encoding=DATASET_ENCODING)

# Sub-directory name used to keep per-dataset artefacts apart.
dataset_dir = 'imdb'
# dataset_dir = 'coronaNLP'
# dataset_dir = 'sentiment140'
model_dir = './models/'+dataset_dir
vector_dir = './vectors/'+dataset_dir

# removing the unnecessary columns and duplicates
# dataset = dataset[['OriginalTweet','Sentiment']]
# df = df[['tweet', 'sentiment']]
df = df[['review', 'sentiment']]
# drop_duplicates() returns a new frame, so the result has to be assigned
# back; calling it on its own leaves the duplicates in df. The index is reset
# so the frame keeps a contiguous RangeIndex after rows are dropped.
df = df.drop_duplicates().reset_index(drop=True)

df.head()


In [None]:
# Preprocessing
from nltk.corpus import stopwords
import re
import string


def remove_URL(text):
    """Return `text` with every http(s):// or www. URL deleted.

    A URL is matched up to the next whitespace character; the surrounding
    whitespace itself is left untouched.
    """
    return re.sub(r"https?://\S+|www\.\S+", r"", text)


def remove_punct(text):
    """Return `text` without any character listed in string.punctuation.

    Only the ASCII punctuation set is removed; whitespace and all other
    characters are kept as they are.
    """
    punct_chars = frozenset(string.punctuation)
    return "".join(ch for ch in text if ch not in punct_chars)


def remove_mention(text):
    """Return `text` with every @mention removed.

    A mention is '@' followed by one or more ASCII letters, digits or
    underscores. The underscore is part of the character class so handles such
    as '@john_doe' are removed whole instead of leaving '_doe' behind. A lone
    '@' that is not followed by such a character is kept.
    """
    return re.sub(r"@[A-Za-z0-9_]+", "", text)


def stem_tweets(tweet):
    """Stem every whitespace-separated token of `tweet` with the shared stemmer.

    The stemmed tokens are joined back with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    return ' '.join(map(stemmer.stem, tweet.split()))


# remove stopwords
# English stop-word set, built once from the NLTK stopwords corpus.


stop = set(stopwords.words("english"))


def remove_stopwords(text, stop_words=None):
    """Lowercase `text` and drop its stop words.

    Args:
        text: String to filter; it is split on whitespace.
        stop_words: Optional collection of lowercase words to remove.
            Defaults to the module-level English `stop` set.

    Returns:
        The remaining lowercased tokens joined by single spaces.
    """
    if stop_words is None:
        # Reuse the set built once at module level instead of re-reading the
        # NLTK corpus on every call (this runs once per row of the dataset).
        stop_words = stop

    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in stop_words)


def preprocess_tweets(tweet):
    """Clean one raw text so it can be tokenised.

    Steps, in order: strip @mentions, strip URLs, strip ASCII punctuation,
    lowercase and drop English stop words, then stem the remaining tokens.

    Stop words are removed before stemming on purpose. The stop-word list
    holds unstemmed forms, so stemming first turns words such as "very" or
    "because" into "veri" / "becaus", which no longer match the list and slip
    through the filter.

    Args:
        tweet: Raw text of a single document.

    Returns:
        The cleaned, lowercased, stemmed text with tokens separated by spaces.
    """
    tweet = remove_mention(tweet)
    tweet = remove_URL(tweet)
    tweet = remove_punct(tweet)
    tweet = remove_stopwords(tweet)
    tweet = stem_tweets(tweet)
    return tweet


In [None]:
# Debug aid (disabled): would truncate df to its first 5 rows.
# df = df.head(5)

# Preview the frame before the text is preprocessed.
df.head()


In [None]:

# Run preprocess_tweets over every document of the active dataset.
# NOTE: the cleaned text overwrites the raw column, so re-running this cell
# preprocesses text that has already been cleaned.
# df['tweet'] = df.tweet.apply(preprocess_tweets)
df['review'] = df.review.apply(preprocess_tweets)
# df['OriginalTweet'] = df.OriginalTweet.apply(preprocess_tweets)

# Text and label columns of the active dataset.
X = df['review']
# X = df['tweet']
# X = df['OriginalTweet']
y = df['sentiment']
# y = df['Sentiment']
# Preview the frame after preprocessing.
df.head()


In [None]:
from collections import Counter

# Count unique words


def counter_word(text_col):
    """Count how often each whitespace-separated token occurs in a text column.

    Args:
        text_col: Object exposing `.values` as an iterable of strings
            (called below with a DataFrame column).

    Returns:
        collections.Counter mapping token -> number of occurrences.
    """
    return Counter(
        token
        for document in text_col.values
        for token in document.split()
    )


# Token frequencies over the whole (already preprocessed) text column of the
# active dataset; the commented lines are the other datasets' columns.
# counter = counter_word(df.tweet)
# counter = counter_word(df.OriginalTweet)
counter = counter_word(df.review)

# Number of distinct tokens.
len(counter)


In [None]:
# Vocabulary size: distinct tokens counted over the full frame (train + validation).
num_unique_words = len(counter)
# Displays the entire Counter, which can be a very long output.
counter


In [None]:
# The five most frequent tokens with their counts.
counter.most_common(5)


In [None]:
# Split dataset into training and validation set
# Sequential 80/20 split: the first 80% of rows are used for training and the
# remainder for validation. No shuffling happens here.
# NOTE(review): confirm the CSV rows are not ordered by label.
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
# (the commented groups below are the column names of the other datasets)
# train_sentences = train_df.tweet.to_numpy()
# train_labels = train_df.sentiment.to_numpy()
# val_sentences = val_df.tweet.to_numpy()
# val_labels = val_df.sentiment.to_numpy()

# train_sentences = train_df.OriginalTweet.to_numpy()
# train_labels = train_df.Sentiment.to_numpy()
# val_sentences = val_df.OriginalTweet.to_numpy()
# val_labels = val_df.Sentiment.to_numpy()

# NumPy arrays of cleaned texts and of the raw sentiment column values.
train_sentences = train_df.review.to_numpy()
train_labels = train_df.sentiment.to_numpy()
val_sentences = val_df.review.to_numpy()
val_labels = val_df.sentiment.to_numpy()


In [None]:
def convert_sentiment_to_int(sentiment):
    """Encode a sentiment label as an integer.

    Returns 1 when `sentiment` equals the string 'positive' (exact,
    case-sensitive match) and 0 for any other value.
    """
    if sentiment == 'positive':
        return 1
    return 0


# Element-wise version for applying the encoder to whole NumPy arrays.
convert_sentiment_to_int_v = np.vectorize(convert_sentiment_to_int)


In [None]:
# Encode the string labels as 0/1 integers.
# The labels are derived from the split frames rather than from
# train_labels/val_labels themselves, so this cell is idempotent: re-encoding
# already-encoded integers would silently map every label to 0.
train_labels = convert_sentiment_to_int_v(train_df.sentiment.to_numpy())

val_labels = convert_sentiment_to_int_v(val_df.sentiment.to_numpy())

train_labels, val_labels


In [None]:
# Sanity check: number of training and validation texts.
train_sentences.shape, val_sentences.shape


In [None]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
# num_words caps how many of the most frequent words are kept when texts are
# converted to sequences; num_unique_words was counted over the full frame.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)  # fit only to training


In [None]:
# each word has unique index
# word -> integer index mapping learned by the tokenizer from the training texts.
word_index = tokenizer.word_index
# Displays the whole mapping, which can be a very long output.
word_index


In [None]:
# Convert each cleaned text into its list of integer word indices using the
# tokenizer fitted on the training texts.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)


In [None]:

# Compare the first training text with its integer-encoded form.
print(train_sentences[0])
print(train_sequences[0])


In [None]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
# NOTE(review): 175 is a hand-picked value; how it was chosen is not shown here.
max_length = 175

# padding="post" appends zeros to short sequences; truncating="post" cuts
# sequences longer than max_length at the end.
train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(
    val_sequences, maxlen=max_length, padding="post", truncating="post")
# Both arrays should have max_length columns.
train_padded.shape, val_padded.shape


In [None]:
# Inspect one padded training sequence.
train_padded[3]


In [None]:
# Same sample at each stage: cleaned text, integer sequence, padded sequence.
print(train_sentences[3])
print(train_sequences[3])
print(train_padded[3])


In [None]:
# Check reversing the indices

# Invert the tokenizer mapping: index -> word instead of word -> index.
reverse_word_index = {idx: word for word, idx in word_index.items()}
reverse_word_index


In [None]:
def decode(sequence):
    """Turn a sequence of word indices back into a space-separated string.

    Indices that are missing from reverse_word_index are rendered as "?".
    """
    words = (reverse_word_index.get(idx, "?") for idx in sequence)
    return " ".join(words)


# Round-trip check: decode an encoded training sample back into words.
decoded_text = decode(train_sequences[3])

print(train_sequences[3])
print(decoded_text)


In [None]:
# Create LSTM model
from tensorflow.keras import layers
# Take `keras` from TensorFlow, matching the `layers` import above. Mixing the
# standalone `keras` package with `tensorflow.keras` layers can fail when the
# two installed versions differ.
from tensorflow import keras

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
# Vocabulary of num_unique_words entries, each embedded as a 32-dimensional vector.
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


# 64-unit LSTM over the embedded sequence, with 10% input dropout.
model.add(layers.LSTM(64, dropout=0.1))
# Single sigmoid unit: output is the probability of the positive class.
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()


In [None]:
from tensorflow import keras

# The model ends in a sigmoid, so the loss receives probabilities rather than logits.
loss = keras.losses.BinaryCrossentropy(from_logits=False)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)


In [None]:
# Train for 3 epochs, reporting loss/accuracy on the validation split after each epoch.
model.fit(train_padded, train_labels, epochs=3,
          validation_data=(val_padded, val_labels), verbose=1)


In [None]:
# Model outputs for the training inputs (sigmoid activations, one per sample).
predictions = model.predict(train_padded)
predictions


In [None]:

# Threshold the model outputs at 0.5 to get hard 0/1 labels.
# NOTE: this rebinds `predictions` from the model's raw output to a Python list.
predictions = [1 if p > 0.5 else 0 for p in predictions]
# Eyeball the first three samples: text, true label, predicted label.
print(train_sentences[:3])

print(train_labels[:3])
print(predictions[:3])


In [None]:

# Loss and accuracy on the held-out validation split.
val_loss, val_acc = model.evaluate(val_padded, val_labels)
val_loss, val_acc


In [None]:
# Persist the trained model under model_dir, tagging the path with the validation accuracy.
# NOTE(review): the "MNB_model" prefix does not match the LSTM trained in this
# notebook; presumably carried over from another notebook. Confirm nothing
# loads by this name before renaming.
model.save(f'{model_dir}/MNB_model_{val_acc}')
