In [91]:
from sklearn.linear_model import LogisticRegression
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import re
import pandas as pd
import pickle
import nltk
import numpy as np


In [92]:

nltk.download('stopwords')

stemmer = SnowballStemmer("english")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Enes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:

# importing the dataset
DATASET_ENCODING = "ISO-8859-1"
# DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "tweet"]
# df = pd.read_csv('./training.1600000.processed.noemoticon.csv', delimiter=',', encoding=DATASET_ENCODING , names=DATASET_COLUMNS)

df = pd.read_csv('./IMDB Dataset.csv', delimiter=',',
                      encoding=DATASET_ENCODING)
# df = pd.read_csv('./Corona_NLP_train.csv',
  # delimiter=',', encoding=DATASET_ENCODING)
dataset_dir = 'imdb'
# dataset_dir = 'coronaNLP'
# dataset_dir = 'sentiment140'
model_dir = './models/'+dataset_dir
vector_dir = './vectors/'+dataset_dir

# removing the unnecessary columns and duplicates
# dataset = dataset[['OriginalTweet','Sentiment']]
# df = df[['tweet', 'sentiment']]
df = df[['review', 'sentiment']]
df.drop_duplicates()

df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [94]:
# Preprocessing
from nltk.corpus import stopwords
import re
import string


def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)


def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)


def remove_mention(text):
    return re.sub("@[A-Za-z0-9]+", "", text)


def stem_tweets(tweet):
    tokens = tweet.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)


# remove stopwords


stop = set(stopwords.words("english"))


def remove_stopwords(text):
    stop = set(stopwords.words("english"))
    
    filtered_words = [word.lower()
                      for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

def preprocess_tweets(tweet):
    tweet = remove_mention(tweet)
    tweet = remove_URL(tweet)
    tweet = remove_punct(tweet)
    tweet = stem_tweets(tweet)
    tweet = remove_stopwords(tweet)
    return tweet


In [95]:
# df = df.head(5)

df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [96]:

# df['tweet'] = df.tweet.apply(preprocess_tweets)
df['review'] = df.review.apply(preprocess_tweets)
# df['OriginalTweet'] = df.OriginalTweet.apply(preprocess_tweets)

X = df['review']
# X = df['tweet']
# X = df['OriginalTweet']
y = df['sentiment']
# y = df['Sentiment']
df.head()


Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,positive
1,wonder littl product br br film techniqu veri ...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic famili littl boy jake think zombi closet...,negative
4,petter mattei love time money visual stun film...,positive


In [97]:
from collections import Counter

# Count unique words


def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count


# counter = counter_word(df.tweet)
# counter = counter_word(df.OriginalTweet)
counter = counter_word(df.review)

len(counter)


143417

In [98]:
num_unique_words = len(counter)
counter

Counter({'one': 53301,
         'review': 4199,
         'mention': 2984,
         'watch': 27276,
         '1': 2524,
         'oz': 263,
         'episod': 4846,
         'youll': 2628,
         'hook': 578,
         'right': 6793,
         'exact': 2352,
         'happen': 6927,
         'mebr': 451,
         'br': 113794,
         'first': 17322,
         'thing': 16117,
         'struck': 267,
         'brutal': 916,
         'unflinch': 39,
         'scene': 20699,
         'violenc': 2006,
         'set': 7545,
         'word': 3557,
         'go': 17717,
         'trust': 711,
         'show': 19405,
         'faint': 134,
         'heart': 2624,
         'timid': 55,
         'pull': 1845,
         'punch': 527,
         'regard': 939,
         'drug': 1638,
         'sex': 3234,
         'hardcor': 252,
         'classic': 4135,
         'use': 10065,
         'wordbr': 15,
         'call': 5433,
         'nicknam': 83,
         'given': 3540,
         'oswald': 32,
         

In [99]:
counter.most_common(5)


[('br', 113794),
 ('movi', 98955),
 ('film', 92055),
 ('one', 53301),
 ('like', 43986)]

In [100]:
# Split dataset into training and validation set
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
# train_sentences = train_df.tweet.to_numpy()
# train_labels = train_df.sentiment.to_numpy()
# val_sentences = val_df.tweet.to_numpy()
# val_labels = val_df.sentiment.to_numpy()

# train_sentences = train_df.OriginalTweet.to_numpy()
# train_labels = train_df.Sentiment.to_numpy()
# val_sentences = val_df.OriginalTweet.to_numpy()
# val_labels = val_df.Sentiment.to_numpy()

train_sentences = train_df.review.to_numpy()
train_labels = train_df.sentiment.to_numpy()
val_sentences = val_df.review.to_numpy()
val_labels = val_df.sentiment.to_numpy()


In [101]:
def convert_sentiment_to_int(sentiment):
    return 1 if sentiment == 'positive' else 0


convert_sentiment_to_int_v = np.vectorize(convert_sentiment_to_int)


In [102]:
train_labels = convert_sentiment_to_int_v(train_labels)

val_labels = convert_sentiment_to_int_v(val_labels)

train_labels, val_labels

(array([1, 1, 1, ..., 1, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]))

In [103]:
train_sentences.shape, val_sentences.shape


((40000,), (10000,))

In [104]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)  # fit only to training


In [105]:
# each word has unique index
word_index = tokenizer.word_index
word_index


{'br': 1,
 'movi': 2,
 'film': 3,
 'one': 4,
 'like': 5,
 'time': 6,
 'good': 7,
 'make': 8,
 'get': 9,
 'see': 10,
 'veri': 11,
 'charact': 12,
 'watch': 13,
 'even': 14,
 'stori': 15,
 'would': 16,
 'onli': 17,
 'realli': 18,
 'scene': 19,
 'show': 20,
 'well': 21,
 'look': 22,
 'bad': 23,
 'much': 24,
 'great': 25,
 'end': 26,
 'peopl': 27,
 'also': 28,
 'love': 29,
 'go': 30,
 'becaus': 31,
 'think': 32,
 'first': 33,
 'play': 34,
 'act': 35,
 'dont': 36,
 'way': 37,
 'thing': 38,
 'made': 39,
 'could': 40,
 'ani': 41,
 'know': 42,
 'seem': 43,
 'say': 44,
 'mani': 45,
 'plot': 46,
 'work': 47,
 'two': 48,
 'actor': 49,
 'seen': 50,
 'want': 51,
 'come': 52,
 'take': 53,
 'never': 54,
 'tri': 55,
 'best': 56,
 'littl': 57,
 'year': 58,
 'life': 59,
 'ever': 60,
 'doe': 61,
 'give': 62,
 'better': 63,
 'man': 64,
 'still': 65,
 'find': 66,
 'perform': 67,
 'feel': 68,
 'part': 69,
 'whi': 70,
 'use': 71,
 'actual': 72,
 'someth': 73,
 'lot': 74,
 'interest': 75,
 'im': 76,
 'back': 

In [106]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)


In [107]:

print(train_sentences[0])
print(train_sequences[0])


one review mention watch 1 oz episod youll hook right exact happen mebr br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use wordbr br call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awaybr br would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti surreal couldnt say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may be

In [108]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 175

train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(
    val_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, val_padded.shape


((40000, 175), (10000, 175))

In [109]:
train_padded[3]


array([  394,   146,    57,   254,  2810,    32,   566,  3223,   608,
         288,  1332,     1,     2,  6506,  1614,  1228,   763,  2810,
         315,   148,  5437,   112, 28305,     1,   533,    33,    30,
           8,     3,   142,   315,   620,   389,   389,     2,  1587,
         608,  2600,  2013,     5,    80,    59,  2810,  3223,   264,
        1014,     3,   162,    10,  9011,   518,     2,   253,    13,
         389,  3056,   620, 28306,     1,   396,   248,    21,    34,
         608,  3972,   600,   165,  2810,   952,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [110]:
print(train_sentences[3])
print(train_sequences[3])
print(train_padded[3])


basic famili littl boy jake think zombi closet parent fight timebr br movi slower soap opera sudden jake decid becom rambo kill zombiebr br ok first go make film must decid thriller drama drama movi watchabl parent divorc argu like real life jake closet total ruin film expect see boogeyman similar movi instead watch drama meaningless thriller spotsbr br 3 10 well play parent descent dialog shot jake ignor
[394, 146, 57, 254, 2810, 32, 566, 3223, 608, 288, 1332, 1, 2, 6506, 1614, 1228, 763, 2810, 315, 148, 5437, 112, 28305, 1, 533, 33, 30, 8, 3, 142, 315, 620, 389, 389, 2, 1587, 608, 2600, 2013, 5, 80, 59, 2810, 3223, 264, 1014, 3, 162, 10, 9011, 518, 2, 253, 13, 389, 3056, 620, 28306, 1, 396, 248, 21, 34, 608, 3972, 600, 165, 2810, 952]
[  394   146    57   254  2810    32   566  3223   608   288  1332     1
     2  6506  1614  1228   763  2810   315   148  5437   112 28305     1
   533    33    30     8     3   142   315   620   389   389     2  1587
   608  2600  2013     5    80    

In [111]:
# Check reversing the indices

# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])
reverse_word_index


{1: 'br',
 2: 'movi',
 3: 'film',
 4: 'one',
 5: 'like',
 6: 'time',
 7: 'good',
 8: 'make',
 9: 'get',
 10: 'see',
 11: 'veri',
 12: 'charact',
 13: 'watch',
 14: 'even',
 15: 'stori',
 16: 'would',
 17: 'onli',
 18: 'realli',
 19: 'scene',
 20: 'show',
 21: 'well',
 22: 'look',
 23: 'bad',
 24: 'much',
 25: 'great',
 26: 'end',
 27: 'peopl',
 28: 'also',
 29: 'love',
 30: 'go',
 31: 'becaus',
 32: 'think',
 33: 'first',
 34: 'play',
 35: 'act',
 36: 'dont',
 37: 'way',
 38: 'thing',
 39: 'made',
 40: 'could',
 41: 'ani',
 42: 'know',
 43: 'seem',
 44: 'say',
 45: 'mani',
 46: 'plot',
 47: 'work',
 48: 'two',
 49: 'actor',
 50: 'seen',
 51: 'want',
 52: 'come',
 53: 'take',
 54: 'never',
 55: 'tri',
 56: 'best',
 57: 'littl',
 58: 'year',
 59: 'life',
 60: 'ever',
 61: 'doe',
 62: 'give',
 63: 'better',
 64: 'man',
 65: 'still',
 66: 'find',
 67: 'perform',
 68: 'feel',
 69: 'part',
 70: 'whi',
 71: 'use',
 72: 'actual',
 73: 'someth',
 74: 'lot',
 75: 'interest',
 76: 'im',
 77: 'bac

In [112]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])


decoded_text = decode(train_sequences[3])

print(train_sequences[3])
print(decoded_text)


[394, 146, 57, 254, 2810, 32, 566, 3223, 608, 288, 1332, 1, 2, 6506, 1614, 1228, 763, 2810, 315, 148, 5437, 112, 28305, 1, 533, 33, 30, 8, 3, 142, 315, 620, 389, 389, 2, 1587, 608, 2600, 2013, 5, 80, 59, 2810, 3223, 264, 1014, 3, 162, 10, 9011, 518, 2, 253, 13, 389, 3056, 620, 28306, 1, 396, 248, 21, 34, 608, 3972, 600, 165, 2810, 952]
basic famili littl boy jake think zombi closet parent fight timebr br movi slower soap opera sudden jake decid becom rambo kill zombiebr br ok first go make film must decid thriller drama drama movi watchabl parent divorc argu like real life jake closet total ruin film expect see boogeyman similar movi instead watch drama meaningless thriller spotsbr br 3 10 well play parent descent dialog shot jake ignor


In [113]:
# Create LSTM model
from tensorflow.keras import layers
import keras

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 175, 32)           4589344   
                                                                 
 lstm_3 (LSTM)               (None, 64)                24832     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 4,614,241
Trainable params: 4,614,241
Non-trainable params: 0
_________________________________________________________________


In [114]:
from tensorflow import keras

loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)


  super(Adam, self).__init__(name, **kwargs)


In [116]:
model.fit(train_padded, train_labels, epochs=3,
          validation_data=(val_padded, val_labels), verbose=1)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x21b4ece7a58>

In [117]:
predictions = model.predict(train_padded)
predictions

array([[0.6217112 ],
       [0.8129069 ],
       [0.812907  ],
       ...,
       [0.8129069 ],
       [0.27922726],
       [0.27929798]], dtype=float32)

In [118]:

predictions = [1 if p > 0.5 else 0 for p in predictions]
print(train_sentences[:3])

print(train_labels[:3])
print(predictions[:3])


['one review mention watch 1 oz episod youll hook right exact happen mebr br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use wordbr br call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awaybr br would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti surreal couldnt say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may 

In [119]:

val_loss, val_acc = model.evaluate(val_padded, val_labels)
val_loss, val_acc




(0.5756871104240417, 0.7447999715805054)

In [120]:
model.save(f'{model_dir}/MNB_model_{val_acc}')




INFO:tensorflow:Assets written to: ./models/imdb/MNB_model_0.7447999715805054\assets


INFO:tensorflow:Assets written to: ./models/imdb/MNB_model_0.7447999715805054\assets
