In [125]:
import tensorflow as tf

print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [126]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

import re
import pandas as pd
import pickle
import nltk
import numpy as np


In [127]:

nltk.download('stopwords')
nltk.download('omw-1.4')

stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Enes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Enes\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [128]:

# importing the dataset
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "tweet"]
df = pd.read_csv('./training.1600000.processed.noemoticon.csv', delimiter=',', encoding=DATASET_ENCODING , names=DATASET_COLUMNS)

# df = pd.read_csv('./IMDB Dataset.csv', delimiter=',',
                #  encoding=DATASET_ENCODING)
# df = pd.read_csv('./Corona_NLP_train.csv',
# delimiter=',', encoding=DATASET_ENCODING)

# dataset_dir = 'imdb'
# dataset_dir = 'coronaNLP'
dataset_dir = 'sentiment140'

model_dir = './models/'+dataset_dir
vector_dir = './vectors/'+dataset_dir


In [129]:

# removing the unnecessary columns and duplicates
# dataset = dataset[['OriginalTweet','Sentiment']]
df = df[['tweet', 'sentiment']]
# df = df[['review', 'sentiment']]

df.drop_duplicates()

df.head()


Unnamed: 0,tweet,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [130]:
# Preprocessing
from nltk.corpus import stopwords
import re
import string

RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)


def strip_emoji(text):
    return RE_EMOJI.sub(r'', text)

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)


def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)


def remove_mention(text):
    return re.sub("@[A-Za-z0-9]+", "", text)


def stem_tweets(tweet):
    tokens = tweet.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)


def lemmatize_tweets(tweet):
    tokens = tweet.split()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# remove stopwords


stop = set(stopwords.words("english"))


def remove_stopwords(text):
    stop = set(stopwords.words("english"))

    filtered_words = [word.lower()
                      for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)


def preprocess_tweets(tweet):
    tweet = strip_emoji(tweet)
    tweet = remove_mention(tweet)
    tweet = remove_URL(tweet)
    tweet = remove_punct(tweet)
    tweet = stem_tweets(tweet)
    # tweet = lemmatize_tweets(tweet)
    tweet = remove_stopwords(tweet)
    return tweet


In [131]:
# df = df.head(5)

df.head()


Unnamed: 0,tweet,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [132]:
df.iloc[:, 0] = df.iloc[:, 0].apply(preprocess_tweets)

X = df.iloc[:, 0]

y = df.iloc[:, 1]

df.head()


Unnamed: 0,tweet,sentiment
0,awww bummer shoulda got david carr third day,0
1,upset cant updat facebook text might cri resul...,0
2,dive mani time ball manag save 50 rest go bound,0
3,whole bodi feel itchi like fire,0
4,behav im mad whi becaus cant see,0


In [133]:
from collections import Counter

# Count unique words


def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count


counter = counter_word(df.iloc[:, 0])

len(counter)


402749

In [134]:
num_unique_words = len(counter)
counter


Counter({'awww': 5018,
         'bummer': 1458,
         'shoulda': 348,
         'got': 61108,
         'david': 2457,
         'carr': 90,
         'third': 828,
         'day': 101324,
         'upset': 2970,
         'cant': 62609,
         'updat': 9039,
         'facebook': 4315,
         'text': 5475,
         'might': 9603,
         'cri': 8387,
         'result': 1657,
         'school': 20367,
         'today': 66099,
         'also': 10266,
         'blah': 1469,
         'dive': 267,
         'mani': 8989,
         'time': 64340,
         'ball': 1926,
         'manag': 2497,
         'save': 3677,
         '50': 1279,
         'rest': 5775,
         'go': 137027,
         'bound': 399,
         'whole': 5974,
         'bodi': 2657,
         'feel': 50654,
         'itchi': 453,
         'like': 82963,
         'fire': 1892,
         'behav': 226,
         'im': 177562,
         'mad': 4115,
         'whi': 27516,
         'becaus': 13464,
         'see': 50796,
         'c

In [135]:
counter.most_common(5)


[('im', 177562),
 ('go', 137027),
 ('get', 109958),
 ('day', 101324),
 ('good', 90572)]

In [136]:
# Split dataset into training and validation set
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
train_sentences = train_df.iloc[:, 0].to_numpy()
train_labels = train_df.iloc[:, 1].to_numpy()
val_sentences = val_df.iloc[:, 0].to_numpy()
val_labels = val_df.iloc[:, 1].to_numpy()


In [137]:
def convert_sentiment_to_binary(sentiment):
    if dataset_dir=='sentiment140':
        return 1 if sentiment == 4 else 0
    return 1 if sentiment == 'positive' else 0


convert_sentiment_to_int_v = np.vectorize(convert_sentiment_to_binary)


In [138]:
train_labels = convert_sentiment_to_int_v(train_labels)

val_labels = convert_sentiment_to_int_v(val_labels)

train_labels, val_labels


(array([0, 0, 0, ..., 1, 1, 1]), array([1, 1, 1, ..., 1, 1, 1]))

In [139]:
train_sentences.shape, val_sentences.shape


((1280000,), (320000,))

In [140]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_sentences)  # fit only to training


In [141]:
# each word has unique index
word_index = tokenizer.word_index
word_index


{'<OOV>': 1,
 'im': 2,
 'go': 3,
 'get': 4,
 'day': 5,
 'work': 6,
 'like': 7,
 'good': 8,
 'dont': 9,
 'love': 10,
 'cant': 11,
 'today': 12,
 'miss': 13,
 'time': 14,
 'want': 15,
 'got': 16,
 'back': 17,
 'feel': 18,
 'one': 19,
 'know': 20,
 'realli': 21,
 'lol': 22,
 'think': 23,
 'u': 24,
 'see': 25,
 'thank': 26,
 'still': 27,
 'need': 28,
 'well': 29,
 'night': 30,
 'hope': 31,
 'make': 32,
 'home': 33,
 'watch': 34,
 'amp': 35,
 '2': 36,
 'oh': 37,
 'new': 38,
 'sad': 39,
 'come': 40,
 'last': 41,
 'look': 42,
 'wish': 43,
 'much': 44,
 'tomorrow': 45,
 'sleep': 46,
 'twitter': 47,
 'morn': 48,
 'bad': 49,
 'ill': 50,
 'wait': 51,
 'whi': 52,
 'didnt': 53,
 'great': 54,
 'onli': 55,
 'sorri': 56,
 'week': 57,
 'tri': 58,
 'right': 59,
 'would': 60,
 'hate': 61,
 'haha': 62,
 'veri': 63,
 'thing': 64,
 'fun': 65,
 'tonight': 66,
 'friend': 67,
 'though': 68,
 'say': 69,
 'happi': 70,
 'take': 71,
 'way': 72,
 'gonna': 73,
 'ive': 74,
 'even': 75,
 'follow': 76,
 'could': 77,
 '

In [142]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)


In [143]:

print(train_sentences[0])
print(train_sequences[0])


awww bummer shoulda got david carr third day
[390, 942, 2848, 16, 710, 8228, 1654, 5]


In [144]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
# Not needed for padding as it picks the right length
max_length = 255

train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(
    val_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, val_padded.shape


((1280000, 255), (320000, 255))

In [145]:
train_padded[3]


array([ 338,  637,   18, 2256,    7,  891,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [146]:
print(train_sentences[3])
print(train_sequences[3])
print(train_padded[3])


whole bodi feel itchi like fire
[338, 637, 18, 2256, 7, 891]
[ 338  637   18 2256    7  891    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 

In [147]:
# Check reversing the indices

# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])
reverse_word_index


{1: '<OOV>',
 2: 'im',
 3: 'go',
 4: 'get',
 5: 'day',
 6: 'work',
 7: 'like',
 8: 'good',
 9: 'dont',
 10: 'love',
 11: 'cant',
 12: 'today',
 13: 'miss',
 14: 'time',
 15: 'want',
 16: 'got',
 17: 'back',
 18: 'feel',
 19: 'one',
 20: 'know',
 21: 'realli',
 22: 'lol',
 23: 'think',
 24: 'u',
 25: 'see',
 26: 'thank',
 27: 'still',
 28: 'need',
 29: 'well',
 30: 'night',
 31: 'hope',
 32: 'make',
 33: 'home',
 34: 'watch',
 35: 'amp',
 36: '2',
 37: 'oh',
 38: 'new',
 39: 'sad',
 40: 'come',
 41: 'last',
 42: 'look',
 43: 'wish',
 44: 'much',
 45: 'tomorrow',
 46: 'sleep',
 47: 'twitter',
 48: 'morn',
 49: 'bad',
 50: 'ill',
 51: 'wait',
 52: 'whi',
 53: 'didnt',
 54: 'great',
 55: 'onli',
 56: 'sorri',
 57: 'week',
 58: 'tri',
 59: 'right',
 60: 'would',
 61: 'hate',
 62: 'haha',
 63: 'veri',
 64: 'thing',
 65: 'fun',
 66: 'tonight',
 67: 'friend',
 68: 'though',
 69: 'say',
 70: 'happi',
 71: 'take',
 72: 'way',
 73: 'gonna',
 74: 'ive',
 75: 'even',
 76: 'follow',
 77: 'could',
 7

In [148]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])


decoded_text = decode(train_sequences[3])

print(train_sequences[3])
print(decoded_text)


[338, 637, 18, 2256, 7, 891]
whole bodi feel itchi like fire


In [149]:
# Create LSTM model
from tensorflow.keras import layers
import keras

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
# model.add(layers.Embedding(num_unique_words, 32))
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 255, 32)           12887968  
                                                                 
 lstm_3 (LSTM)               (None, 64)                24832     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 12,912,865
Trainable params: 12,912,865
Non-trainable params: 0
_________________________________________________________________


In [150]:
from tensorflow import keras

loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)


  super(Adam, self).__init__(name, **kwargs)


In [151]:
model.fit(train_padded, train_labels, epochs=2,
          validation_data=(val_padded, val_labels), verbose=1)


Epoch 1/2

KeyboardInterrupt: 

In [None]:
predictions = model.predict(train_padded)
predictions


In [None]:

predictions = [1 if p > 0.5 else 0 for p in predictions]
print(train_sentences[:3])

print(train_labels[:3])
print(predictions[:3])


In [None]:

val_loss, val_acc = model.evaluate(val_padded, val_labels)
val_loss, val_acc


In [None]:
model.save(f'{model_dir}/tensorflow_NN_model_{val_acc}')
