In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import string

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Bidirectional
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [2]:
data_types = {"recommendationid":"string","author":"string", 
             "language" : "string", 
             "timestamp_created":"string", "timestamp_updated":"string", 
             "voted_up":"string", "votes_up":int, "votes_funny":int, 
             "weighted_vote_score":float, "comment_count":int, 
             "steam_purchase":"string", "received_for_free":"string", 
             "written_during_early_access":"string", 
             "timestamp_dev_responded":"string", "developer_response":"string",
             "Text":"string", "target":int}

train = pd.read_csv("../input/sentiments/training_data.csv", dtype = data_types)
i = 0
for t in train["Text"]:
    if type(t)== pd._libs.missing.NAType:
        train["Text"][i] = ""
    i = i+1

In [3]:
train = train[['Text', 'target']]

In [4]:
longrev_index = [x  for x in range(len(train['Text'])) if len(train['Text'][x].split(' '))>150]
train = train.drop(longrev_index)
train.describe

<bound method NDFrame.describe of                                                      Text  target
0                              fun open world time killer       1
1       holy shit game truly live hype play hours far ...       1
2       good game polish people make great game poland...       1
3       like weak finally get around play year update ...       1
4                                     great game get over       1
...                                                   ...     ...
144341  buggy horrible graphics starwars fall order bl...       0
144342  feel like I wait enough know game gonna end ho...       0
144343                                                yes       1
144344  not let hype blind game poor state right perha...       0
144345                                         kool guess       1

[135270 rows x 2 columns]>

In [5]:
tk = TweetTokenizer(reduce_len=True)
X = train['Text'].tolist()
Y = train['target'].tolist()
data = []
for x, y in zip(X, Y):
    if y == 1:
        data.append((tk.tokenize(x), 1))
    else:
        data.append((tk.tokenize(x), 0))

In [6]:
STOP_WORDS = stopwords.words('english')
def cleaned(token):
    if token == 'u':
        return 'you'
    if token == 'r':
        return 'are'
    if token == 'some1':
        return 'someone'
    if token == 'yrs':
        return 'years'
    if token == 'hrs':
        return 'hours'
    if token == 'mins':
        return 'minutes'
    if token == 'secs':
        return 'seconds'
    if token == 'pls' or token == 'plz':
        return 'please'
    if token == '2morow':
        return 'tomorrow'
    if token == '2day':
        return 'today'
    if token == '4got' or token == '4gotten':
        return 'forget'
    if token == 'amp' or token == 'quot' or token == 'lt' or token == 'gt' or token == '½25':
        return ''
    return token

def remove_noise(tweet_tokens):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)
        token = re.sub("\'", "", token)  

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        cleaned_token = cleaned(token.lower())
        if cleaned_token not in string.punctuation and len(cleaned_token) > 2 and cleaned_token not in STOP_WORDS:
            cleaned_tokens.append(cleaned_token)
            
    return cleaned_tokens

In [7]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding="utf8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [8]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../input/sentiments/glove.6B.50d.txt')

In [9]:
cleaned_tokens_list = []
for tokens, label in data:
    cleaned_tokens_list.append((remove_noise(tokens), label))

In [10]:
unks = []
UNKS = []

def cleared(word):
    res = ""
    prev = None
    for char in word:
        if char == prev: continue
        prev = char
        res += char
    return res


def sentence_to_indices(sentence_words, word_to_index, max_len, i):
    global X, Y
    sentence_indices = []
    for j, w in enumerate(sentence_words):
        try:
            index = word_to_index[w]
        except:
            UNKS.append(w)
            w = cleared(w)
            try:
                index = word_to_index[w]
            except:
                index = word_to_index['unk']
                unks.append(w)
        X[i, j] = index



list_len = [len(i) for i, j in cleaned_tokens_list]
max_len = max(list_len)
print('max_len:', max_len)

X = np.zeros((len(cleaned_tokens_list), max_len))
Y = np.zeros((len(cleaned_tokens_list), ))

for i, tk_lb in enumerate(cleaned_tokens_list):
    tokens, label = tk_lb
    sentence_to_indices(tokens, word_to_index, max_len, i)
    Y[i] = label

max_len: 146


In [11]:

def embedding_layer(word_to_vec_map, word_to_index, max_len):
    vocab_len = len(word_to_index) + 1
    emb_dim = word_to_vec_map["unk"].shape[0]    
    emb_matrix = np.zeros((vocab_len, emb_dim))    
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]        
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False, input_shape=(max_len,))
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [12]:
model = Sequential()
model.add(embedding_layer(word_to_vec_map, word_to_index, max_len))
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=1, activation='sigmoid'))

model.summary()

2022-02-26 12:04:59.858786: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 146, 50)           20000050  
_________________________________________________________________
bidirectional (Bidirectional (None, 146, 256)          183296    
_________________________________________________________________
dropout (Dropout)            (None, 146, 256)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 146, 256)          394240    
_________________________________________________________________
dense (Dense)                (None, 146, 64)           16448     
_________________________________________________________________
dropout_1 (Dropout)          (None, 146, 64)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 146, 1)            6

In [13]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y)

In [15]:
len(X_train[0])

146

In [16]:
Y

array([1., 1., 1., ..., 1., 0., 1.])

In [17]:
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs = 20, batch_size = 2560, shuffle=True, verbose = 1)

2022-02-26 12:05:02.544334: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3b8c4e3dd0>