In [1]:
# Import and data read

import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
reviews = "".join([char for char in reviews if char not in string.punctuation])

all_words = nltk.word_tokenize(reviews)
all_words = [word for word in all_words if word.lower() not in stop_words]


reviews = reviews.split('\n')
labels = labels.split('\n')

In [3]:
reviews_tokenized = []
for review in reviews:
  splitted_review = nltk.word_tokenize(review)
  splitted_review = [PorterStemmer().stem(w).strip() for w in splitted_review]
  splitted_review = [WordNetLemmatizer().lemmatize(w) for w in splitted_review]
  reviews_tokenized.append(splitted_review)
  
labels = [1 if label == "positive" else 0 for label in labels]

In [4]:
empty_idx = []
for i, review in enumerate(reviews_tokenized):
  if len(review) == 0:
    empty_idx.append(i)
    
for i in empty_idx:
  reviews_tokenized.pop(i)
  labels.pop(i)

In [5]:
empty_idx

[25000]

In [6]:
vocab_size = 2000

word_counter = Counter(all_words)
word_counter = dict(word_counter.most_common(vocab_size))
word2index = {k:i for i,k in enumerate(word_counter.keys(), start = 1)}

reviews_int = []
for review in reviews_tokenized:
  cur_review = []
  for word in review:
    if word in word2index.keys():
      cur_review.append(word2index[word])
  reviews_int.append(cur_review)

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_reviews = pad_sequences(reviews_int, maxlen = 200)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_reviews, labels, test_size = 0.2, random_state = 1)

# X_train = X_train.reshape(20000, 200, 1)

y_train = np.array(y_train).reshape(20000, 1)
y_test = np.array(y_test).reshape(5000, 1)

In [30]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, RNN, GRU, LSTM, TimeDistributed, Bidirectional, Activation, Embedding, Input, Conv1D, Dropout
import tensorflow as tf


inputs = Input(shape = (X_train.shape[1:]))
x = Embedding(input_dim = vocab_size, output_dim = 128, input_length = 200)(inputs)
x = Conv1D(filters = 200, kernel_size = 13, strides = 1, padding = 'same', activation = 'relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = GRU(128, return_sequences = True)(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = GRU(128, return_sequences = False)(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(512, activation = 'relu')(x)
x = Dropout(0.3)(x)
outputs = Dense(1, activation = 'sigmoid')(x)

model = Model(inputs = inputs, outputs = outputs)

model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 200)]             0         
                                                                 
 embedding_10 (Embedding)    (None, 200, 128)          256000    
                                                                 
 conv1d_7 (Conv1D)           (None, 200, 200)          333000    
                                                                 
 dropout (Dropout)           (None, 200, 200)          0         
                                                                 
 gru_14 (GRU)                (None, 200, 128)          126720    
                                                                 
 dropout_1 (Dropout)         (None, 200, 128)          0         
                                                                 
 gru_15 (GRU)                (None, 128)               9907

In [None]:
history = model.fit(X_train, y_train, epochs = 100, batch_size = 128, validation_data = (X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

In [9]:
len(all_words)

3088793

In [10]:
len(word_counter)

2000

In [11]:
len(word2index)

2000

In [12]:
for review in reviews:
  if '0' in review:
    print('hi')

In [20]:
reviews_tokenized[-1]

[]

In [18]:
len(y_test)

5001

In [35]:
my_list = [1,2,3]

my_list.pop(1)

2

In [36]:
my_list

[1, 3]