In [14]:
# Colab-only setup: mount Google Drive at /content/gdrive so files under it are reachable.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Tell the Kaggle CLI which directory to use for its configuration
# (it reads its credentials file from KAGGLE_CONFIG_DIR).
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Make the Drive folder the working directory so the shell cells below operate inside it.
%cd /content/gdrive/My Drive/Kaggle

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Kaggle


In [0]:
# Sanity check: show the current working directory after the %cd in the setup cell.
pwd

'/content/gdrive/My Drive/Kaggle'

In [0]:
# Download the IMDB sentiment dataset archive from Kaggle into the current working directory.
!kaggle datasets download -d columbine/imdb-dataset-sentiment-analysis-in-csv-format

Downloading imdb-dataset-sentiment-analysis-in-csv-format.zip to /content/gdrive/My Drive/Kaggle
 90% 23.0M/25.7M [00:00<00:00, 41.2MB/s]
100% 25.7M/25.7M [00:00<00:00, 58.0MB/s]


In [0]:
# Extract every zip archive in the working directory, then delete the archives.
# The backslash stops the shell from expanding *.zip, so unzip does the wildcard matching itself;
# `rm` only runs if unzip exits successfully (&&).
!unzip \*.zip && rm *.zip

Archive:  glove.6B.100d.txt.zip
  inflating: glove.6B.100d.txt       
  inflating: __MACOSX/._glove.6B.100d.txt  


In [1]:
import pandas as pd
import numpy as np
from keras.layers import Embedding, LSTM, Dropout, Activation, Dense, BatchNormalization, Input
from keras.models import Model
from keras.optimizers import Adam
import re 

Using TensorFlow backend.


In [0]:
# Locations of the three dataset splits and the GloVe vector file on the mounted Drive.
train_path = "/content/gdrive/My Drive/Kaggle/Train.csv"
test_path = "/content/gdrive/My Drive/Kaggle/Test.csv"
valid_path = "/content/gdrive/My Drive/Kaggle/Valid.csv"
glove_path = "/content/gdrive/My Drive/Kaggle/glove.6B.100d.txt"

# Read each split into its own DataFrame (same read order as before: train, test, valid).
train_df, test_df, valid_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, valid_path)
)

# Review bodies come from the 'text' column of each split.
train_texts, test_texts, valid_texts = (
    np.asarray(frame['text']) for frame in (train_df, test_df, valid_df)
)

# Targets come from the 'label' column of each split.
train_labels, test_labels, valid_labels = (
    np.array(frame['label']) for frame in (train_df, test_df, valid_df)
)

In [0]:
# Any single non-word character (punctuation, whitespace, symbols); replaced by a space.
NON_ALPHANUM = re.compile(r'[\W]')
# Anything that is not a lowercase ASCII letter, a digit or whitespace; removed outright.
# The digit range is 0-9: the previous 0-1 range silently dropped the digits 2-9.
NON_ASCII = re.compile(r'[^a-z0-9\s]')
def clean_texts(texts):
    """Normalise raw review strings for vocabulary lookup.

    Each text is lower-cased, every non-word character is replaced by a
    single space, and any remaining character outside ``a-z``, ``0-9`` and
    whitespace is deleted. Underscores count as word characters for the
    first pass, so they are removed in the second pass without leaving a
    space.

    Parameters
    ----------
    texts : iterable of str
        Raw texts to normalise.

    Returns
    -------
    numpy.ndarray
        Array of normalised strings, in the same order as ``texts``.
    """
    normalized_texts = []
    for text in texts:
        lowered = text.lower()
        # Space (not empty string) so words joined only by punctuation stay separate tokens.
        spaced = NON_ALPHANUM.sub(r' ', lowered)
        normalized_texts.append(NON_ASCII.sub(r'', spaced))
    return np.asarray(normalized_texts)
        
# Normalise the raw training reviews once; the cleaned array is what gets encoded to indices later.
train_texts_clean = clean_texts(train_texts)

In [0]:
def get_glove_vectors(file):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is expected to contain a token followed by its
    vector components, separated by whitespace.

    Parameters
    ----------
    file : str or path-like
        Path to the vector file.

    Returns
    -------
    word_to_vec_map : dict
        Maps each token to a 1-D ``float64`` numpy array.
    word_to_idx : dict
        Maps each token to an integer index. Indices start at 1 and follow
        the sorted order of the tokens, so index 0 is never assigned.
    idx_to_word : dict
        Inverse of ``word_to_idx``.
    """
    word_to_vec_map = {}
    # Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline) instead of failing on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    word_to_idx = {}
    idx_to_word = {}
    # Sorting makes the token -> index assignment deterministic across runs.
    for i, word in enumerate(sorted(word_to_vec_map), start=1):
        word_to_idx[word] = i
        idx_to_word[i] = word
    return word_to_vec_map , word_to_idx , idx_to_word

# Load the GloVe table once: token -> vector, token -> index, and index -> token.
word_to_vec , word_idx , idx_word = get_glove_vectors(glove_path)


In [0]:
def text_to_indices(texts , word_idx , max_len):
    """Encode texts as fixed-width rows of vocabulary indices.

    Each text is lower-cased and split on whitespace. The first ``max_len``
    words are looked up in ``word_idx``; a word's index is written at the
    word's own position in the row. Words missing from ``word_idx`` and
    positions past the end of a short text are left as 0.

    Parameters
    ----------
    texts : sequence of str
        Texts to encode (numpy array or plain list).
    word_idx : dict
        Maps a word to its integer index.
    max_len : int
        Number of word positions per row; longer texts are truncated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts), max_len)`` with numpy's default float dtype.
    """
    m = len(texts)
    text_idx = np.zeros((m, max_len))
    for i in range(m):
        # Slicing bounds the loop up front, so max_len == 0 yields an empty row
        # instead of writing out of range.
        for j, word in enumerate(texts[i].lower().split()[:max_len]):
            idx = word_idx.get(word)
            if idx is not None:
                text_idx[i, j] = idx

    return text_idx

In [0]:
def pretrained_embedding_layer(word_idx , word_to_vec):
    """Build a frozen Keras ``Embedding`` layer initialised from word vectors.

    Parameters
    ----------
    word_idx : dict
        Maps each word to its row in the embedding matrix. Row 0 is never
        written here and stays all zeros.
    word_to_vec : dict
        Maps each word to its 1-D vector. Every word in ``word_idx`` must
        be present.

    Returns
    -------
    Embedding
        Layer with ``len(word_idx) + 1`` rows, created with ``trainable=False``.

    Raises
    ------
    ValueError
        If ``word_to_vec`` is empty, since the vector size cannot be inferred.
    KeyError
        If a word in ``word_idx`` has no entry in ``word_to_vec``.
    """
    if not word_to_vec:
        raise ValueError("word_to_vec is empty; cannot infer the embedding dimension")

    # +1 because row 0 is kept in addition to one row per indexed word.
    vocab_len = len(word_idx) + 1
    # Take the dimension from any stored vector rather than requiring a specific token to exist.
    emb_dim = next(iter(word_to_vec.values())).shape[0]
    emb_matrix = np.zeros((vocab_len , emb_dim))
    for word , idx in word_idx.items():
        emb_matrix[idx , :] = word_to_vec[word]

    emb_layer = Embedding(vocab_len , emb_dim , weights=[emb_matrix] , trainable = False)
    return emb_layer

In [0]:
def model(input_shape , word_idx , word_to_vec):
    """Assemble the LSTM classifier on top of the pretrained embedding layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample (the row of word indices), excluding the batch axis.
    word_idx : dict
        Word -> index mapping, forwarded to ``pretrained_embedding_layer``.
    word_to_vec : dict
        Word -> vector mapping, forwarded to ``pretrained_embedding_layer``.

    Returns
    -------
    Model
        Uncompiled Keras model: int32 index input -> embedding -> LSTM(64)
        -> batch normalisation -> dropout(0.3) -> Dense(1) -> sigmoid.
    """
    token_ids = Input(input_shape , dtype = 'int32')

    glove_lookup = pretrained_embedding_layer(word_idx , word_to_vec)
    sequence_vectors = glove_lookup(token_ids)

    # return_sequences=False: only the final LSTM output feeds the classifier head.
    hidden = LSTM(64 , return_sequences = False)(sequence_vectors)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.3)(hidden)
    logit = Dense(1)(hidden)
    probability = Activation('sigmoid')(logit)

    return Model(inputs = token_ids , outputs = probability)

In [0]:
# Number of training reviews.
m = train_texts.shape[0]
# Length of the longest raw training review. len() of a string counts characters,
# so this is a character count, not a word count.
MaxLen = max(len(review) for review in train_texts)

In [0]:
# Fixed sequence length used from here on: it is passed to text_to_indices as the number of
# word positions per review and to the model as its input length. This overrides any value
# computed earlier.
MaxLen = 200

In [10]:
# Encode the cleaned training reviews as rows of MaxLen vocabulary indices.
X_train = text_to_indices(train_texts_clean , word_idx , MaxLen)
# NOTE(review): this rebinds the name `model` from the builder function to the network it
# returns. Re-running this cell without first re-running the cell that defines `model` will
# call the network object instead of the builder - TODO consider distinct names.
model = model((MaxLen,) , word_idx , word_to_vec)
# Single sigmoid output, so the loss is binary cross-entropy; accuracy is tracked as a metric.
model.compile(loss = 'binary_crossentropy' , optimizer = Adam(lr = 0.01) , metrics = ['accuracy'])
# Train on the training split only; no validation data is passed to fit here.
model.fit(X_train , train_labels , epochs = 20 , batch_size = 64 , shuffle = True , verbose = 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f949793fd68>

In [18]:
# Apply the same cleaning and index encoding used for training to the validation split,
# then report the compiled loss and accuracy on it.
valid_texts_clean = clean_texts(valid_texts)
X_valid = text_to_indices(valid_texts_clean , word_idx , MaxLen)
model.evaluate(X_valid , valid_labels , batch_size= 32 , verbose = 1)



[0.4042071200311184, 0.8575999736785889]

In [20]:
# Apply the same cleaning and index encoding used for training to the test split,
# then report the compiled loss and accuracy on it.
test_texts_clean = clean_texts(test_texts)
# Stored under its own name so the validation features in X_valid are not overwritten.
X_test = text_to_indices(test_texts_clean , word_idx , MaxLen)
model.evaluate(X_test , test_labels , batch_size= 32 , verbose = 1)



[0.4507545642375946, 0.8489999771118164]