In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
import sys, glob, os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from collections import OrderedDict
from six.moves import zip
import ipywidgets, stanza, json

# Fetch the English stanza resources, then build a pipeline restricted to the
# 'tokenize' processor with GPU use requested.
stanza.download('en')
stanford_tokenizer = stanza.Pipeline(
    'en',
    processors='tokenize',
    use_gpu=True,
)

def customPTBTokenizer(texts, isLower=True, oov_token=None, chunk_size=5000):
    """Build an index -> word vocabulary from ``texts`` with the stanza tokenizer.

    Documents are joined ``chunk_size`` at a time, tokenized by the
    module-level ``stanford_tokenizer`` pipeline, and the resulting tokens are
    counted. Words are numbered from 1 in order of decreasing frequency; words
    with equal counts keep the order in which they were first seen.

    Args:
        texts: Sliceable sequence of documents. Each item is passed through
            ``str`` before any further processing.
        isLower: When true (the default), lower-case every document before
            tokenizing.
        oov_token: Optional out-of-vocabulary token. When given it is placed
            at the front of the vocabulary and therefore receives index 1.
        chunk_size: Number of documents joined into a single tokenizer call.

    Returns:
        dict mapping each 1-based integer index to its word string.
    """
    word_counts = OrderedDict()
    for start in range(0, len(texts), chunk_size):
        # str() first so that non-string items cannot break .lower().
        docs = [str(doc) for doc in texts[start:start + chunk_size]]
        if isLower:
            docs = [doc.lower() for doc in docs]
        text = ' '.join(docs)

        # to_dict() is iterated as a list of sentences, each a list of token
        # dicts that carry the surface form under 'text'.
        for sentence in stanford_tokenizer(text).to_dict():
            for token in sentence:
                w = token['text']
                word_counts[w] = word_counts.get(w, 0) + 1
        # Progress indicator: vocabulary size after each chunk.
        print(len(word_counts))

    # sorted() is stable, so ties retain first-seen order.
    wcounts = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    sorted_voc = [] if oov_token is None else [oov_token]
    sorted_voc.extend(word for word, _ in wcounts)

    # Index the words themselves (not the (word, count) pairs) so the result
    # maps index -> word and the optional oov_token is included.
    word_index = {word: idx for idx, word in enumerate(sorted_voc, start=1)}
    index_word = {idx: word for word, idx in word_index.items()}

    return index_word
    
aclImdb_path = '../../../../Datasets/7_nlp/0_aclImdb_v1/csv'

# Read the training and validation CSV files from the dataset directory.
aclImdb_train, aclImdb_val = (
    pd.read_csv(os.path.join(aclImdb_path, csv_name))
    for csv_name in ('Train.csv', 'Valid.csv')
)

# Pull the raw text and label columns out of each frame as NumPy arrays.
x_train, y_train = (aclImdb_train[column].values for column in ('text', 'label'))
x_val, y_val = (aclImdb_val[column].values for column in ('text', 'label'))

# Vocabulary built with the stanza-based tokenizer over the training texts.
x_train_test = customPTBTokenizer(x_train)

# Fit the Keras tokenizer on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_train))

# Convert every text into its sequence of integer word indices.
x_train_seq, x_val_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_val)
)

# Same sequences forced to a common length of 100.
x_train_seq_pad, x_val_seq_pad = (
    pad_sequences(sequences, maxlen=100) for sequences in (x_train_seq, x_val_seq)
)

# One extra slot on top of the fitted vocabulary, reserved for padding.
size_of_vocabulary = len(tokenizer.word_index) + 1
print(size_of_vocabulary)

# Load the whole GloVe embedding file into memory as {word: float32 vector}.
embeddings_index = dict()

# The context manager closes the file even if a line fails to parse.
with open('../../../../Datasets/7_nlp/pretrained_models/glove.6B/glove.6B.300d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to index (values[0] would raise IndexError).
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

# Weight matrix with one 300-wide row per vocabulary index; rows stay all-zero
# for index 0 and for words that have no pretrained vector.
embedding_matrix = np.zeros((size_of_vocabulary, 300))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: leave its row as zeros.
        continue
    embedding_matrix[i] = embedding_vector

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-03-24 21:57:51 INFO: Downloading default packages for language: en (English)...
2022-03-24 21:57:52 INFO: File exists: C:\Users\chaut\stanza_resources\en\default.zip.
2022-03-24 21:57:56 INFO: Finished downloading models and saved to C:\Users\chaut\stanza_resources.
2022-03-24 21:57:56 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2022-03-24 21:57:57 INFO: Use device: gpu
2022-03-24 21:57:57 INFO: Loading: tokenize
2022-03-24 21:58:00 INFO: Done loading processors!


43570
61153
74289
84918
94428
103285
111220
118449
112204
Loaded 400000 word vectors.


In [2]:
import sys
from keras.models import *
from keras.layers import *
from keras.callbacks import *
# NO_IN / NO_OUT: values written as the header row of each exported CSV;
# NO_IN is also the width passed to the Embedding layer.
NO_IN, NO_OUT = 300, 1
# TIME_STEPS: window length used by prePadding; PADDING_THRESHOLD: remainder
# length above which a partial window is zero-padded instead of dropped.
TIME_STEPS, PADDING_THRESHOLD = 100, 50

def prePadding(data, label, time_step=100, padding_threshold=60):
    """Cut each record into fixed-length windows, repeating its label per window.

    For every record the remainder ``len(record) % time_step`` decides how the
    leading partial window is handled:

    * remainder > ``padding_threshold``: zeros are prepended so the record
      length becomes a multiple of ``time_step``;
    * otherwise: the first ``remainder`` items are discarded.

    The aligned record is then split into consecutive windows of ``time_step``
    items. A record that yields no window contributes nothing to the output.

    Args:
        data: Sequence of records; each record is a list (it is concatenated
            with a list of zeros and sliced).
        label: Sequence indexed in parallel with ``data``.
        time_step: Window length.
        padding_threshold: Remainder length above which padding is applied.

    Returns:
        Tuple ``(windows, labels)`` where ``windows`` is ``np.array`` of the
        collected windows and ``labels`` is a column array of shape ``(-1, 1)``
        holding one label per window.
    """
    windows = []
    window_labels = []

    for row, tokens in enumerate(data):
        remainder = len(tokens) % time_step
        if remainder > padding_threshold:
            # Long enough partial window: left-pad with zeros to keep it.
            aligned = [0] * (time_step - remainder) + tokens
        else:
            # Short partial window: drop the leading remainder.
            aligned = tokens[remainder:]

        usable = (len(aligned) // time_step) * time_step
        for start in range(0, usable, time_step):
            windows.append(aligned[start:start + time_step])
            window_labels.append(label[row])

    return np.array(windows), np.array(window_labels).reshape(-1, 1)

# Window the integer sequences (and repeat their labels) for both splits.
x_train_seq_padded, y_train_padded = prePadding(
    x_train_seq, y_train, time_step=TIME_STEPS, padding_threshold=PADDING_THRESHOLD
)
x_val_seq_padded, y_val_padded = prePadding(
    x_val_seq, y_val, time_step=TIME_STEPS, padding_threshold=PADDING_THRESHOLD
)


def _write_embedded_csv(csv_path, embedding, sequences, labels, time_steps):
    """Write one split to ``csv_path``: a header row, then one row per window.

    The header row is ``NO_IN,NO_OUT``. Each data row is the window's embedded
    time steps flattened, with the window's label appended to every time step.

    Args:
        csv_path: Destination file; overwritten if it exists.
        embedding: Callable applied to ``sequences`` (the Embedding layer).
        sequences: Windowed index array; ``shape[0]`` is the number of rows.
        labels: Column array of labels, one per window (shape ``(n, 1)`` as
            returned by ``prePadding``).
        time_steps: Number of times each label is repeated along the time axis.
    """
    embedded = embedding(sequences)
    # (n, 1) -> tile -> (n, time_steps) -> expand_dims -> (n, time_steps, 1),
    # so it can be concatenated onto the last axis of the embedded windows.
    label_column = tf.convert_to_tensor(
        np.expand_dims(np.tile(labels, time_steps), axis=2), dtype=tf.float16
    )
    rows = tf.reshape(
        tf.concat([embedded, label_column], axis=2), [sequences.shape[0], -1]
    ).numpy()

    # Rows are computed before the file is opened, so a failure above does not
    # leave a header-only file behind. One handle serves header and data.
    with open(csv_path, 'w') as csvfile:
        np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]), fmt='%d', delimiter=",")
        np.savetxt(csvfile, rows, delimiter=",")


# NOTE(review): `tf` is not imported explicitly in the visible cells; it
# presumably reaches the namespace through the `from keras... import *` lines.
# Confirm, or add an explicit `import tensorflow as tf` to the imports cell.
with tf.device('cpu:0'):
    embedding_layer = Embedding(size_of_vocabulary,NO_IN,weights=[embedding_matrix],input_length=TIME_STEPS,trainable=False, dtype=tf.float16)
    embedding_layer.build(x_train_seq_padded.shape)

    _write_embedded_csv(
        fr"{aclImdb_path}/aclImdb.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.pt={PADDING_THRESHOLD}.train.csv",
        embedding_layer, x_train_seq_padded, y_train_padded, TIME_STEPS,
    )
    _write_embedded_csv(
        fr"{aclImdb_path}/aclImdb.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.pt={PADDING_THRESHOLD}.val.csv",
        embedding_layer, x_val_seq_padded, y_val_padded, TIME_STEPS,
    )