### Preprocessing of the IMDB dataset

In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import sys
sys.path.append("..")

In [2]:
!wget -nc https://s3.amazonaws.com/text-datasets/imdb_full.pkl .
!wget -nc https://s3.amazonaws.com/text-datasets/imdb_word_index.json .

--2021-05-06 21:56:44--  https://s3.amazonaws.com/text-datasets/imdb_full.pkl
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.170.24
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.170.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65552540 (63M) [application/octet-stream]
Saving to: ‘imdb_full.pkl’


2021-05-06 21:57:18 (1.85 MB/s) - ‘imdb_full.pkl’ saved [65552540/65552540]

--2021-05-06 21:57:18--  http://./
Resolving . (.)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘.’
FINISHED --2021-05-06 21:57:18--
Total wall clock time: 34s
Downloaded: 1 files, 63M in 34s (1.85 MB/s)
--2021-05-06 21:57:18--  https://s3.amazonaws.com/text-datasets/imdb_word_index.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.19.11
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.19.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1641221 (1.6M) [application/octet-stream]
Sa

In [3]:
import pickle
import json

# NOTE(review): pickle.load can execute arbitrary code embedded in the file, and
# this file was just downloaded over the network -- only run against a trusted source.
# Context managers make sure both file handles are closed once loading is done.
with open('imdb_full.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

with open('imdb_word_index.json') as vocab_file:
    vocab = json.load(vocab_file)

In [4]:
# Reverse lookup of `vocab`: index -> word (a later duplicate index overwrites an earlier one).
inv = dict(zip(vocab.values(), vocab.keys()))

In [5]:
from sklearn.model_selection import train_test_split


# `data` holds two (documents, labels) pairs: the train half and the test half.
train_part, test_part = data
X_train, y_train = train_part
Xt, yt = test_part


def _short_doc_indices(docs, max_len=400):
    """Return the positions of the documents with fewer than ``max_len`` entries."""
    return [pos for pos, doc in enumerate(docs) if len(doc) < max_len]


def _pick(items, positions):
    """Return ``items`` restricted to ``positions``, in the order given."""
    return [items[pos] for pos in positions]


# 80/20 train/dev split over the short training documents (fixed seed).
trainidx, devidx = train_test_split(
    _short_doc_indices(X_train), train_size=0.8, random_state=1378
)
X, y = _pick(X_train, trainidx), _pick(y_train, trainidx)
Xd, yd = _pick(X_train, devidx), _pick(y_train, devidx)

# Only 20% of the short test documents are kept; `remaining_idx` is not used afterwards.
testidx, remaining_idx = train_test_split(
    _short_doc_indices(Xt), train_size=0.2, random_state=1378
)
Xt, yt = _pick(Xt, testidx), _pick(yt, testidx)

def invert_and_join(X) :
    """Map every document in ``X`` from indices back to words and join them with spaces.

    Each index is looked up in the module-level ``inv`` dictionary; an index that is
    missing from ``inv`` raises ``KeyError``. Returns one string per input document.
    """
    return [" ".join(inv[token_id] for token_id in doc) for doc in X]

# Replace the index-encoded train / dev / test documents with their text form.
X, Xd, Xt = map(invert_and_join, (X, Xd, Xt))

In [6]:
# Group documents and labels by split name (insertion order: train, test, dev).
texts = dict(train=X, test=Xt, dev=Xd)
labels = dict(train=y, test=yt, dev=yd)

In [9]:
import pandas as pd
from preprocess_bc import cleaner

split_order = ('train', 'test', 'dev')

# Flatten the per-split lists into three parallel columns; every row is tagged
# with the name of the split it came from.
df_texts = [doc for key in split_order for doc in texts[key]]
df_labels = [label for key in split_order for label in labels[key]]
df_exp_splits = [key for key in split_order for _ in texts[key]]

df = pd.DataFrame(dict(text=df_texts, label=df_labels, exp_split=df_exp_splits))


def _clean_text(raw_text):
    """Run ``cleaner`` over one document and re-join its output with single spaces."""
    return " ".join(cleaner(raw_text))


df["text"] = df["text"].map(_clean_text)

# Persist the cleaned dataset without the positional index column.
df.to_csv('imdb_dataset.csv', index=False)

In [10]:
# Preview the first ten rows of the assembled frame.
df.iloc[:10]

Unnamed: 0,text,label,exp_split
0,if you have any kind of heart and compassion f...,1,train
1,the name of this film alone made me want to se...,1,train
2,bugs life is a good film but to me it does n't...,1,train
3,rachel griffiths writes and directs this award...,1,train
4,after eagerly waiting to the end i have to say...,0,train
5,this was the most pointless film i have ever s...,0,train
6,how can you go wrong with the amazing ramones ...,1,train
7,a true story about a true revolution qqq of ap...,1,train
8,i watched this movie every chance i got back i...,1,train
9,everybody 's got bills to pay and that include...,1,train


In [11]:
# Total number of rows across the train, test and dev splits.
df.shape[0]

25879

### Forming our vocabulary, indexing the data and saving it

In [12]:
from preprocess_bc import extract_vocabulary_

# Build the vocabulary from the texts held in `df`.
# NOTE(review): the result is used below as a word -> index dict; min_df=10 presumably
# drops words that occur in fewer than 10 documents -- confirm against preprocess_bc.
word_to_ix = extract_vocabulary_(min_df = 10, dataframe = df)

In [13]:
def _wrap_and_tokenize(text):
    """Surround ``text`` with the <SOS>/<EOS> markers and split it on whitespace."""
    return ("<SOS> " + text + " <EOS>").split()


# After this cell the "text" column holds token lists instead of strings.
df["text"] = df["text"].map(_wrap_and_tokenize)

In [14]:
from preprocess_bc import text_to_seq

# Reverse of `word_to_ix`: index -> word.
ix_to_word = {index: word for word, index in word_to_ix.items()}


def _encode_split(split_name):
    """Pass the (text, label) rows of one split through ``text_to_seq``."""
    rows = df.loc[df.exp_split == split_name, ["text", "label"]].values
    return text_to_seq(rows, word_to_ix)


train_ix = _encode_split("train")
dev_ix = _encode_split("dev")
test_ix = _encode_split("test")

### Preparing our embeddings

In [15]:
from preprocess_bc import pretrained_embeds, DataHolder_BC

# Look up pretrained vectors for the vocabulary; the first argument names the embedding set.
# NOTE(review): the recorded output shows a ~2 GB download into ../.vector_cache on this
# run -- presumably cached for later runs; confirm before re-executing on a slow link.
pre = pretrained_embeds("glove.840B.300d", ix_to_word)

pretrained = pre.processed()

../.vector_cache/glove.840B.300d.zip: 2.18GB [20:39, 1.76MB/s]                                
100%|█████████▉| 2196016/2196017 [02:37<00:00, 13930.22it/s]


Found  12101  words out of  12147


In [16]:
# Bundle the three indexed splits, the vocabulary and the pretrained embeddings into one object.
# NOTE(review): this rebinds `data`, which earlier in the notebook held the raw IMDB
# pickle; re-running the split cell after this one would no longer see the raw tuple.
# Consider a distinct name here.
data = DataHolder_BC(train_ix, dev_ix, test_ix, word_to_ix, embeds =  pretrained)

In [17]:
import pickle

# Serialize the holder object to disk; the context manager makes sure the file is
# flushed and closed as soon as the dump finishes.
with open("data.p", "wb") as out_file:
    pickle.dump(data, out_file)

In [18]:
!rm imdb_word_index*
!rm imdb_full*