### Preprocessing of the SST

In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import sys
sys.path.append("..")
from torchtext import data
from torchtext import datasets

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
!wget -nc --no-check-certificate https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip .
!unzip trainDevTestTrees_PTB.zip
!mv trees/* .
!rm -r trees/
!rm *.zip
!touch __init__.py

--2020-02-17 17:20:34--  https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 789539 (771K) [application/zip]
Saving to: ‘trainDevTestTrees_PTB.zip’


2020-02-17 17:20:36 (426 KB/s) - ‘trainDevTestTrees_PTB.zip’ saved [789539/789539]

--2020-02-17 17:20:36--  http://./
Resolving . (.)... failed: Name or service not known.
wget: unable to resolve host address ‘.’
FINISHED --2020-02-17 17:20:36--
Total wall clock time: 2.4s
Downloaded: 1 files, 771K in 1.8s (426 KB/s)
Archive:  trainDevTestTrees_PTB.zip
   creating: trees/
  inflating: trees/dev.txt           
  inflating: trees/test.txt          
  inflating: trees/train.txt         


In [3]:
import glob

files = glob.glob("*.txt")

files

['train.txt', 'dev.txt', 'test.txt']

In [4]:
import nltk
a = nltk.corpus.BracketParseCorpusReader("", "(train|dev|test)\.txt")

text = {}
labels = {}
keys = ['train', 'dev', 'test']
for k in keys :
    text[k] = [x.leaves() for x in a.parsed_sents(k+'.txt') if x.label() != '2']
    labels[k] = [int(x.label()) for x in a.parsed_sents(k+'.txt') if x.label() != '2']
    print(len(text[k]))
    
import spacy
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
import re

def tokenize(text) :
    text = " ".join(text)
    text = text.replace("-LRB-", '')
    text = text.replace("-RRB-", " ")
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    tokens = " ".join([t.text.lower() for t in nlp(text)])
    return tokens

for k in keys :
    text[k] = [tokenize(t) for t in text[k]]    
    labels[k] = [1 if x >= 3 else 0 for x in labels[k]]

6920
872
1821


In [5]:
import pandas as pd
df_texts = []
df_labels = []
df_exp_split = []

for k in keys :
    df_texts += text[k]
    df_labels += labels[k]
    df_exp_split += [k]*len(text[k])
    
df = pd.DataFrame({'text' : df_texts, 'label' : df_labels, 'exp_split' : df_exp_split}) 

In [6]:
from preprocess_bc import cleaner

df["text"] = df["text"].apply(lambda x: " ".join(cleaner(x)))

In [7]:
df.head(10)

Unnamed: 0,exp_split,label,text
0,train,1,the rock is destined to be the qqq century s n...
1,train,1,the gorgeously elaborate continuation of the l...
2,train,1,singer composer bryan adams contributes a slew...
3,train,1,yet the act is still charming here
4,train,1,whether or not you re enlightened by any of de...
5,train,1,just the labour involved in creating the layer...
6,train,1,part of the charm of satin rouge is that it av...
7,train,1,a screenplay more ingeniously constructed than...
8,train,1,extreme ops exceeds expectations
9,train,1,good fun good action good acting good dialogue...


In [8]:
df.to_csv('sst_dataset.csv', index=False)

In [9]:
from preprocess_bc import extract_vocabulary_

word_to_ix = extract_vocabulary_(min_df = 1, dataframe = df)

df["text"] = df["text"].apply(lambda x: ("<SOS> " + x + " <EOS>").split())

In [10]:
from preprocess_bc import text_to_seq

train_ix = text_to_seq(df[df.exp_split == "train"][["text","label"]].values, word_to_ix)
dev_ix = text_to_seq(df[df.exp_split == "dev"][["text","label"]].values, word_to_ix)
test_ix = text_to_seq(df[df.exp_split == "test"][["text","label"]].values, word_to_ix)

In [11]:
ix_to_word = {v:k for k,v in word_to_ix.items()}

### Preparing our embeddings

In [12]:
from preprocess_bc import pretrained_embeds, DataHolder_BC

pre = pretrained_embeds("fasttext.simple.300d", ix_to_word)

pretrained = pre.processed()

Found  11174  words out of  13686


In [13]:
data = DataHolder_BC(train_ix, dev_ix, test_ix, word_to_ix, embeds =  pretrained)

In [14]:
import pickle

pickle.dump(data, open("data.p", "wb"))