In [8]:
import csv
import pandas as pd
import os

# Base directory for the data files: the kernel's current working directory.
path = os.getcwd()

In [9]:
# Load the SNLI training split (tab-separated) and pull out its three columns.
# NOTE(review): "sentence1" is bound to the hypothesis variable and "sentence2"
# to the premise variable; the sample row printed further down looks like the
# opposite order (premise first) -- confirm the naming against the dataset.
snli_train_file = path + "/SNLI_MNLI_data/snli_train.tsv"
all_raw_data_train = pd.read_csv(snli_train_file, sep='\t')
hypo_train, prem_train, label_train = (
    all_raw_data_train[column] for column in ("sentence1", "sentence2", "label")
)

In [10]:
# Load the SNLI validation split (tab-separated) and pull out its three columns.
# NOTE(review): "sentence1" is bound to the hypothesis variable and "sentence2"
# to the premise variable -- confirm this matches the dataset's column meaning.
snli_val_file = path + "/SNLI_MNLI_data/snli_val.tsv"
all_raw_data_val = pd.read_csv(snli_val_file, sep='\t')
hypo_val, prem_val, label_val = (
    all_raw_data_val[column] for column in ("sentence1", "sentence2", "label")
)

In [11]:
# Encode the string labels of both splits as integer class ids.
d = dict(neutral=0, entailment=1, contradiction=2)
label_index, label_index_val = (
    labels.map(d, na_action='ignore') for labels in (label_train, label_val)
)

In [12]:
# Summarise the training frame (columns, dtypes, non-null counts), then report
# its number of rows.
all_raw_data_train.info()
n_train_rows = all_raw_data_train.shape[0]
print("snli_train dataset size is {}".format(n_train_rows))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
sentence1    100000 non-null object
sentence2    100000 non-null object
label        100000 non-null object
dtypes: object(3)
memory usage: 2.3+ MB
snli_train dataset size is 100000


In [13]:
# Count the distinct hypotheses and premises in the training split and list
# the label values that occur.
n_distinct_hypo_train = hypo_train.nunique(dropna=False)
n_distinct_prem_train = prem_train.nunique(dropna=False)
train_label_values = set(label_train)
print("There are {} distinct hypothesis in training data.".format(n_distinct_hypo_train))
print("There are {} distinct premise in training data.".format(n_distinct_prem_train))
print("The 3 labels are {}.".format(train_label_values))

There are 72305 distinct hypothesis in training data.
There are 93549 distinct premise in training data.
The 3 labels are {'neutral', 'entailment', 'contradiction'}.


In [14]:
# have a glance at the validation data
all_raw_data_val.info()
# The message names the validation split; it previously said "snli_train"
# (copied from the training cell) while reporting the validation row count.
print("snli_val dataset size is {}".format(len(all_raw_data_val)))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
sentence1    1000 non-null object
sentence2    1000 non-null object
label        1000 non-null object
dtypes: object(3)
memory usage: 23.5+ KB
snli_train dataset size is 1000


In [15]:
# Count the distinct hypotheses and premises in the validation split and list
# the label values that occur.
n_distinct_hypo_val = hypo_val.nunique(dropna=False)
n_distinct_prem_val = prem_val.nunique(dropna=False)
val_label_values = set(label_val)
print("There are {} distinct hypothesis in validation data.".format(n_distinct_hypo_val))
print("There are {} distinct premise in validation data.".format(n_distinct_prem_val))
print("The 3 labels are {}.".format(val_label_values))

There are 891 distinct hypothesis in validation data.
There are 999 distinct premise in validation data.
The 3 labels are {'neutral', 'entailment', 'contradiction'}.


In [16]:
# Show one randomly chosen row of the training frame.
# No seed is set, so a different row is printed on each run.
import random
t = random.randrange(0, len(all_raw_data_train))
sampled_row = all_raw_data_train.iloc[t]
print(sampled_row)

sentence1    A man driving a small , open vehicle with a gr...
sentence2                           A man stealing a vehicle .
label                                                  neutral
Name: 31649, dtype: object


# Tokenizing

In [17]:
import spacy 
import string

# spacy.load returns the language pipeline for the installed "en_core_web_sm"
# package; calling it on a string yields the tokens iterated over in tokenize().
tokenizer=spacy.load("en_core_web_sm")
# All ASCII punctuation characters, as a single string.
punctuations=string.punctuation

# lowercase and remove punctuation
def tokenize(sent):
    """Split ``sent`` into lowercased tokens, dropping punctuation-only tokens.

    Relies on two module-level names: ``tokenizer`` (the loaded spaCy pipeline)
    and ``punctuations`` (the punctuation characters to filter on).

    Args:
        sent: The sentence to tokenize, as a string.

    Returns:
        A list of lowercased token strings in their original order.
    """
    # make_doc only runs the tokenizer; the remaining pipeline components are
    # skipped because nothing but ``token.text`` is read below.
    doc = tokenizer.make_doc(sent)
    # Test character by character: ``token.text not in punctuations`` is a
    # substring check on a string, so multi-character punctuation tokens such
    # as "..." or "--" were never filtered out.
    return [
        token.text.lower()
        for token in doc
        if not all(char in punctuations for char in token.text)
    ]

In [18]:
import pickle as pkl

In [19]:
def tokenize_dataset(dataset):
    """Tokenize every sample in ``dataset`` with ``tokenize``.

    Returns:
        A pair ``(token_dataset, all_tokens)``: the per-sample token lists in
        input order, and one flat list of every token across all samples
        (kept so a vocabulary can be built from it later).
    """
    token_dataset = [tokenize(sample) for sample in dataset]
    all_tokens = [token for sample_tokens in token_dataset for token in sample_tokens]
    return token_dataset, all_tokens

def _dump_pickle(obj, file_path):
    """Pickle ``obj`` to ``file_path`` and close the file handle when done."""
    with open(file_path, "wb") as handle:
        pkl.dump(obj, handle)


# NOTE(review): the pickle paths below are absolute (under the filesystem
# root), whereas the input data is read relative to `path` -- confirm that
# "/all_data_pickle" is the intended location.

#train set tokens
print ("Tokenizing hypothesis_train")
hypo_data_tokens_train, all_hypo_data_tokens_train = tokenize_dataset(hypo_train)
_dump_pickle(hypo_data_tokens_train, "/all_data_pickle/hypo_data_tokens_train.p")
_dump_pickle(all_hypo_data_tokens_train, "/all_data_pickle/all_hypo_data_tokens_train.p")


print ("Tokenizing premise_train")
prem_data_tokens_train, all_prem_data_tokens_train = tokenize_dataset(prem_train)
_dump_pickle(prem_data_tokens_train, "/all_data_pickle/prem_data_tokens_train.p")
_dump_pickle(all_prem_data_tokens_train, "/all_data_pickle/all_prem_data_tokens_train.p")


# val set tokens
print ("Tokenizing hypothesis_val")
hypo_data_tokens_val, all_hypo_data_tokens_val = tokenize_dataset(hypo_val)
_dump_pickle(hypo_data_tokens_val, "/all_data_pickle/hypo_data_tokens_val.p")
_dump_pickle(all_hypo_data_tokens_val, "/all_data_pickle/all_hypo_data_tokens_val.p")


print ("Tokenizing premise_val")
prem_data_tokens_val, all_prem_data_tokens_val = tokenize_dataset(prem_val)
_dump_pickle(prem_data_tokens_val, "/all_data_pickle/prem_data_tokens_val.p")
_dump_pickle(all_prem_data_tokens_val, "/all_data_pickle/all_prem_data_tokens_val.p")

#test set tokens
#print ("Tokenizing test data")
#test_data_tokens, _ = tokenize_dataset(test_data)
#pkl.dump(test_data_tokens, open("test_data_tokens.p", "wb"))

#train set tokens
#print ("Tokenizing train data")
#train_data_tokens, all_train_tokens = tokenize_dataset(train_data)
#pkl.dump(train_data_tokens, open("train_data_tokens.p", "wb"))
#pkl.dump(all_train_tokens, open("all_train_tokens.p", "wb"))

Tokenizing hypothesis_train


KeyboardInterrupt: 

# MultiNLI dataset

In [21]:
import pandas as pd
import os
import spacy 
import string
import pickle as pkl


def tokenize_dataset(dataset):
    """Tokenize every sample in ``dataset`` with ``tokenize``.

    Returns:
        A pair ``(token_dataset, all_tokens)``: the per-sample token lists in
        input order, and one flat list of every token across all samples
        (kept so a vocabulary can be built from it later).
    """
    token_dataset = [tokenize(sample) for sample in dataset]
    all_tokens = [token for sample_tokens in token_dataset for token in sample_tokens]
    return token_dataset, all_tokens

# lowercase and remove punctuation
def tokenize(sent):
    """Split ``sent`` into lowercased tokens, dropping punctuation-only tokens.

    Relies on two module-level names: ``tokenizer`` (the loaded spaCy pipeline)
    and ``punctuations`` (the punctuation characters to filter on).

    Args:
        sent: The sentence to tokenize, as a string.

    Returns:
        A list of lowercased token strings in their original order.
    """
    # make_doc only runs the tokenizer; the remaining pipeline components are
    # skipped because nothing but ``token.text`` is read below.
    doc = tokenizer.make_doc(sent)
    # Test character by character: ``token.text not in punctuations`` is a
    # substring check on a string, so multi-character punctuation tokens such
    # as "..." or "--" were never filtered out.
    return [
        token.text.lower()
        for token in doc
        if not all(char in punctuations for char in token.text)
    ]

# Load the full MNLI training split (tab-separated) from the data folder under
# the current working directory.
path = os.getcwd()
mnli_train_file = path + "/SNLI_MNLI_data/mnli_train.tsv"
data_train = pd.read_csv(mnli_train_file, sep='\t')

In [29]:
# The spaCy pipeline, the punctuation string, the label mapping and the
# validation split do not depend on the genre, so they are set up once here
# instead of being rebuilt on every pass of the loop.
tokenizer = spacy.load("en_core_web_sm")
punctuations = string.punctuation
d = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
data_val = pd.read_csv(path + "/SNLI_MNLI_data/mnli_val.tsv", sep='\t')


def _dump_pickle(obj, file_path):
    """Pickle ``obj`` to ``file_path`` and close the file handle when done."""
    with open(file_path, "wb") as handle:
        pkl.dump(obj, handle)


# For each genre: filter both splits down to that genre, tokenize the two
# sentence columns, and pickle the tokens and integer labels under
# genre-suffixed file names.
# NOTE(review): the pickle paths are absolute (under the filesystem root),
# whereas the input data is read relative to `path` -- confirm this is intended.
for genre_choose in ['fiction', 'government', 'slate', 'telephone', 'travel']:

    # Rows of this genre only, without the genre column, re-indexed from 0.
    all_raw_data_train = data_train.loc[data_train.genre == genre_choose].drop(['genre'],axis=1).reset_index(drop=True)
    hypo_train=all_raw_data_train["sentence1"]
    prem_train=all_raw_data_train["sentence2"]
    label_train=all_raw_data_train["label"]

    all_raw_data_val = data_val.loc[data_val.genre == genre_choose].drop(['genre'],axis=1).reset_index(drop=True)
    hypo_val=all_raw_data_val["sentence1"]
    prem_val=all_raw_data_val["sentence2"]
    label_val=all_raw_data_val["label"]

    # String labels -> integer class ids.
    label_index_train =label_train.map(d, na_action='ignore')
    label_index_val =label_val.map(d, na_action='ignore')

    #train set tokens
    print ("Tokenizing hypothesis_train")
    hypo_data_tokens_train, all_hypo_data_tokens_train = tokenize_dataset(hypo_train)
    _dump_pickle(hypo_data_tokens_train, "/all_data_pickle/hypo_data_tokens_train_{}.p".format(genre_choose))
    _dump_pickle(all_hypo_data_tokens_train, "/all_data_pickle/all_hypo_data_tokens_train_{}.p".format(genre_choose))


    print ("Tokenizing premise_train")
    prem_data_tokens_train, all_prem_data_tokens_train = tokenize_dataset(prem_train)
    _dump_pickle(prem_data_tokens_train, "/all_data_pickle/prem_data_tokens_train_{}.p".format(genre_choose))
    _dump_pickle(all_prem_data_tokens_train, "/all_data_pickle/all_prem_data_tokens_train_{}.p".format(genre_choose))


    # val set tokens
    print ("Tokenizing hypothesis_val")
    hypo_data_tokens_val, all_hypo_data_tokens_val = tokenize_dataset(hypo_val)
    _dump_pickle(hypo_data_tokens_val, "/all_data_pickle/hypo_data_tokens_val_{}.p".format(genre_choose))
    _dump_pickle(all_hypo_data_tokens_val, "/all_data_pickle/all_hypo_data_tokens_val_{}.p".format(genre_choose))


    print ("Tokenizing premise_val")
    prem_data_tokens_val, all_prem_data_tokens_val = tokenize_dataset(prem_val)
    _dump_pickle(prem_data_tokens_val, "/all_data_pickle/prem_data_tokens_val_{}.p".format(genre_choose))
    _dump_pickle(all_prem_data_tokens_val, "/all_data_pickle/all_prem_data_tokens_val_{}.p".format(genre_choose))

    _dump_pickle(label_index_train, "/all_data_pickle/label_index_train_{}.p".format(genre_choose))
    _dump_pickle(label_index_val, "/all_data_pickle/label_index_val_{}.p".format(genre_choose))

    print('{} done!\n'.format(genre_choose))

Tokenizing hypothesis_train
Tokenizing premise_train
Tokenizing hypothesis_val
Tokenizing premise_val
fiction done!

Tokenizing hypothesis_train
Tokenizing premise_train
Tokenizing hypothesis_val
Tokenizing premise_val
government done!

Tokenizing hypothesis_train
Tokenizing premise_train
Tokenizing hypothesis_val
Tokenizing premise_val
slate done!

Tokenizing hypothesis_train
Tokenizing premise_train
Tokenizing hypothesis_val
Tokenizing premise_val
telephone done!

Tokenizing hypothesis_train
Tokenizing premise_train
Tokenizing hypothesis_val
Tokenizing premise_val
travel done!

