# Quora

In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm

## Load and Preprocess Data

In [14]:
class DataLoadTransformer:
    """Reads the train/test CSV files and converts the question text into
    padded integer sequences.

    Typical use: ``load(...)`` first, then ``transform(...)``.
    """

    def __init__(self, seed=123):
        """Remember the seed that ``transform`` uses when shuffling."""
        self.seed = seed

    def load(self, file_names):
        """Read both CSV files into ``self.train_df`` / ``self.test_df``.

        Args:
            file_names: mapping with the keys ``'train'`` and ``'test'``,
                each holding a path accepted by ``pd.read_csv``.
        """
        for split in ('train', 'test'):
            setattr(self, split + '_df', pd.read_csv(file_names[split]))

        for label, frame in (("Train", self.train_df), ("Test", self.test_df)):
            print(label + " shape : ", frame.shape)

    def transform(self, max_features=90000, max_len=100):
        """Tokenize, pad and shuffle the loaded data.

        Args:
            max_features: forwarded to the Keras ``Tokenizer`` as ``num_words``.
            max_len: forwarded to ``pad_sequences`` as ``maxlen``.

        Returns:
            Tuple ``(train_X, test_X, train_y, sequences_to_texts)`` where the
            last item is the fitted tokenizer's ``sequences_to_texts`` method.
            ``train_X`` and ``train_y`` are shuffled with the same permutation.
        """
        # keras is imported here, at call time, rather than at module level
        from keras.preprocessing.text import Tokenizer
        from keras.preprocessing.sequence import pad_sequences

        # Raw question strings; missing entries become the '_##_' placeholder
        train_questions = self.train_df['question_text'].fillna('_##_').values
        test_questions = self.test_df['question_text'].fillna('_##_').values

        # The vocabulary is fitted on the training questions only
        tok = Tokenizer(num_words=max_features)
        tok.fit_on_texts(list(train_questions))
        self.word_index = tok.word_index

        # Text -> integer ids -> fixed-length rows
        self.train_X = pad_sequences(tok.texts_to_sequences(train_questions), maxlen=max_len)
        self.test_X = pad_sequences(tok.texts_to_sequences(test_questions), maxlen=max_len)

        # Training labels
        self.train_y = self.train_df['target'].values

        # Shuffle inputs and labels together (this seeds numpy's global RNG)
        np.random.seed(self.seed)
        order = np.random.permutation(len(self.train_X))
        self.train_X = self.train_X[order]
        self.train_y = self.train_y[order]

        print("Data transformed.")
        return self.train_X, self.test_X, self.train_y, tok.sequences_to_texts

## Load embeddings & transform text to vectors

In [46]:
class Embeddings:
    """Pre-trained word vectors read from a text file, plus helpers that turn
    text into arrays of those vectors.

    Attributes set by ``load``:
        embeddings_index: dict mapping a word to its float32 coefficient array.
        embed_len: number of coefficients on the last parsed line.
    """

    def load(self, embedding_file_name):
        """Parse a space-separated embedding file (``word c1 c2 ...`` per line).

        Args:
            embedding_file_name: path to the UTF-8 text file.

        Raises:
            ValueError: if the file contains no embedding line at all.
        """
        self.embeddings_index = {}
        coef = None
        # `with` guarantees the handle is closed even if a line fails to parse.
        with open(embedding_file_name, encoding='utf-8') as f:
            for line in tqdm(f):
                # Split on a single space only: some tokens may contain other
                # whitespace characters that a bare split() would break apart.
                values = line.rstrip('\n').split(' ')
                if len(values) < 2:
                    continue  # blank line or a token without coefficients
                word = values[0]
                coef = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coef
        if coef is None:
            raise ValueError("No embeddings found in %r" % (embedding_file_name,))
        self.embed_len = len(coef)  # length of embeddings

    def text_to_vec(self, text, max_text_len=30):
        """Convert one text into a ``(max_text_len, embed_len)`` float32 array.

        Args:
            text: either a string of space-separated words (the form produced
                by a Keras tokenizer's ``sequences_to_texts``) or an iterable
                of word tokens.
            max_text_len: number of rows in the result; longer texts are
                truncated and shorter ones are padded with zero vectors.

        Returns:
            Array whose i-th row is the embedding of the i-th word, or zeros
            when the word is unknown or the position is padding.
        """
        if isinstance(text, str):
            # Iterating a str would yield characters, not words.
            words = [w for w in text.split(' ') if w]
        else:
            words = list(text)
        words = words[:max_text_len]

        # float32 matches the stored vectors, so the stacked result is not
        # silently upcast to float64.
        empty_emb = np.zeros(self.embed_len, dtype='float32')
        embeds = [self.embeddings_index.get(word, empty_emb) for word in words]
        embeds += [empty_emb] * (max_text_len - len(embeds))

        return np.array(embeds)

    def sequences_to_vec(self, sequences, sequences_to_texts, max_text_len=30):
        """Convert integer sequences into a batch of embedding arrays.

        Args:
            sequences: token-id sequences accepted by ``sequences_to_texts``.
            sequences_to_texts: callable mapping ``sequences`` to a list of
                texts (one per sequence).
            max_text_len: number of word positions kept per text.

        Returns:
            Array stacking one ``text_to_vec`` result per text.
        """
        texts = sequences_to_texts(sequences)
        # Truncation happens on words inside text_to_vec, not on characters.
        vectors = [self.text_to_vec(text, max_text_len) for text in tqdm(texts)]

        return np.array(vectors)

# The Main Part

#### Load and transform the data

In [47]:
# Paths of the CSV files read by DataLoadTransformer.load (keys 'train' and 'test').
file_names = {'train': 'data/train.csv', 'test': 'data/test.csv'}
max_features = 950000 # vocabulary cap, forwarded to the Keras Tokenizer as num_words
maxlen = 70 # length every question is padded/truncated to (pad_sequences maxlen)

# Read both files, then tokenize, pad and shuffle. The fourth return value is the
# tokenizer's sequences_to_texts method, used later to map ids back to words.
dlt = DataLoadTransformer()
dlt.load(file_names)
train_X, test_X, train_y, sequences_to_texts = dlt.transform(max_features=max_features, max_len=maxlen)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)
[677319 802133 357211 ...  28030 277869 773630]
Data transformed.


In [48]:
from sklearn.model_selection import train_test_split

# Hold out a fraction of the training data for validation. `stratify` keeps the
# target class ratio identical in both parts; `random_state` pins the split so a
# re-run yields the same partition.
# NOTE: this cell overwrites train_X / train_y, so running it twice without
# re-running the transform cell splits the already-reduced training set again.
val_size = 0.1
train_X, val_X, train_y, val_y = train_test_split(
    train_X, train_y, test_size=val_size, stratify=train_y, random_state=dlt.seed
)

#### Load embeddings and transform the inputs to vectors

In [49]:
# Text file of pre-trained word vectors (one "word c1 c2 ..." entry per line).
embedding_file_name = 'data/embeddings/glove.840B.300d/glove.840B.300d.txt'
# Parse the file into emb.embeddings_index (word -> float32 vector) and record
# the vector length in emb.embed_len. The recorded run below took close to
# three minutes for about 2.2M lines.
emb = Embeddings()
emb.load(embedding_file_name)

2196017it [02:45, 13301.19it/s]


In [50]:
# Number of positions kept per question when building embedding arrays.
# NOTE(review): the sequences were padded to maxlen = 70 earlier, but only 30
# positions are kept here — confirm the mismatch is intended.
max_text_len = 30
# Map the validation id sequences back to text and then to embedding vectors.
vec_val_X = emb.sequences_to_vec(val_X, sequences_to_texts, max_text_len)

100%|██████████| 130613/130613 [00:16<00:00, 7925.82it/s] 


#### Batch generator

In [None]:
def batch_gen(X, y, batch_size, seed=123):
    import math
    n_batches = math.ceil(len(X) / batch_size)
    while True:
        ## Shufflee the data
        np.random.seed(seed)
        permuted_idx = np.random.permutation(len(X))
        X, y = X[permuted_idx], y[permuted_idx]
        
    