In [1]:
import re
import gc
import pickle
import numpy as np
import pandas as ps
from tqdm import tqdm
from pathlib import Path
from itertools import chain
from collections import Counter


import matplotlib.pyplot as plt
import seaborn as sbn
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

In [2]:
data_dir = Path('..') / 'data'
embeddings_dir = data_dir / 'embeddings'

In [3]:
!tree ../data

[01;34m../data[00m
├── [01;34membeddings[00m
│   ├── crawl-300d-2M.pkl
│   ├── [01;31mcrawl-300d-2M.pkl.zip[00m
│   ├── glove.840B.300d.pkl
│   ├── [01;31mglove.840B.300d.pkl.zip[00m
│   └── glove_crawl_emb.pkl
├── sample_submission.csv
├── test.csv
├── test.pkl
├── train.csv
├── [01;31mtrain.csv.zip[00m
├── train.pkl
└── valid.pkl

1 directory, 12 files


## Embedings

Embedings downloaded from this kernel - [**Quest Q&A - LSTM Inference Only**](https://www.kaggle.com/chanhu/quest-q-a-lstm-inference-baseline).

In [4]:
train = ps.read_csv(data_dir / 'train.csv')
print(train.shape)

(6079, 41)


In [5]:
train.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfa

In [6]:
test = ps.read_csv(data_dir / 'test.csv')
print(test.shape)

(476, 11)


In [7]:
test.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host'],
      dtype='object')

In [8]:
targets = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written'    
]

text_columns = [
    'question_title', 
    'question_body', 
    'answer'
]

In [9]:
def get_coefs(word: str, *arr):
    return word, np.asarray(arr, dtype='float32')

In [10]:
def load_embeddings(path: str):
    with open(path,'rb') as f:
        emb_arr = pickle.load(f)
    return emb_arr

In [11]:
def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index), 300))
    unknown_words = []
    
    for word, i in word_index.items():
        if i <= len(word_index):
            try:
                embedding_matrix[i] = embedding_index[word]
            except KeyError:
                try:
                    embedding_matrix[i] = embedding_index[word.lower()]
                except KeyError:
                    try:
                        embedding_matrix[i] = embedding_index[word.title()]
                    except KeyError:
                        unknown_words.append(word)
                        
    return embedding_matrix, unknown_words

In [12]:
remove_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
number_pattern = re.compile("(\d+)")

def tokenize(s: str, lower: bool = True, split: str = " ") -> list:
    if lower:
        s = s.lower()
    # replacing part
    s = number_pattern.sub(" \1 ", s)
    for c in remove_chars:
        s = s.replace(c, split)
        
    return s.split(split)
    

def build_vocab(train_data, test_data, fields):
    word_counter = Counter()
    for f in fields:
        for d in (train_data, test_data):
            word_counter.update(chain.from_iterable([tokenize(s) for s in d[f].values]))
    
    words = sorted([w for w, cnt in word_counter.most_common()])
    return words

In [13]:
%%time

words = build_vocab(train, test, text_columns)
len(words)

CPU times: user 608 ms, sys: 16.1 ms, total: 624 ms
Wall time: 624 ms


51838

In [14]:
idx2word = ["<space>"] + words
word2idx = {w: idx for idx, w in enumerate(idx2word)}
len(word2idx)

51839

In [15]:
from functools import partial

def str2tokens(s: str, t2i: dict) -> list:
    tokens = tokenize(s)
    return [t2i[t] for t in tokens]

s2t = partial(str2tokens, t2i=word2idx)

In [16]:
%%time

crawl_matrix, unknown_words_crawl = build_matrix(word2idx, embeddings_dir / 'crawl-300d-2M.pkl')
print('n unknown words (crawl): ', len(unknown_words_crawl))

n unknown words (crawl):  17180
CPU times: user 4 s, sys: 1.15 s, total: 5.15 s
Wall time: 5.15 s


In [17]:
%%time

glove_matrix, unknown_words_glove = build_matrix(word2idx, embeddings_dir / 'glove.840B.300d.pkl')
print('n unknown words (glove): ', len(unknown_words_glove))

n unknown words (glove):  17450
CPU times: user 4.21 s, sys: 744 ms, total: 4.96 s
Wall time: 4.96 s


In [18]:
embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
print(embedding_matrix.shape)

del crawl_matrix
del glove_matrix
gc.collect()

(51839, 600)


29

In [19]:
with open(embeddings_dir / 'glowe_crawl_emb.pkl', 'wb') as f:
    embedding_matrix.dump(f)

embedding_matrix.shape

(51839, 600)

In [20]:
tmp = np.load(embeddings_dir / 'glowe_crawl_emb.pkl', allow_pickle=True)

In [21]:
tmp.shape

(51839, 600)

In [22]:
for c in tqdm(text_columns):
    train[c] = train[c].apply(s2t)
    test[c] = test[c].apply(s2t)

100%|██████████| 3/3 [00:00<00:00,  5.24it/s]


In [23]:
test.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,"[49451, 25588, 10553, 26847, 4301, 47656, 2945...","[21754, 39739, 36116, 22707, 31608, 2093, 2141...",Dylan,https://gaming.stackexchange.com/users/64471,"[45128, 23667, 30448, 10058, 17727, 25588, 105...",Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,"[47699, 26099, 45666, 16798, 22124, 22373, 450...","[21754, 3435, 30141, 45666, 49724, 1, 21754, 2...",Anu,https://wordpress.stackexchange.com/users/72927,"[21754, 45220, 23837, 23667, 34515, 49581, 113...",Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"[23667, 2292, 1, 36818, 31872, 7088, 42356, 28...","[45666, 16200, 21754, 42691, 1980, 6626, 18488...",Konsta,https://gaming.stackexchange.com/users/37545,"[50462, 13676, 30686, 20612, 4286, 22373, 4503...",Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,"[43540, 19701, 3568, 21754, 31068, 15613, 1814...","[21754, 20612, 47760, 29459, 36420, 33727, 441...",robbannn,https://raspberrypi.stackexchange.com/users/17341,"[50474, 49226, 13060, 20459, 14096, 23667, 132...",HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,"[32907, 29761, 1, 1, 17466, 6524, 32907, 31610...","[21754, 20612, 6653, 12375, 26502, 38019, 1746...",Amit,https://travel.stackexchange.com/users/29089,"[21754, 7404, 46652, 33439, 49347, 49732, 1772...",Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


In [24]:
# train_df = train
# valid_df = train.sample(frac=.2)

train_df, valid_df = train_test_split(train, test_size=0.1, random_state=2019)

print(train_df.shape)
print(valid_df.shape)

(5471, 41)
(608, 41)


In [25]:
results_dir = data_dir


with open(results_dir / 'train.pkl', 'wb') as f:
    pickle.dump(train_df, f)
    
    
with open(results_dir / 'valid.pkl', 'wb') as f:
    pickle.dump(valid_df, f)
    
    
with open(results_dir / 'test.pkl', 'wb') as f:
    pickle.dump(test, f)