In [1]:
import re
import gc
import pickle
import numpy as np
import pandas as ps
from tqdm import tqdm
from pathlib import Path
from itertools import chain
from collections import Counter


import matplotlib.pyplot as plt
import seaborn as sbn
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

In [2]:
# Root folder of the competition data (relative path, resolved against the
# working directory the notebook is run from).
data_dir = Path('../data')
# Sub-folder holding the pickled pre-trained word vectors.
embeddings_dir = data_dir.joinpath('embeddings')

In [3]:
# Show the on-disk layout of the data folder (shell escape; needs the `tree` command to be installed).
!tree ../data

[01;34m../data[00m
├── [01;34membeddings[00m
│   ├── crawl-300d-2M.pkl
│   ├── [01;31mcrawl-300d-2M.pkl.zip[00m
│   ├── glove.840B.300d.pkl
│   ├── [01;31mglove.840B.300d.pkl.zip[00m
│   └── glove_crawl_emb.pkl
├── sample_submission.csv
├── test.csv
├── test.pkl
├── train.csv
├── [01;31mtrain.csv.zip[00m
├── train.pkl
├── valid.pkl
├── vocab.pkl
└── vocab.txt

1 directory, 14 files


## Embeddings

The embeddings were downloaded from this kernel: [**Quest Q&A - LSTM Inference Only**](https://www.kaggle.com/chanhu/quest-q-a-lstm-inference-baseline).

In [4]:
# Load the training CSV into a DataFrame (`ps` is this notebook's pandas alias)
# and print its (rows, columns) shape.
train = ps.read_csv(data_dir / 'train.csv')
print(train.shape)

(6079, 41)


In [5]:
# List the column names of the training frame.
train.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfa

In [6]:
# Load the test CSV into a DataFrame (`ps` is this notebook's pandas alias)
# and print its (rows, columns) shape.
test = ps.read_csv(data_dir / 'test.csv')
print(test.shape)

(476, 11)


In [7]:
# List the column names of the test frame.
test.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host'],
      dtype='object')

In [8]:
# Names of the 30 label columns: 21 describe the question (`question_*`)
# and 9 describe the answer (`answer_*`).
targets = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written'    
]

# Free-text columns; these are the fields used to build the vocabulary and
# that are later converted to token-id lists.
text_columns = [
    'question_title', 
    'question_body', 
    'answer'
]

In [9]:
def get_coefs(word: str, *arr):
    """Pair a word with its coefficients packed into a float32 array.

    Args:
        word: The token the coefficients belong to.
        *arr: The individual coefficient values, passed positionally.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a NumPy array of
        dtype ``float32`` built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [10]:
def load_embeddings(path: str):
    """Unpickle and return the object stored at ``path``.

    Args:
        path: Location of the pickle file; anything accepted by ``open``
            works (the notebook passes ``pathlib.Path`` objects).

    Returns:
        Whatever object the pickle contains. ``build_matrix`` indexes the
        result by word, so it is expected to be a word -> vector mapping.

    Note:
        Unpickling can execute arbitrary code, so only load files from a
        trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [11]:
def build_matrix(word_index, path, dim: int = 300):
    """Build an embedding matrix whose rows follow ``word_index``.

    For every word the lookup tries, in order, the word as written, its
    lower-cased form and its title-cased form; the first spelling found in
    the embedding file supplies the row. Words with no match keep an
    all-zero row and are reported back to the caller.

    Args:
        word_index: Mapping ``word -> row index``. The matrix gets
            ``len(word_index)`` rows, so indices are expected to lie in
            ``0 .. len(word_index) - 1``.
        path: Location of the pickled embeddings, loaded through
            ``load_embeddings`` and indexed by word.
        dim: Length of each embedding vector (defaults to 300, the value
            that used to be hard-coded).

    Returns:
        ``(embedding_matrix, unknown_words)``: a ``(len(word_index), dim)``
        float array and the list of words that had no embedding.
    """
    embedding_index = load_embeddings(path)
    n_rows = len(word_index)
    embedding_matrix = np.zeros((n_rows, dim))
    unknown_words = []

    for word, i in word_index.items():
        # Skip indices that do not fit in the matrix. The previous guard was
        # `i <= len(word_index)`, which let `i == len(word_index)` through
        # and then failed with IndexError on the assignment below.
        if i >= n_rows:
            continue

        # Try progressively normalised spellings; stop at the first hit.
        for candidate in (word, word.lower(), word.title()):
            try:
                embedding_matrix[i] = embedding_index[candidate]
            except KeyError:
                continue
            break
        else:
            # None of the spellings is present in the embeddings.
            unknown_words.append(word)

    return embedding_matrix, unknown_words

In [12]:
# Characters that `tokenize` surrounds with spaces so each one becomes a
# stand-alone token: ASCII punctuation, whitespace controls ('\n', '\t',
# '\xa0', ...), typographic and box-drawing symbols.
# NOTE(review): the list also contains accented letters ('Â', 'à', 'â', 'é',
# 'è', 'Ã', 'ï', 'Ø'), so words containing them are split at those letters —
# confirm this is intended.
separate_chars = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', 
    '|', ';', "'", '$', '&', '/', '[', ']', '>', 
    '%', '=', '#', '*', '+', '\\', '•',  '~', '@', 
    '£', '·', '_', '{', '}', '©', '^', '®', '`',
    '<', '→', '°', '€', '™', '›',  '♥', '←', '×', 
    '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', 
    '\xa0', '\t', '“', '★', '”', '–', '●', 'â', 
    '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±',
    '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—',
    '‹', '─', '\u3000', '\u202f', '▒', '：', '¼', 
    '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', 
    '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', 
    '¾', 'Ã', '⋅', '‘', '∞', '«', '∙', '）', '↓', 
    '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', 
    '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', 
    '¹', '≤', '‡', '√', 
]
# mispell_dict = {
#     "aren't": "are not",
#     "can't": "cannot",
#     "couldn't": "could not",
#     "couldnt": "could not",
#     "didn't": "did not",
#     "doesn't": "does not",
#     "doesnt": "does not",
#     "don't": "do not",
#     "hadn't": "had not",
#     "hasn't": "has not",
#     "haven't": "have not",
#     "havent": "have not",
#     "he'd": "he would",
#     "he'll": "he will",
#     "he's": "he is",
#     "i'd": "I would",
#     "i'd": "I had",
#     "i'll": "I will",
#     "i'm": "I am",
#     "isn't": "is not",
#     "it's": "it is",
#     "it'll":"it will",
#     "i've": "I have",
#     "let's": "let us",
#     "mightn't": "might not",
#     "mustn't": "must not",
#     "shan't": "shall not",
#     "she'd": "she would",
#     "she'll": "she will",
#     "she's": "she is",
#     "shouldn't": "should not",
#     "shouldnt": "should not",
#     "that's": "that is",
#     "thats": "that is",
#     "there's": "there is",
#     "theres": "there is",
#     "they'd": "they would",
#     "they'll": "they will",
#     "they're": "they are",
#     "theyre":  "they are",
#     "they've": "they have",
#     "we'd": "we would",
#     "we're": "we are",
#     "weren't": "were not",
#     "we've": "we have",
#     "what'll": "what will",
#     "what're": "what are",
#     "what's": "what is",
#     "what've": "what have",
#     "where's": "where is",
#     "who'd": "who would",
#     "who'll": "who will",
#     "who're": "who are",
#     "who's": "who is",
#     "who've": "who have",
#     "won't": "will not",
#     "wouldn't": "would not",
#     "you'd": "you would",
#     "you'll": "you will",
#     "you're": "you are",
#     "you've": "you have",
#     "'re": " are",
#     "wasn't": "was not",
#     "we'll": " will",
#     "didn't": "did not",
#     "tryin'": "trying"
# }
# Matches a run of one or more digits (captured as group 1).
# Written as a raw string: "\d" inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
# NOTE(review): nothing visible in this notebook uses this pattern — confirm it is still needed.
number_pattern = re.compile(r"(\d+)")

def tokenize(s: str, lower: bool = False, split: str = " ") -> list:
    """Split a raw string into tokens.

    Steps, in order:
      1. Optionally lower-case the text.
      2. Mask digit runs by length: 5 or more digits -> '#####', then
         4 -> '####', 3 -> '###', 2 -> '##' (single digits are kept).
      3. Surround every character from ``separate_chars`` with spaces.
      4. Split on ``split``.

    Args:
        s: Text to tokenize.
        lower: Lower-case the text first when true.
        split: Delimiter handed to ``str.split``.

    Returns:
        List of tokens. Because the split uses an explicit delimiter,
        consecutive delimiters produce empty-string tokens. If '#' is one
        of ``separate_chars``, the digit masks from step 2 end up as
        individual '#' tokens.
    """
    text = s.lower() if lower else s

    # Longest masks first, so shorter patterns only see what is left over.
    digit_masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in digit_masks:
        text = re.sub(pattern, mask, text)

    # Isolate punctuation / symbols as their own tokens.
    for char in separate_chars:
        text = text.replace(char, f" {char} ")

    # (Contraction expansion via `mispell_dict` is currently disabled.)

    return text.split(split)
    

def build_vocab(train_data, test_data, fields, size: int = None):
    """Collect the vocabulary used in the given text fields of both datasets.

    Args:
        train_data: Frame-like object; ``train_data[field].values`` must
            yield the strings of that field.
        test_data: Same structure as ``train_data``.
        fields: Names of the text fields to scan.
        size: Keep only the ``size`` most frequent tokens; ``None`` keeps
            all of them.

    Returns:
        The selected tokens as a lexicographically sorted list.
    """
    frequencies = Counter()
    for field in fields:
        for dataset in (train_data, test_data):
            for text in dataset[field].values:
                frequencies.update(tokenize(text))

    # Select by frequency first, then return in sorted order.
    vocabulary = [token for token, _ in frequencies.most_common(size)]
    vocabulary.sort()
    return vocabulary

In [13]:
%%time

# Vocabulary over every text column of both train and test (no size cap);
# the cell displays the number of distinct tokens.
words = build_vocab(train, test, text_columns)
len(words)

CPU times: user 1.58 s, sys: 15.5 ms, total: 1.59 s
Wall time: 1.59 s


60666

In [14]:
# Peek at the first 100 entries of the sorted vocabulary.
' '.join(words[:100])

' \t \n ! " # $ % & \' ( ) * + , - . / 0 0A 0AAK0AAL0AAM 0B 0C 0CAYQ 0Eh 0FC8 0Ghz 0L 0V 0a 0ac8 0ace 0b 0c 0d 0d0b7db 0dp 0ex 0f 0ghz 0i 0j 0jans 0jbWHZ 0l 0m 0mA 0mm 0pt 0px 0s 0sp 0th 0ubuntu1 0ubuntu2 0ubuntu9 0v 0woc6xL3 0x 0x0 0x0C 0x0E 0x0a 0x0b 0x0e 0x0f 0x1 0x1a 0x1b 0x1c 0x1d 0x1d6b 0x1e 0x1fe 0x2 0x2f 0x2f8 0x3 0x3C 0x3F 0x3c0 0x4 0x4A4D 0x4c 0x5 0x5E 0x6 0x6B 0x7 0x7F 0x7f8e3b 0x7fa 0x8 0x8b 0x9 0x9E 0xAB 0xB 0xBC 0xC'

In [15]:
# Ids 0 and 1 are reserved for the special tokens "<space>" and "<unk>";
# the vocabulary words follow in their sorted order.
idx2word = ["<space>", "<unk>", *words]
# Inverse lookup: token -> position in idx2word.
word2idx = dict(zip(idx2word, range(len(idx2word))))
print('Number of known tokens -', len(word2idx))

Number of known tokens - 60668


In [16]:
from functools import partial

def str2tokens(s: str, t2i: dict) -> list:
    """Convert a string into a list of token ids.

    Args:
        s: Text to encode; it is split with ``tokenize`` using its defaults.
        t2i: Mapping ``token -> id``. It must contain ``"<unk>"`` whenever
            the text holds a token missing from the mapping.

    Returns:
        One id per token, with unknown tokens mapped to ``t2i["<unk>"]``.
    """
    ids = []
    for token in tokenize(s):
        key = token if token in t2i else "<unk>"
        ids.append(t2i[key])
    return ids

# Single-argument encoder bound to this notebook's vocabulary, convenient for `Series.apply`.
s2t = partial(str2tokens, t2i=word2idx)

In [17]:
%%time

# Embedding rows for the vocabulary taken from the pickled crawl-300d-2M vectors,
# plus the words that had no vector under any tried spelling.
crawl_matrix, unknown_words_crawl = build_matrix(word2idx, embeddings_dir / 'crawl-300d-2M.pkl')
print('n unknown words (crawl): ', len(unknown_words_crawl))

n unknown words (crawl):  12190
CPU times: user 4.06 s, sys: 1.09 s, total: 5.15 s
Wall time: 5.15 s


In [18]:
%%time

# Same as above, using the pickled glove.840B.300d vectors.
glove_matrix, unknown_words_glove = build_matrix(word2idx, embeddings_dir / 'glove.840B.300d.pkl')
print('n unknown words (glove): ', len(unknown_words_glove))

n unknown words (glove):  12127
CPU times: user 4.36 s, sys: 624 ms, total: 4.99 s
Wall time: 4.99 s


In [None]:
%%time

# NOTE(review): this cell repeats the GloVe build of the preceding cell verbatim
# and has no execution count — it looks like a leftover duplicate; confirm it can be removed.
glove_matrix, unknown_words_glove = build_matrix(word2idx, embeddings_dir / 'glove.840B.300d.pkl')
print('n unknown words (glove): ', len(unknown_words_glove))

In [19]:
# Join the two matrices along the last axis, so each row holds the crawl
# vector followed by the GloVe vector for the same word.
embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
# embedding_matrix = crawl_matrix + glove_matrix
print(embedding_matrix.shape)

# Release the two source matrices. This makes the cell non-re-runnable
# without first re-executing the two build cells above.
del crawl_matrix
del glove_matrix
gc.collect()

(60668, 600)


29

In [20]:
# Persist the combined matrix. `ndarray.dump` writes a pickle of the array,
# which is why it is read back with `np.load(..., allow_pickle=True)`.
with open(embeddings_dir / 'glove_crawl_emb.pkl', 'wb') as f:
    embedding_matrix.dump(f)

embedding_matrix.shape

(60668, 600)

In [21]:
# Round-trip check: read the file written above (it is a pickle, hence allow_pickle=True).
tmp = np.load(embeddings_dir / 'glove_crawl_emb.pkl', allow_pickle=True)

In [22]:
# The reloaded array should have the same shape as the matrix that was saved.
tmp.shape

(60668, 600)

In [23]:
# Replace each text column, in both frames, with lists of token ids.
# The columns are overwritten in place, so this cell must run exactly once
# per kernel session: a second run would pass lists instead of strings to `s2t`.
for c in tqdm(text_columns):
    train[c] = train[c].apply(s2t)
    test[c] = test[c].apply(s2t)

100%|██████████| 3/3 [00:01<00:00,  1.84it/s]


In [24]:
# Gather the hosts seen in either split so train and test share one index space.
hosts_seen = set(train['host'].unique()) | set(test['host'].unique())
unique_hosts = sorted(hosts_seen)

# index -> host is the sorted list itself; host -> index is its inverse.
idx2host = unique_hosts
host2idx = dict(zip(unique_hosts, range(len(unique_hosts))))

len(host2idx)

64

In [25]:
# Gather the categories seen in either split so train and test share one index space.
categories_seen = set(train['category'].unique()) | set(test['category'].unique())
unique_categories = sorted(categories_seen)

# index -> category is the sorted list itself; category -> index is its inverse.
idx2category = unique_categories
category2idx = dict(zip(unique_categories, range(len(unique_categories))))

len(category2idx)

5

In [26]:
# Replace the `host` and `category` strings with single-element lists holding
# their integer index. Both frames are modified in place, so this cell must
# run exactly once per kernel session (a second run would look up lists
# instead of strings).
for d in tqdm((train, test)):
    d['host'] = d['host'].apply(lambda item: [host2idx[item]])
    
for d in tqdm((train, test)):
    d['category'] = d['category'].apply(lambda item: [category2idx[item]])

100%|██████████| 2/2 [00:00<00:00, 259.70it/s]
100%|██████████| 2/2 [00:00<00:00, 873.27it/s]


In [27]:
# Preview the encoded test frame: the text columns now hold token-id lists,
# and `category` / `host` hold single-element index lists.
test.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,"[22738, 41113, 29867, 42017, 25131, 57850, 438...","[9961, 51742, 48948, 19, 39151, 45399, 23474, ...",Dylan,https://gaming.stackexchange.com/users/64471,"[20809, 39953, 44606, 29469, 35463, 41113, 298...",Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,[0],[24]
1,46,"[21777, 41501, 56312, 34764, 38639, 38845, 558...","[9961, 24487, 44399, 56312, 22846, 18, 2, 3836...",Anu,https://wordpress.stackexchange.com/users/72927,"[9961, 55971, 40059, 39953, 47662, 59243, 3049...",Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,[4],[63]
2,70,"[10779, 23635, 16, 2, 49480, 45565, 27160, 537...","[20980, 34250, 9961, 53931, 23367, 26823, 3602...",Konsta,https://gaming.stackexchange.com/users/37545,"[23212, 32311, 44784, 37596, 25121, 38845, 558...",Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,[0],[24]
3,132,"[19978, 36922, 24577, 9961, 19, 14397, 33796, ...","[9961, 37596, 57926, 43899, 16976, 15772, 2521...",robbannn,https://raspberrypi.stackexchange.com/users/17341,"[23216, 22644, 6109, 37498, 32628, 39953, 3192...",HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,[4],[45]
4,200,"[15545, 13970, 2, 17, 2, 8033, 3056, 15545, 45...","[9961, 37596, 26850, 5888, 17, 12223, 50519, 3...",Amit,https://travel.stackexchange.com/users/29089,"[9961, 27424, 57063, 46818, 59080, 59344, 3546...",Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,[0],[58]


In [28]:
# train_df = train
# valid_df = train

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split reproducible across runs.
train_df, valid_df = train_test_split(train, test_size=0.2, random_state=2019)

print(train_df.shape)
print(valid_df.shape)

(4863, 41)
(1216, 41)


In [29]:
# The processed frames are written next to the raw data.
results_dir = data_dir

# Pickle each processed frame under its own file name.
for file_name, frame in (
    ('train.pkl', train_df),
    ('valid.pkl', valid_df),
    ('test.pkl', test),
):
    with open(results_dir / file_name, 'wb') as f:
        pickle.dump(frame, f)

In [30]:
# Bundle the three (index -> value, value -> index) lookup pairs in a fixed
# order: words, hosts, categories.
vocab_bundle = (
    (idx2word, word2idx),
    (idx2host, host2idx),
    (idx2category, category2idx),
)

with open(data_dir / 'vocab.pkl', 'wb') as f:
    pickle.dump(vocab_bundle, f)