# Word Embeddings
The word-embedding binaries can be downloaded from the competition page, along with the data: https://www.kaggle.com/c/quora-insincere-questions-classification

In [3]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

import numpy as np
import pandas as pd
from IPython.display import display
pd.set_option('display.max_colwidth', -1)
from tqdm import tqdm
tqdm.pandas()
from functools import partial
from collections import defaultdict

from gensim.models import KeyedVectors
from nltk import word_tokenize
from ecprocessing import text_preprocessing as txp

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics




In [7]:
def threshold_search(y_true, y_proba):
    '''Scan the cut-offs 0.00, 0.01, ..., 0.99 and keep the one with the best F1.

    A sample is labelled positive when its entry in ``y_proba`` is strictly
    greater than the cut-off. On ties the lowest cut-off wins, and when no
    cut-off scores above 0 the result stays at threshold 0 with f1 0.

    Returns a dict with the keys 'threshold' and 'f1'.
    '''
    best = {'threshold': 0, 'f1': 0}
    for step in range(100):
        cutoff = step * 0.01
        f1 = metrics.f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        # strict comparison: a later cut-off must beat, not match, the current best
        if f1 > best['f1']:
            best = {'threshold': cutoff, 'f1': f1}
    return best

## Import data and preprocess

In [8]:
## Load in data
train_df = pd.read_csv('../../data/train.csv')
test_df = pd.read_csv('../../data/test.csv')

# The question text is the model input; `target` is read from the training frame as the label.
X_train, y_train = train_df['question_text'], train_df['target']
X_test = test_df['question_text']

In [9]:
## Preprocess
# Wrap txp.normalize_text so it can be applied element-wise to a pandas Series
# through the sklearn transformer interface.
normalize_text_lem = partial(txp.normalize_text, text_stemming='Lem')
text_normalizer_lem = FunctionTransformer(
    func=lambda docs: docs.apply(normalize_text_lem),
    validate=False,
)
# Uses txp.normalize_text with its own defaults (presumably stemming - confirm in ecprocessing).
text_normalizer_stem = FunctionTransformer(
    func=lambda docs: docs.apply(txp.normalize_text),
    validate=False,
)


In [11]:
#X_train_norm = text_normalizer_lem.fit_transform(X_train.fillna('NA'))
#X_test_norm = text_normalizer_lem.fit_transform(X_test)

In [12]:
%%time
# Normalise the training questions with text_normalizer_stem.
# The recorded run of this cell took several minutes (see the wall time below).
X_train_norm_s = text_normalizer_stem.fit_transform(X_train)
#X_test_norm_s = text_normalizer_stem.fit_transform(X_test)

Wall time: 8min 46s


## Word Embedding

In [4]:
# Load the pre-trained word2vec vectors (binary format) through gensim's KeyedVectors
EMBEDDING_FILE = '../../data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
wv = KeyedVectors.load_word2vec_format(fname=EMBEDDING_FILE, binary=True)

In [3]:
# Sanity check: nearest neighbours of 'hello', shown as (word, similarity) pairs
wv.most_similar('hello')

[('hi', 0.6548984050750732),
 ('goodbye', 0.639905571937561),
 ('howdy', 0.6310957074165344),
 ('goodnight', 0.5920578241348267),
 ('greeting', 0.5855877995491028),
 ('Hello', 0.5842196941375732),
 ("g'day", 0.5754078030586243),
 ('See_ya', 0.5688872337341309),
 ('ya_doin', 0.5643120408058167),
 ('greet', 0.5636603832244873)]

In [5]:
# Odd-one-out query over a small word list
print(wv.doesnt_match("breakfast cereal dinner lunch".split()))

cereal


In [7]:
# Analogy-style query: words similar to 'woman' and 'king' but dissimilar to 'man'
wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431607246399),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [5]:
# Look up the raw embedding vector stored for a single word
wv['hello']

array([-0.05419922,  0.01708984, -0.00527954,  0.33203125, -0.25      ,
       -0.01397705, -0.15039062, -0.265625  ,  0.01647949,  0.3828125 ,
       -0.03295898, -0.09716797, -0.16308594, -0.04443359,  0.00946045,
        0.18457031,  0.03637695,  0.16601562,  0.36328125, -0.25585938,
        0.375     ,  0.171875  ,  0.21386719, -0.19921875,  0.13085938,
       -0.07275391, -0.02819824,  0.11621094,  0.15332031,  0.09082031,
        0.06787109, -0.0300293 , -0.16894531, -0.20800781, -0.03710938,
       -0.22753906,  0.26367188,  0.012146  ,  0.18359375,  0.31054688,
       -0.10791016, -0.19140625,  0.21582031,  0.13183594, -0.03515625,
        0.18554688, -0.30859375,  0.04785156, -0.10986328,  0.14355469,
       -0.43554688, -0.0378418 ,  0.10839844,  0.140625  , -0.10595703,
        0.26171875, -0.17089844,  0.39453125,  0.12597656, -0.27734375,
       -0.28125   ,  0.14746094, -0.20996094,  0.02355957,  0.18457031,
        0.00445557, -0.27929688, -0.03637695, -0.29296875,  0.19

## Feature extraction

In [14]:
class EmbeddingVectorizer(BaseEstimator, TransformerMixin):
    '''Turn each document into one fixed-length vector built from word embeddings.

    Every token of a document is looked up in ``wv``. The vectors that are
    found are multiplied by a per-token weight, summed, and divided by the
    number of tokens that were found. Tokens missing from ``wv`` are skipped;
    a document without any known token maps to the all-zero vector.

    Parameters
    ----------
    wv : word-embedding lookup
        Must expose ``vector_size`` and support ``wv[word]``, raising
        ``KeyError`` for unknown words (e.g. gensim ``KeyedVectors``).
    weighted_vec : bool, default True
        If True, ``fit`` learns IDF weights with a ``TfidfVectorizer`` and each
        word vector is multiplied by the IDF of its token; otherwise every
        weight is 1.
    max_df, min_df
        Passed through unchanged to ``TfidfVectorizer``.

    Attributes set by ``fit`` (only when ``weighted_vec`` is True)
    ----------
    tfidf_vec : the fitted ``TfidfVectorizer``
    tfidf_dict : plain dict mapping each vocabulary term to its IDF
    max_idf_ : largest IDF seen, used as the weight of out-of-vocabulary tokens
    '''

    def __init__(self, wv, weighted_vec=True, max_df=1., min_df=1):
        self.wv = wv
        self.weighted_vec = weighted_vec
        self.max_df = max_df
        self.min_df = min_df
        self.dim = wv.vector_size

    def fit(self, X, y=None):
        '''Learn the IDF weights from ``X`` when ``weighted_vec`` is set.

        ``y`` is ignored; it is accepted so the transformer also works when a
        target is passed along, e.g. as a step of a scikit-learn ``Pipeline``.
        Returns ``self``.
        '''
        if self.weighted_vec:
            self.tfidf_vec = TfidfVectorizer(max_df=self.max_df, min_df=self.min_df)
            self.tfidf_vec.fit(X)
            idf = self.tfidf_vec.idf_
            # if a word was never seen - it must be at least as infrequent
            # as any of the known words - so the default idf is the max of
            # known idf's
            self.max_idf_ = idf.max()
            # vocabulary_ maps term -> column index into idf_. It is used
            # instead of get_feature_names(), which newer scikit-learn
            # releases no longer provide.
            # A plain dict (rather than a defaultdict with a lambda factory)
            # keeps the fitted transformer picklable and is not mutated by
            # look-ups of unknown tokens.
            self.tfidf_dict = {
                term: idf[col] for term, col in self.tfidf_vec.vocabulary_.items()
            }
        return self

    def DocToWordVector(self, doc):
        '''Return the (1, dim) embedding of a single document string.'''
        tokens = word_tokenize(doc)
        vec = np.zeros((1, self.dim))
        count = 0.

        for word in tokens:
            try:
                word_vec = self.wv[word]
            except KeyError:      # handling the case where the token is not
                                  # in the word embedding
                continue
            if self.weighted_vec:
                # tokens unknown to the tf-idf vocabulary get the largest idf
                weight = self.tfidf_dict.get(word, self.max_idf_)
            else:
                weight = 1
            vec += word_vec.reshape((1, self.dim)) * weight
            count += 1.

        # normalise by the number of embedded tokens (not by the sum of weights)
        if count != 0:
            vec = vec / count
        return vec

    def transform(self, X):
        '''Embed every document of ``X``; returns an array of shape (len(X), dim).'''
        doc_vectors = [self.DocToWordVector(doc) for doc in X]
        if not doc_vectors:
            # np.vstack raises on an empty sequence; return an empty matrix instead
            return np.empty((0, self.dim))
        return np.vstack(doc_vectors)
        

In [15]:
# Can take a while to run
%%time
embed_transformer = EmbeddingVectorizer(wv, weighted_vec=True)
X_train_wv = embed_transformer.fit_transform(X_train_norm_s)

Wall time: 4min 17s


## Modelling
### Logistic Regression

In [16]:
# Train test split
# Hold out 25% of the rows, stratified on the label, with a fixed seed for repeatability.
X_train_mdl, X_test_mdl, y_train_mdl, y_test_mdl = train_test_split(
    X_train_wv,
    y_train,
    test_size=0.25,
    random_state=42,
    stratify=y_train,
)

In [17]:
%%time
# Train model
# LogisticRegression() is created without arguments, so every hyper-parameter
# is whatever the installed scikit-learn version uses by default.
clf = LogisticRegression()
clf.fit(X_train_mdl, y_train_mdl)



Wall time: 1min 55s


In [18]:
# Hard-label predictions on the held-out split, then confusion matrix and F1
test_predictions = clf.predict(X_test_mdl)
print(metrics.confusion_matrix(y_true=y_test_mdl, y_pred=test_predictions))
print(metrics.f1_score(y_true=y_test_mdl, y_pred=test_predictions))

[[303637   2691]
 [ 17763   2440]]
0.19262650982868873


In [19]:
# Get class probabilities for the held-out split and scan cut-offs on the
# second probability column for the best F1.
predictions_proba = clf.predict_proba(X_test_mdl)
search_result = threshold_search(y_true=y_test_mdl, y_proba=predictions_proba[:, 1])
print(search_result)

{'threshold': 0.14, 'f1': 0.41263830103215265}


This doesn't perform very well. However, we now have an input size of only 300, so we can turn to deep learning to improve performance.