# Feature engineering on strings

In [4]:
import pandas as pd
import string
import operator
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import warnings
import gensim
from gensim.models import FastText
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import gensim.corpora as corpora

In [2]:
data = pd.read_csv("train.csv",header=None,low_memory=False)
data_test = pd.read_csv("test.csv",header=None,low_memory=False)

In [3]:
sentences = data[1][1:]
labels = data[2][1:]
sentences_test = []
translator = str.maketrans('', '', string.punctuation)

# Some analysis

In [7]:
vec = CountVectorizer().fit(sentences)
bag_of_words = vec.transform(sentences)
sum_words = bag_of_words.sum(axis=0) 
words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

In [11]:
for word, freq in words_freq[:10]:
    print("Word " + "'\033[1m'"+str(word) +"'\033[0m'" + " appears " + str(freq) + " times.")

Word ''the'' appears 665950 times.
Word ''what'' appears 471294 times.
Word ''is'' appears 443182 times.
Word ''to'' appears 408009 times.
Word ''in'' appears 378153 times.
Word ''of'' appears 333515 times.
Word ''how'' appears 290405 times.
Word ''and'' appears 257922 times.
Word ''do'' appears 253252 times.
Word ''are'' appears 243038 times.


In [7]:
frequencies = np.array(words_freq)
frequencies = [int(x) for x in frequencies[:,1]]
distinct_words = int(len(vec.vocabulary_.items())/2)
print("Total number of words in the dataset are " + 
      "\033[1m"+str(np.sum(frequencies)) +"\033[0m"+ " of which " + 
      "\033[1m"+str(distinct_words)+"\033[0m" + " are distinct.")

Total number of words in the dataset are 15999712 of which 97500 are distinct.


In [8]:
for word, freq in words_freq[-97500:-97490]:
    print("Word " + "\033[1m'"+str(word) +"'\033[0m" + " appears one time.")

Word 'uniersity' appears one time.
Word 'wheath' appears one time.
Word 'subjested' appears one time.
Word 'notafcation' appears one time.
Word 'faulds' appears one time.
Word 'abody' appears one time.
Word '15260' appears one time.
Word 'localbitcoins' appears one time.
Word 'issil' appears one time.
Word 'c720' appears one time.


In [10]:
sincere_question_count = (bag_of_words[0].todense())
sincere_question_words = np.argwhere(sincere_question_count>=1)
for word in sincere_question_words:
    print("("+vec.get_feature_names()[word[1]] + " " 
          + str(sincere_question_count[0,word[1]])+")", end="  ")
print("\n" + sentences[1]+"\n")
lbls = labels.values
insincere_example = np.argwhere(lbls == '1')[1,0]
insincere_question_count = (bag_of_words[insincere_example].todense())
insincere_question_words = np.argwhere(insincere_question_count>=1)
for word in insincere_question_words:
    print("("+vec.get_feature_names()[word[1]] + " " 
          + str(insincere_question_count[0,word[1]]) + ")", end="  ")
print("\n" + sentences[insincere_example+1])

(1960s 1)  (as 1)  (did 1)  (how 1)  (in 1)  (nation 1)  (nationalists 1)  (province 1)  (quebec 1)  (see 1)  (the 1)  (their 1)  
How did Quebec nationalists see their province as a nation in the 1960s?

(are 1)  (babies 3)  (dark 1)  (light 1)  (more 1)  (or 1)  (parents 1)  (skin 2)  (sweeter 1)  (their 1)  (to 1)  (which 1)  
Which babies are more sweeter to their parents? Dark skin babies or light skin babies?


# Training the model with bags of words

In [11]:
Y = data[2][1:]
Y = Y.values
vectorizer = CountVectorizer(min_df=200)
sentences = data[1][1:]
X = vectorizer.fit_transform(list(sentences))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
LR_model = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)
preds = LR_model.predict(X_test)
preds = [int(x) for x in preds]
y_test = [int(x) for x in y_test]

In [13]:
print("F1 Score on LR for bags of words: " + str(f1_score(y_test,preds)))

F1 Score on LR for bags of words: 0.5180409368081718


# Term frequency- inverse document frequency (TF-IDF)

### Motivation

Instead of using the frequency of a word (that is, the number of times it appears in a sentence) in a given document (in our case, a sentence), we are going to furtherly tune the parameters so that extremly frequent words throughout the whole set of documents - like the words "the", "what", "is" and so on, shown above to have very high frequencies - will be taken less into acount, as they do not tell much about the nature of a given question.
### $$tfidf(t,d,D) = tf(t,d)\cdot idf(t,D)$$
The above function multiplies the frequency of a word $t$ in the current document (sentence in our case) by the inverse frequency of that word in all the documents. These two functions, $tf$ and $idf$ are defined as:
### $$idf(t,D)=log\frac{N}{|\{d\in D : t\in d\}|}$$ 
with
 - $N$: number of documents in the corpus.
 - $|\{d \in D : t \in d\}|$: number of documents in the corpus in which t appears.
 and for the term-frequency fucntion
### $$tf(t,d) = 0.5 + 0.5 \cdot \frac{f_{t,d}}{max\{f_{t',d}:t'\in d\}}$$
Which is an augmented frequency defined as the raw frequency of a word in the given document, divided by the maximum frequency of any element in that document (although mostly useless in our case, since this adjustment is done to prevent bias towards longer documents, and all of our documents are roughly the same size, them being only one question each).

In [14]:
tfidf_vec = TfidfVectorizer(min_df = 200)
X = tfidf_vec.fit_transform(list(sentences))

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
LR_model = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)
preds = LR_model.predict(X_test)
preds = [int(x) for x in preds]
y_test = [int(x) for x in y_test]

In [16]:
print("F1 Score on LR for bags of words: " + str(f1_score(y_test,preds)))

F1 Score on LR for bags of words: 0.5232933579335793


# Word2Vec embeddings

### Skip gram and continuous bags of words

Summary of the Word2Vec methods:

#### Vocabulary
 - decide what vocabulary size (select words from the document based on their number of appearances)
 - for each word in the vocabulary, assign a unique index and with those indeces, create one_hot vectors for them $x=[0,0,...,1, ... 0]$ 


<img src="imgs/window.png"  style="width: 600px;">

#### Skip gram
 - Try predicting a context (words frequently found nearby), given a single word input.
 - Define a window size $n$ (for example, 5, as in the image below), and pick the words that are $\frac{n}{2}$ close to the word picked in a sentence (left and right)
 - Train a fully connected neural network with one hidden layer, predicting words found close to it. (For sentence "a b c d e", pick 'c' as the input and "a b d e" as the output).

<img src="imgs/SKIP_GRAM.png"  style="width: 400px;">

#### Continuous bags of words (CBOW)

 - similar to skip-gram, but "flip" the training inputs and outputs, i.e. predict a target output given context (words found close to it)

<img src="imgs/CBOW.png"  style="width: 500px;">

#### Gensim implementation: 

Edit the sentences (get rid of the punctuation, lower-case the strings and split them into words)

In [5]:
all_words = []
sentences = data[1][1:]
translator = str.maketrans('', '', string.punctuation)
embedding_sentences = []
for sentence in sentences:
    a = sentence.translate(translator).lower().split()
    embedding_sentences.append(a)
    all_words.extend(a)

Train a CBOW model, save it and use it after reloading to predict the most similar words in the dictionary to some words.

In [None]:
model = gensim.models.Word2Vec(
        embedding_sentences,
        size=300,
        window=5,
        min_count=80,
        seed=1,
        workers=10)
model.train(embedding_sentences, total_examples=len(embedding_sentences), epochs=10)

In [None]:
model.save("CBOW_model")

In [3]:
loaded_gbow_model  = gensim.models.Word2Vec.load("CBOW_model")

In [20]:
print(loaded_gbow_model.wv.similar_by_word("woman",5))
print(loaded_gbow_model.wv.similar_by_word("king",5))
print(loaded_gbow_model.wv.similar_by_word("dog",5))

[('man', 0.7577846050262451), ('girl', 0.7545576095581055), ('lady', 0.6998194456100464), ('guy', 0.6641252636909485), ('person', 0.6589857935905457)]
[('kings', 0.5375739336013794), ('emperor', 0.5216578841209412), ('queen', 0.5126441121101379), ('prince', 0.4900304079055786), ('julius', 0.48601898550987244)]
[('puppy', 0.7504028677940369), ('kitten', 0.6995419263839722), ('hamster', 0.6757329702377319), ('cat', 0.6668018102645874), ('pet', 0.63416588306427)]


Train a Skip-gram model, save it and use it after reloading to predict the most similar words in the dictionary to some words.

In [None]:
model = gensim.models.Word2Vec(
        embedding_sentences,
        size=300,
        window=5,
        seed=1,
        sg=1,
        min_count=80,
        workers=10)
model.train(embedding_sentences, total_examples=len(embedding_sentences), epochs=10)


In [None]:
model.save("SG_model")

In [21]:
load_sg_model = gensim.models.Word2Vec.load("SG_model")

In [22]:
print(load_sg_model.wv.similar_by_word("woman",5))
print(load_sg_model.wv.similar_by_word("king",5))
print(load_sg_model.wv.similar_by_word("dog",5))

[('man', 0.7577846050262451), ('girl', 0.7545576095581055), ('lady', 0.6998194456100464), ('guy', 0.6641252636909485), ('person', 0.6589857935905457)]
[('kings', 0.5375739336013794), ('emperor', 0.5216578841209412), ('queen', 0.5126441121101379), ('prince', 0.4900304079055786), ('julius', 0.48601898550987244)]
[('puppy', 0.7504028677940369), ('kitten', 0.6995419263839722), ('hamster', 0.6757329702377319), ('cat', 0.6668018102645874), ('pet', 0.63416588306427)]


### FastText
Extension of the Word2Vec models. The idea behind FastText is to use n-grams instead of words. An n-gram is a group of letters taken from the actual word (e.g., the 3-gram for "apple" will be "app", "ppl", "ple"), and the actual final embedding for the word will be the summ of all it's n-grams.

What is great about this method is that we can extract a context (meaning) vector even for words that do not exist at all in the dictionary we created.

In [None]:
model_ted = FastText(
    embedding_sentences, 
    size=300, 
    window=5, 
    min_count=80, 
    workers=10,
    sg=1)

In [None]:
model_ted.save("FT_model")

In [6]:
load_ft_model = gensim.models.FastText.load("FT_model")

In [24]:
print(load_ft_model.wv.similar_by_word("woman",5))
print(load_ft_model.wv.similar_by_word("king",5))
print(load_ft_model.wv.similar_by_word("dog",5))

[('womans', 0.7417962551116943), ('man', 0.7245641946792603), ('women', 0.6692850589752197), ('girl', 0.6474263072013855), ('lady', 0.5739828944206238)]
[('kings', 0.6914445161819458), ('kingdoms', 0.5868042707443237), ('caesar', 0.5731709003448486), ('queen', 0.5708640813827515), ('kingdom', 0.5651408433914185)]
[('dogs', 0.7345197200775146), ('puppy', 0.6820985078811646), ('kitten', 0.6240315437316895), ('kittens', 0.6132924556732178), ('puppies', 0.6027705669403076)]


Verify that "Gastroenteritis" is not present in the vocabulary.

In [25]:
word_fq_array = np.array(words_freq)
unique_words = dict.fromkeys(word_fq_array[:,0],1)
if "Gastroenteritis" not in unique_words:
    print("The word is not in the document")

The word is not in the document


Get similar meaning words according to the model. Notice how the fast text model will still produce decent similar words for the word "Gastroenteritis", even if we didn't have that word in the training corpus. 

In [24]:
print(load_ft_model.wv.most_similar("Gastroenteritis"))

  if np.issubdtype(vec.dtype, np.int):


[('arthritis', 0.6741937398910522), ('asthma', 0.5665794610977173), ('veterinary', 0.5183663368225098), ('herpes', 0.5168657302856445), ('infections', 0.5165085792541504), ('medicines', 0.5158151388168335), ('antibiotics', 0.5150102376937866), ('swelling', 0.5117443799972534), ('constipation', 0.5098469853401184), ('pneumonia', 0.5062140822410583)]


In [7]:
def train_lr_tf_model():
    i = 0
    Y = data[2][1:]
    Y = Y.values
    X = embedding_sentences
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05)
    X_train = np.array(X_train)
    insincere = X_train[y_train == '1'][:]
    sincere = X_train [y_train == '0'][:]
    a = []
    y = []
    for sent in insincere:
        word_vectors = [] 
        for single_word in sent:
            if single_word in load_ft_model.wv.vocab: 
                word_vectors.append(load_ft_model.wv.word_vec(single_word))
        word_vectors = np.array(word_vectors)
        if len(word_vectors) != 0:                
            if word_vectors.shape[1] == 300:
                a.append(np.mean(word_vectors,axis=0))
                y.append(1)
    for sent in sincere[:64674*4]:
        word_vectors = []  
        for single_word in sent:
            if single_word in load_ft_model.wv.vocab: 
                word_vectors.append(load_ft_model.wv.word_vec(single_word))
        word_vectors = np.array(word_vectors)
        if len(word_vectors) != 0:
            if word_vectors.shape[1] == 300:

                a.append(np.mean(word_vectors,axis=0))
                y.append(0)
    a = np.array(a)
    y = np.array(y)
    LR_model = LogisticRegression(random_state=0, solver='lbfgs').fit(a, y)

    x_test = []
    indexes = []
    for index, sent in enumerate(X_test):
        word_vectors = []     
        for single_word in sent:
            if single_word in load_ft_model.wv.vocab: 
                word_vectors.append(load_ft_model.wv.word_vec(single_word))
        word_vectors = np.array(word_vectors)
        if len(word_vectors) != 0:   
            if word_vectors.shape[1] == 300:
                x_test.append(np.mean(word_vectors,axis=0))
            else:
                indexes.append(index)
        else:
            indexes.append(indexes)
    
    preds = LR_model.predict(x_test)
    preds = [int(x) for x in preds]
    y_test = [int(x) for x in y_test]
    y_test = np.array(y_test)
    if len(indexes) != 0:
        y_test = np.delete(y_test,indexes)
    print("F1 Score on LR for Fast text sentence embeddings: " + str(f1_score(y_test,preds)))
    return LR_model


In [8]:
train_lr_tf_model()

F1 Score on LR for Fast text sentence embeddings: 0.5490042951971886


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

### Global Vectors for Word Representation (Glove)

 - matrix $X$ is the matrix of word co-occurances, in which $X_{ij}$ represents the number of times word $i$ occurs in the context of word $j$. 
 - $X_{i} = \sum_{k} X_{ik}$ be the total number of occurances in any context k for work i.
 - Finally, $P_{ij}=P(j|i)=X_{ij}/X_i$.

| Probability and Ratio |$k = solid$|$k = gas$|$k = water$|$k = fashion$|
|-----------------------|-----------|---------|-----------|-------------|
|  $P(k|ice)$           |$1.9\times 10^{-4}$|$6.6 \times 10^{-5}$|$3.0\times 10^{-3}$|$1.7\times 10^{-5}$|
|  $P(k|steam)$         |$2.2\times 10^{-5}$|$7.8\times 10^{-4}$|$2.2\times 10^{-3}$|$1.8\times 10^{-5}$|
|  $P(k|ice)/P(k|steam)$|$$8.9$$            |$$8.5\times 10^{-2}$$|$$1.36           $$|$$0.96$$|



Notice how the probability ratio for various words $k$ illustrates pretty well the semantic similarity of two words $i$ and $j$.
 - k related to ice and not to steam: high ratio
 - k related to steam but not to ice: small ratio
 - k related to both: ratio close to 1

The minimization function proposed by Glove tries to minimize the below J function. Note that $V$ here represents the number of words in the whole vocabulary and the $f$ function will weight down the very frequent words (similar to the problem of link words described in the Word2Vec approaches)

### $$J=\sum_{i,j=1}^{V} f(X_{ij})(w_i^T\tilde{w_j}+b_i+\tilde{b_j}-logX_{ij})^2$$

### $$f(X_{ij})= \begin{cases} 
      (x/x_{max})^\alpha & if x < x_{max} \\
      1 & otherwise \\
   \end{cases}
$$

In [8]:
glove_model =  spacy.load('en_core_web_lg')

In [9]:
glove_model.remove_pipe('ner')

('ner', <spacy.pipeline.EntityRecognizer at 0x166d97c9db0>)

In [4]:
X = sentences.values

In [37]:
glove_model(X[2]).vector;

In [60]:
Y = data[2][1:]
Y = Y.values
sentences = data[1][1:]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05)
insincere = X_train[y_train == '1']
sincere = X_train [y_train == '0']
a = []
y = []
for sent in insincere:
    a.append(glove_model(sent).vector)
    y.append(1)
for sent in sincere[:64674*4]:
    a.append(glove_model(sent).vector)
    y.append(0)

In [61]:
LR_model = LogisticRegression(random_state=0, solver='lbfgs').fit(a, y)

In [67]:
x_test = []
for sent in X_test:
    x_test.append(glove_model(sent).vector)
preds = LR_model.predict(x_test)
preds = [int(x) for x in preds]
y_test = [int(x) for x in y_test]


F1 Score on LR for bags of words: 0.5617253830389383


In [68]:
print("F1 Score on LR for Glove sentence embeddings: " + str(f1_score(y_test,preds)))

F1 Score on LR for Glove sentence embeddings: 0.5617253830389383


# Future considerations

#  ELMO

# References

### GLOVE:
http://www.foldl.me/2014/glove-python/

https://nlp.stanford.edu/pubs/glove.pdf

### asd: 