# Utils Alejandro Astruc Lopez

In [1]:
from utils import *
from utils_Alejandro import *

# Function definition

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import gensim
import re
import spacy 
import gensim.models.word2vec as w2v
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import multiprocessing
from utils import *

'''
    These are the main functions to manipulate
    and work around word2vect. They use regular
    expressions, and a pretrained pipeline from
    spacy to perform a series of operations to
    the data before introducing it into the 
    word2vec model.
'''

def cleaning(doc):
    '''
        Cleaning routine that lemmatizaises
        data.
    '''
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

def preproces_train_data_Word2Vect(df, save=True):
    '''
        Preprocessing of the training data for the 
        word2vect data.
    '''
    #Drop nans
    df.dropna(inplace=True)
    #Build training data for Word2vect:
    #First separate question marks and words using regular expressions
    pattern = r"(\w+|[?!.])"
    sentences = list(np.append(df['question1'].values,df['question2'].values))
    sentences = [' '.join(re.findall(pattern, sent)) for sent in sentences]
    brief_cleaning = (re.sub("[^\w'?]+", ' ', str(row)).lower() for row in sentences)
    
    #Import spacy 
    nlp = spacy.load('en_core_web_sm') 
    
    #Use the spacy pipeline separating words etc
    txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000,  n_process=-1)]
    df_clean = pd.DataFrame({'clean': txt})
    df_clean = df_clean.dropna().drop_duplicates()
    if save:
        df_clean.to_csv('clean_w2v_data.csv', index=False)
    return df_clean

def preproces_data_Word2Vect(df):
    '''
        Cleaning the data that will be the
        input of the word2vect.
    '''
    string_columns = [col for col in df.columns if df[col].dtype == 'object']
    out = []
    for col in string_columns:
        sentences = list(df[col].values)
        #First separate question marks and words using regular expressions
        pattern = r"(\w+|[?!.])"
        sentences = list(np.append(df['question1'].values,df['question2'].values))
        sentences = [' '.join(re.findall(pattern, sent)) for sent in sentences]
        brief_cleaning = (re.sub("[^\w'?]+", ' ', str(row)).lower() for row in sentences)
        
        #Import spacy 
        nlp = spacy.load('en_core_web_sm') 
        
        #Use the spacy pipeline separating words etc
        txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000,  n_process=-1)]
        out.append(txt)
    return np.array(out)

def train_Word2Vect(df_clean, num_features = 300, num_epochs = 20,
                    min_word_count = 0, num_workers = multiprocessing.cpu_count(), context_size = 5, 
                    downsampling = 1e-3, seed = 1, sg = 0, save=True):
    '''
        Function for training the word2vect from
        already cleaned data.
    '''
    
    word2vec = w2v.Word2Vec(
    sg=sg,
    seed=seed,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
    )
    
    word2vec.build_vocab(sent, progress_per=10000)
    word2vec.train(sent, total_examples=word2vec.corpus_count, epochs=30, report_delay=1)

    if save:
        word2vec.save("word2vec.model")
    return word2vec.wv

def sentence_to_wordlist(raw):
    '''
        Routine to separate sentence into
        wordlist.
    '''
    clean = re.sub("[^a-zA-Z0-9]", " ", str(raw))
    clean = clean.lower()
    words = clean.split()
    return words
    
def doc_to_vec(sentence, word2vec):
    '''
        Routine to convert sentence into vector
        given a word2vect keyevector.
    '''
    word_list = sentence_to_wordlist(sentence)
    word_vectors = []
    for w in word_list:
        if word2vec.__contains__(w):
            if word2vec.key_to_index[w] < len(word2vec.vectors):
                word_vectors.append(word2vec[str(w)])
    return np.mean(word_vectors, axis=0)

def get_Word2Vect(df, word2vec=Word2Vec.load("word2vec.model").wv, phrase2vec=doc_to_vec):
    '''
        Function that receives data to be cleaned and
        then converts it to vectors.
    '''
    data = preproces_data_Word2Vect(df)
    out = []
    if len(data.shape) >1:
        for val in data[:,0]:
            out.append(doc_to_vec(val,word2vec))
        array = np.array(out)
        for i in range(1, data.shape[1]):
            out = []
            for val in data[:,i]:
                out.append(doc_to_vec(val,word2vec))
            out = np.array(out)
            array = np.stack((array,out))
    else:
        for val in data:
            out.append(doc_to_vec(val,word2vec))
        array = np.array(out)
    return array

def get_Word2Vect_from_clean(df, word2vec=Word2Vec.load("word2vec.model").wv, phrase2vec=doc_to_vec):
    '''
        Function that receives celan data and
        then converts it to vectors.

        A suggeste input to word2vec is:
        word2vec = KeyedVectors.load("pretrained.model")

        "pretrained.model" is a keyevector from the 
        pretrained model: glove-wiki-gigaword-300
        you can also download it as such:
        
        gensim.downloader.load('glove-wiki-gigaword-300')
        
        I recommend saving with .save('file_path')
        as it will be faster for future use:
        
        pre.save("pretrained.model")
    '''
    data = df.values
    out = []
    if len(data.shape) >1:
        for val in data[:,0]:
            out.append(doc_to_vec(val,word2vec))
        array = out[:]
        for i in range(1, data.shape[1]):
            out = []
            for val in data[:,i]:
                out.append(doc_to_vec(val,word2vec))
            array = [array,out]
    else:
        for val in data:
            out.append(doc_to_vec(val,word2vec))
        array = out[:]
    return array

# Training and tests of word2vect

In [2]:
#Import data
df = pd.read_csv("../nlp_deliv1_materials/quora_train_data.csv")
df.dropna(inplace=True)

In [4]:
df_clean = pd.read_csv('clean_w2v_data.csv')
word2vec = Word2Vec.load("word2vec.model").wv

Here is an example using a pre trained word to vector:

In [32]:
get_Word2Vect_from_clean(df_clean, word2vec=KeyedVectors.load("pretrained.model"), phrase2vec=doc_to_vec)

array([[-0.18908401,  0.14575334, -0.24050768, ..., -0.21747333,
         0.08525033,  0.17485468],
       [ 0.1206454 ,  0.1098704 ,  0.171647  , ..., -0.09448619,
        -0.08320801,  0.0506992 ],
       [-0.42076406,  0.17165159,  0.25065342, ..., -0.0953576 ,
         0.03478063,  0.1543766 ],
       ...,
       [-0.219928  , -0.029034  , -0.1162586 , ..., -0.072036  ,
        -0.038287  ,  0.24115232],
       [ 0.241744  ,  0.04891361,  0.0080844 , ...,  0.028048  ,
        -0.2740474 ,  0.479464  ],
       [-0.03195641, -0.10047483,  0.014281  , ..., -0.03649074,
        -0.0140646 ,  0.21292908]], dtype=float32)

In [19]:
get_Word2Vect(df_clean)

(2, 415665, 300)

In [38]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))
pre = gensim.downloader.load('glove-wiki-gigaword-300')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [61]:
pre.save("pretrained.model")

In [60]:
doc_to_vec(df['question1'][0][:-1], pre)

array([-0.20577545,  0.12746239, -0.04717724, -0.02534887,  0.04027225,
        0.152412  ,  0.03551663, -0.03253975,  0.08493976, -1.6322538 ,
        0.21031661, -0.1983688 , -0.11061212,  0.08063625, -0.05675174,
       -0.07887324, -0.12003312, -0.15204847,  0.21590875,  0.13083738,
        0.21441011,  0.40429187,  0.27155975, -0.01802613, -0.27786648,
       -0.00815025,  0.26502863, -0.16491945,  0.08708687, -0.01988237,
        0.06162221,  0.17012987, -0.3386976 , -0.19019699, -1.0490575 ,
        0.12061688, -0.23097901, -0.086456  ,  0.00959194,  0.04805613,
       -0.08873121, -0.27393562, -0.06129425, -0.12130875,  0.184645  ,
        0.13793913,  0.21891037,  0.13277763,  0.00689214,  0.02593046,
        0.10792162, -0.14590938,  0.00507363, -0.09312012, -0.23943488,
        0.39449978,  0.07809716, -0.09254488,  0.07620325,  0.08252013,
        0.4164734 , -0.090637  ,  0.07884928,  0.46675876, -0.14334024,
       -0.24197625,  0.1956075 ,  0.09268922,  0.157938  , -0.13

# Use in a logistic model

## 1. Data preparation

In [2]:
#Import data and drop nans
df = pd.read_csv("../nlp_deliv1_materials/quora_train_data.csv")
df.dropna(inplace=True)
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,346692,38482,10706,Why do I get easily bored with everything?,Why do I get bored with things so quickly and ...,1
1,327668,454117,345117,How do I study for Honeywell company recruitment?,How do I study for Honeywell company recruitme...,1
2,272993,391373,391374,Which search engine algorithm is Quora using?,Why is Quora not using reliable search engine?,0
3,54070,82673,95496,How can I smartly cut myself?,Can someone who thinks about suicide for 7 yea...,0
4,46450,38384,72436,How do I see who is viewing my Instagram videos?,Can one tell who viewed my Instagram videos?,1
...,...,...,...,...,...,...
323427,192476,292119,292120,Is it okay to use a laptop while it is chargin...,Is it OK to use your phone while charging?,0
323428,17730,33641,33642,How can dogs understand human language?,Can dogs understand the human language?,0
323429,28030,52012,52013,What's your favourite lotion?,What's your favourite skin lotion?,1
323430,277869,397054,120852,How does one become a hedge fund manager?,What should I do to become a hedge fund manager?,1


In [4]:
#First we clean a prepare the data for vectorization
#We used the cleaning functions defined by Alba.
q1 = preprocess_data(df['question1'])
q2 = preprocess_data(df['question1'])
df_clean = pd.DataFrame({'q1': q1, 'q2':q2})
df_clean.to_csv('clean_w2v_data.csv', index=False)

## 2. Vectorize data

In [3]:
#We proceed to vetorize the entences with the functions we defined
df_clean = pd.read_csv('clean_w2v_data.csv')
vec = get_Word2Vect_from_clean(df_clean)# KeyedVectors.load("pretrained.model"))
df['q1'] = vec[0]
df['q2'] = vec[1]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Because of the limited word2vect we trained some sentences cannot be translated to a vector. Further on we will use a pretrained word to vect.

In [17]:
df.dropna(inplace=True)
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1,q2
0,346692,38482,10706,Why do I get easily bored with everything?,Why do I get bored with things so quickly and ...,1,"[-0.24570484, -0.30564713, -0.07970454, 0.1097...","[-0.24570484, -0.30564713, -0.07970454, 0.1097..."
1,327668,454117,345117,How do I study for Honeywell company recruitment?,How do I study for Honeywell company recruitme...,1,"[-0.42143765, 0.11858634, -0.05358868, -0.2265...","[-0.42143765, 0.11858634, -0.05358868, -0.2265..."
2,272993,391373,391374,Which search engine algorithm is Quora using?,Why is Quora not using reliable search engine?,0,"[-0.57006973, -0.58232933, -0.2590795, -1.1337...","[-0.57006973, -0.58232933, -0.2590795, -1.1337..."
3,54070,82673,95496,How can I smartly cut myself?,Can someone who thinks about suicide for 7 yea...,0,"[-0.034957096, 0.56301516, 0.6061619, 0.91554,...","[-0.034957096, 0.56301516, 0.6061619, 0.91554,..."
4,46450,38384,72436,How do I see who is viewing my Instagram videos?,Can one tell who viewed my Instagram videos?,1,"[0.0910532, -0.39877188, -0.3537015, -0.256290...","[0.0910532, -0.39877188, -0.3537015, -0.256290..."
...,...,...,...,...,...,...,...,...
323427,192476,292119,292120,Is it okay to use a laptop while it is chargin...,Is it OK to use your phone while charging?,0,"[-0.20693651, -0.1512374, -0.0787127, -1.18186...","[-0.20693651, -0.1512374, -0.0787127, -1.18186..."
323428,17730,33641,33642,How can dogs understand human language?,Can dogs understand the human language?,0,"[0.6491662, -0.32742617, -0.045254048, 0.11678...","[0.6491662, -0.32742617, -0.045254048, 0.11678..."
323429,28030,52012,52013,What's your favourite lotion?,What's your favourite skin lotion?,1,"[-0.74212116, -0.5065593, 0.42237905, 0.066984...","[-0.74212116, -0.5065593, 0.42237905, 0.066984..."
323430,277869,397054,120852,How does one become a hedge fund manager?,What should I do to become a hedge fund manager?,1,"[0.0831171, -0.85846317, 0.5749842, -0.3321251...","[0.0831171, -0.85846317, 0.5749842, -0.3321251..."


## 3. Train model

In [58]:
X = np.hstack((np.array([x for x in df['q1'].values]),np.array([x for x in df['q2'].values])))
y = df['is_duplicate'].values

X_, X_test, y_, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.05, random_state=123)
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X_, y_, test_size=0.05, random_state=123)

In [None]:
logistic1 = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
logistic1.fit(X_train, y_train)

In [60]:
train_metrics1 = evaluate_model(X_train, y_train, model=logistic1, display=False)
train_metrics1

{'accuracy': 0.674610512134012,
 'roc_auc': 0.5922226268127543,
 'precision': 0.6347794633928382,
 'recall': 0.27792890147562976,
 'f1': 0.3865934963036873}

In [61]:
validation_metrics1 = evaluate_model(X_val, y_val, model=logistic1, display=False)
validation_metrics1

{'accuracy': 0.6712158808933002,
 'roc_auc': 0.5921966075376667,
 'precision': 0.6304262807978099,
 'recall': 0.28270782181690635,
 'f1': 0.3903620293013682}

In [62]:
test_metrics1  = evaluate_model(X_test, y_test, model=logistic1, display=False)
test_metrics1

{'accuracy': 0.676302729528536,
 'roc_auc': 0.5947708881807898,
 'precision': 0.641113653699466,
 'recall': 0.2821416582745888,
 'f1': 0.39184149184149186}

## 4. Train model with pretrained w2v

In [67]:
import gensim.downloader
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')
glove_vectors.save("pretrained.model")

In [None]:
df_clean = pd.read_csv('clean_w2v_data.csv')
vec = get_Word2Vect_from_clean(df_clean, KeyedVectors.load("pretrained.model"))

In [69]:
df = pd.read_csv("../nlp_deliv1_materials/quora_train_data.csv")
df.dropna(inplace=True)
df['q1'] = vec[0]
df['q2'] = vec[1]
df.dropna(inplace=True)

In [70]:
X = np.hstack((np.array([x for x in df['q1'].values]),np.array([x for x in df['q2'].values])))
y = df['is_duplicate'].values

X_, X_test, y_, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.05, random_state=123)
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X_, y_, test_size=0.05, random_state=123)

logistic2 = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
logistic2.fit(X_train, y_train)

In [71]:
train_metrics2 = evaluate_model(X_train, y_train, model=logistic2, display=False)
train_metrics2

{'accuracy': 0.6865039124059583,
 'roc_auc': 0.6213790993783148,
 'precision': 0.6266878970948657,
 'recall': 0.3726169039707438,
 'f1': 0.4673539118843751}

In [72]:
validation_metrics2 = evaluate_model(X_val, y_val, model=logistic2, display=False)
validation_metrics2

{'accuracy': 0.6890581266679685,
 'roc_auc': 0.6197319858461495,
 'precision': 0.6243499541144081,
 'recall': 0.36511627906976746,
 'f1': 0.4607743537645333}

In [73]:
test_metrics2  = evaluate_model(X_test, y_test, model=logistic2, display=False)
test_metrics2

{'accuracy': 0.6874227059114519,
 'roc_auc': 0.6215467097064837,
 'precision': 0.6256352343308865,
 'recall': 0.3727502102607233,
 'f1': 0.46716559502477073}