In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import gensim
import re
import spacy 
import gensim.models.word2vec as w2v
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

In [28]:
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

def preproces_train_data_Word2Vect(df, save=True):
    #Drop nans
    df.dropna(inplace=True)
    #Build training data for Word2vect:
    #First separate question marks and words using regular expressions
    pattern = r"(\w+|[?!.])"
    sentences = list(np.append(df['question1'].values,df['question2'].values))
    sentences = [' '.join(re.findall(pattern, sent)) for sent in sentences]
    brief_cleaning = (re.sub("[^\w'?]+", ' ', str(row)).lower() for row in sentences)
    
    #Import spacy 
    nlp = spacy.load('en_core_web_sm') 
    
    #Use the spacy pipeline separating words etc
    txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000,  n_process=-1)]
    df_clean = pd.DataFrame({'clean': txt})
    df_clean = df_clean.dropna().drop_duplicates()
    if save:
        df_clean.to_csv('clean_w2v_data.csv', index=False)
    return df_clean

def preproces_data_Word2Vect(df):
    string_columns = [col for col in df.columns if df[col].dtype == 'object']
    out = []
    for col in string_columns:
        sentences = list(df[col].values)
        #First separate question marks and words using regular expressions
        pattern = r"(\w+|[?!.])"
        sentences = list(np.append(df['question1'].values,df['question2'].values))
        sentences = [' '.join(re.findall(pattern, sent)) for sent in sentences]
        brief_cleaning = (re.sub("[^\w'?]+", ' ', str(row)).lower() for row in sentences)
        
        #Import spacy 
        nlp = spacy.load('en_core_web_sm') 
        
        #Use the spacy pipeline separating words etc
        txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000,  n_process=-1)]
        out.append(txt)
    return np.array(out)

def train_Word2Vect(df_clean=pd.read_csv('clean_w2v_data.csv'), num_features = 300, num_epochs = 20,
                    min_word_count = 0, num_workers = multiprocessing.cpu_count(), context_size = 5, 
                    downsampling = 1e-3, seed = 1, sg = 0, save=True):
    
    word2vec = w2v.Word2Vec(
    sg=sg,
    seed=seed,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
    )
    
    word2vec.build_vocab(sent, progress_per=10000)
    word2vec.train(sent, total_examples=word2vec.corpus_count, epochs=30, report_delay=1)

    if save:
        word2vec.save("word2vec.model")
    return word2vec.wv

def sentence_to_wordlist(raw):
    clean = re.sub("^a-zA-Z", " ", raw)
    clean = clean.lower()
    words = clean.split()
    return words
    
def doc_to_vec(sentence, word2vec):
    word_list = sentence_to_wordlist(sentence)
    word_vectors = []
    for w in word_list:
        if w in word2vec.key_to_index.keys():
            word_vectors.append(word2vec[w])
    return np.mean(word_vectors, axis=0)

def get_Word2Vect(df, word2vec=Word2Vec.load("word2vec.model").wv, phrase2vec=doc_to_vec):
    data = preproces_data_Word2Vect(df, save=False)
    out = []
    if len(data.shape) >1:
        for val in data[:,0]:
            out.append(doc_to_vec(val,word2vec))
        array = np.array(out)
        for i in range(1, data.shape[1]):
            out = []
            for val in data[:,i]:
                out.append(doc_to_vec(val,word2vec))
            out = np.array(out)
            array = np.stack((array,out))
    else:
        for val in data:
            out.append(doc_to_vec(val,word2vec))
        array = np.array(out)
    return array

def get_Word2Vect_from_clean(df, word2vec=Word2Vec.load("word2vec.model").wv, phrase2vec=doc_to_vec):
    data = df.values
    out = []
    if len(data.shape) >1:
        for val in data[:,0]:
            out.append(doc_to_vec(val,word2vec))
        array = np.array(out)
        for i in range(1, data.shape[1]):
            out = []
            for val in data[:,i]:
                out.append(doc_to_vec(val,word2vec))
            out = np.array(out)
            array = np.stack((array,out))
    else:
        for val in data:
            out.append(doc_to_vec(val,word2vec))
        array = np.array(out)
    return array

In [2]:
#Import data
df = pd.read_csv("../nlp_deliv1_materials/quora_train_data.csv")
df.dropna(inplace=True)

In [4]:
df_clean = pd.read_csv('clean_w2v_data.csv')
word2vec = Word2Vec.load("word2vec.model").wv

Here is an example using a pre trained word to vector:

In [32]:
get_Word2Vect_from_clean(df_clean, word2vec=KeyedVectors.load("pretrained.model"), phrase2vec=doc_to_vec)

array([[-0.18908401,  0.14575334, -0.24050768, ..., -0.21747333,
         0.08525033,  0.17485468],
       [ 0.1206454 ,  0.1098704 ,  0.171647  , ..., -0.09448619,
        -0.08320801,  0.0506992 ],
       [-0.42076406,  0.17165159,  0.25065342, ..., -0.0953576 ,
         0.03478063,  0.1543766 ],
       ...,
       [-0.219928  , -0.029034  , -0.1162586 , ..., -0.072036  ,
        -0.038287  ,  0.24115232],
       [ 0.241744  ,  0.04891361,  0.0080844 , ...,  0.028048  ,
        -0.2740474 ,  0.479464  ],
       [-0.03195641, -0.10047483,  0.014281  , ..., -0.03649074,
        -0.0140646 ,  0.21292908]], dtype=float32)

In [19]:
get_Word2Vect(df_clean)

(2, 415665, 300)

In [38]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))
pre = gensim.downloader.load('glove-wiki-gigaword-300')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [61]:
pre.save("pretrained.model")

In [60]:
doc_to_vec(df['question1'][0][:-1], pre)

array([-0.20577545,  0.12746239, -0.04717724, -0.02534887,  0.04027225,
        0.152412  ,  0.03551663, -0.03253975,  0.08493976, -1.6322538 ,
        0.21031661, -0.1983688 , -0.11061212,  0.08063625, -0.05675174,
       -0.07887324, -0.12003312, -0.15204847,  0.21590875,  0.13083738,
        0.21441011,  0.40429187,  0.27155975, -0.01802613, -0.27786648,
       -0.00815025,  0.26502863, -0.16491945,  0.08708687, -0.01988237,
        0.06162221,  0.17012987, -0.3386976 , -0.19019699, -1.0490575 ,
        0.12061688, -0.23097901, -0.086456  ,  0.00959194,  0.04805613,
       -0.08873121, -0.27393562, -0.06129425, -0.12130875,  0.184645  ,
        0.13793913,  0.21891037,  0.13277763,  0.00689214,  0.02593046,
        0.10792162, -0.14590938,  0.00507363, -0.09312012, -0.23943488,
        0.39449978,  0.07809716, -0.09254488,  0.07620325,  0.08252013,
        0.4164734 , -0.090637  ,  0.07884928,  0.46675876, -0.14334024,
       -0.24197625,  0.1956075 ,  0.09268922,  0.157938  , -0.13