In [1]:
"""
Here we use the fasttext embeddings and 
convert our sentences into 300 dimensional vectors
"""
import numpy as np
import pandas as pd
from numpy import save
from numpy import load

from sklearn import model_selection

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')

import re, string
import io
import sys

import time

In [2]:
# Read trainig data
df= pd.read_hdf('../input/tiny_data/train_tiny.h5',key='dataset')
df = df[["PostId","Title","BodyMarkdown","OpenStatus"]]
y = df.OpenStatus.values

In [3]:
df

Unnamed: 0,PostId,Title,BodyMarkdown,OpenStatus
2386616,8844866,Finding how many rows needed given number of b...,I want to layout X buttons.\r\n\r\nAt the star...,open
534173,2659366,Java Interfaces Methodology,I've been programming in Java for a few course...,open
3258915,10940049,execute code in rails after response is sent t...,"I want to run code (specifically, Garbage Coll...",open
2030482,7776743,Downloading a word Document from Gmail using J...,"I'm using Javamail API, and I'm trying to down...",open
98790,679979,C: How to make a variadic macro (variable numb...,I want to write a macro in C that accepts any ...,open
3335247,11465224,Build script for Visual Studio and Qt projects,"I have an C++ application, which consists of s...",open
5518,47862,Upgrade database from SQL Server 2000 to 2005 ...,I'm loading a SQL Server 2000 database into my...,open


In [18]:
#punctuation = list(string.punctuation)

my_stopwords = nltk.corpus.stopwords.words('english')# punctuation

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet =str(tweet).lower()
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def tokenize(s):
    s=str(s).lower()
    output = re.sub(r'\d+', '', s) # remove numbers 
    output = remove_links(output)
    result = re_tok.sub(r' \1 ', output).split() 
    result = [word for word in result if len(word)>2]
    result = [word for word in result if word not in my_stopwords]
    return result

# convert sentences to vectors from embedding, embedding is 300 dimensional
def sentence_to_vec(s,embedding_dict,stop_words,tokenizer):
    """
    s: sentence, string
    embedding_dict: dictionary word: vector
    stop_words: list of stop words
    tokenizer: tokenizer function
    """
    # tokenize the sentence
    words = s
    words = tokenizer(words)
    
    # keep only alpha numeric tokens
    words =[w for w in words if w.isalpha()]
    # initialize empty list to store embeddings
    M = []
    for w in words:
        # for every word, get the embedding from the dictionary
        # and append to the list of embeddings
        if w in embedding_dict:
            M.append(embedding_dict[w])
    # if we don't have any vectors return zeros
    if len(M)==0:
        return np.zeros(300)
    # convert list of embeddings to array
    M = np.array(M)
    # calculate sum over axis=0
    v = M.sum(axis=0)
    return v/np.sqrt((v**2).sum())       


def load_vectors(fname):
    fin = io.open(
        fname,'r',encoding ='utf-8',
        newline = '\n',
        errors='ignore'
        )
    n,d = map(int,fin.readline().split())
    data ={}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]]=list(map(float,tokens[1:]))
    return data    


#create sentence embeddings
def get_vectors(vectors,column,embeddings):
    """
    vectors: empty array to fill in
    column: dataframe column
    return embedding vectors
    """
    for body in column:
        vectors.append(
            sentence_to_vec(s= body,
                           embedding_dict = embeddings,
                           stop_words =my_stopwords,
                           tokenizer=tokenize
                           )
        )
    return vectors  

In [5]:
 t0_total =time.time()

    # Read embeddings
t0=time.time()
print("Loading embeddings")
embeddings = load_vectors("../input/crawl-300d-2M.vec")  
    
t1 =time.time()
total_time=t1-t0
print("time to load", total_time)
print(" ")

Loading embeddings
time to load 93.06966423988342
 


In [163]:
def get_sentence_embedding(df,column=None):    
    """
    param
    df: dataframe
    column: datafrane column to embedd
    return: dataframe with embedding.dim (#of rows, 301)
    """
    #Create empty dataframe
    
    data=pd.DataFrame([])      
        
    # loop over the postid
    
    for i in df.PostId:
        B=df.loc[df.PostId==i]    
        B=B[column].values
        # get the embeddings
        X= sentence_to_vec(s=[B],
                           embedding_dict = embeddings,
                           stop_words =my_stopwords,
                           tokenizer=tokenize
                           )
        
        data1=pd.DataFrame(data=X)
        data1=data1.T
        data1=data1.add_suffix('_'+column)
        #add id column
        data1["id"]=i
        data1.columns=data1.columns.astype(str)
        data=data.append(data1)        
    return data

def get_target(df,column=None):
    
    data=pd.DataFrame([])
    
    for i in df.PostId:
        B=df.loc[df.PostId==i]    
        B=B[column].values
        data1=pd.DataFrame(data=B)
        data1=data1.T
        #add id column
        data1["id"]=i
        data1.columns=data1.columns.astype(str)
        data1=data1.rename(columns={'0':'target'})
        data=data.append(data1)        
    return data
        

In [154]:
title_appended = get_sentence_embedding(df,'Title') 
print(data_appended.shape)
title_appended.to_hdf("../input/tiny_data/title.h5",
         key='dataset',mode ='w',index= False)
(title_appended.head(7))

(7, 301)


Unnamed: 0,0_Title,1_Title,2_Title,3_Title,4_Title,5_Title,6_Title,7_Title,8_Title,9_Title,...,291_Title,292_Title,293_Title,294_Title,295_Title,296_Title,297_Title,298_Title,299_Title,id
0,-0.065509,0.103647,-0.035116,-0.032617,-0.075866,0.025417,-0.054854,0.02088,0.068233,-0.022143,...,0.03255,0.006832,-0.035522,0.07696,0.012868,-0.033474,0.014736,-0.055689,-0.018701,8844866
0,-0.07235,0.148529,-0.025573,0.024086,-0.033714,0.041064,-0.040374,0.017315,-0.018273,0.017804,...,0.069499,0.008832,-0.038142,0.103691,0.055951,-0.002821,0.000312,0.005471,-0.022646,2659366
0,-0.066269,0.116832,-0.078549,-0.00029,-0.058342,0.018301,-0.042047,-0.020272,-0.002478,0.040167,...,0.023469,0.069197,-0.04151,0.042437,0.078289,-0.033847,0.0105,-0.080802,0.011414,10940049
0,0.004414,0.114957,-0.047366,-0.001067,-0.020975,0.029123,-0.010478,0.007674,-0.014058,0.028262,...,0.038866,0.062495,-0.007892,0.1014,0.088833,-0.005941,-0.007884,0.004912,0.022956,7776743
0,-0.053857,0.123921,-0.050572,-0.027107,-0.054316,0.014609,-0.058074,-0.003075,0.004396,0.042438,...,0.062737,0.041506,-0.042645,0.079617,0.052966,0.005897,-0.002458,-0.034936,0.02533,679979
0,-0.09212,0.119685,-0.014229,0.05151,-0.051558,0.063547,-0.050531,0.009802,0.000228,0.028074,...,0.01968,-0.010401,-0.037961,0.064066,0.066396,-0.013145,0.010596,-0.037871,-0.008803,11465224
0,-0.04629,0.142331,-0.004897,0.004013,-0.033733,-0.021379,-0.036681,-0.011259,0.002504,0.01183,...,-0.008594,-0.012039,-0.028655,0.079287,0.064119,-0.027943,0.021516,0.016858,0.022841,47862


In [142]:
BodyMarkDown_appended = get_sentence_embedding(df,'BodyMarkdown')
print(BodyMarkDown_appended.shape)
BodyMarkDown_appended.to_hdf(
    "../input/tiny_data/BodyMarkDown.h5",
    key='dataset',
    mode ='w',
    index= False)
(BodyMarkDown_appended.head(7))

(7, 301)


Unnamed: 0,0_BodyMarkdown,1_BodyMarkdown,2_BodyMarkdown,3_BodyMarkdown,4_BodyMarkdown,5_BodyMarkdown,6_BodyMarkdown,7_BodyMarkdown,8_BodyMarkdown,9_BodyMarkdown,...,291_BodyMarkdown,292_BodyMarkdown,293_BodyMarkdown,294_BodyMarkdown,295_BodyMarkdown,296_BodyMarkdown,297_BodyMarkdown,298_BodyMarkdown,299_BodyMarkdown,id
0,-0.071879,0.034939,-0.053535,-0.007968,-0.009871,0.05164,0.041489,-0.004621,0.119302,-0.023557,...,0.011671,0.00709,-0.123104,0.063124,0.013623,-0.011971,-0.023992,-0.087915,-0.033759,8844866
0,-0.06044,0.09338,-0.036407,-0.027118,0.000425,-0.01746,-0.04628,0.049422,-0.012093,-0.037352,...,0.06034,-0.01793,-0.062969,0.022482,0.038964,0.009898,-0.00185,-0.022312,-0.021957,2659366
0,-0.034124,0.024924,-0.047072,-0.00871,-0.034748,0.027694,-0.05712,0.001856,0.018673,-0.010602,...,-0.028228,0.048713,-0.047364,-0.011032,0.004505,0.007254,-0.006658,-0.04266,0.009221,10940049
0,-0.079857,0.089469,-0.05247,0.026044,0.003003,0.059799,0.007606,-0.020565,0.032908,-0.027515,...,0.011755,0.029449,-0.011121,0.084332,0.031011,-0.034971,0.01247,-0.028939,0.046105,7776743
0,-0.054652,0.078058,-0.055095,-0.041828,-0.006715,0.019636,-0.039402,0.015707,-0.003959,0.019515,...,0.053439,0.027172,-0.026234,0.051671,-0.019979,-0.011625,-0.018657,-0.035646,0.025895,679979
0,-0.094,0.082542,-0.041527,0.05828,-0.045604,0.062993,-0.040898,0.003576,0.027184,-0.024684,...,0.024547,-0.004932,-0.080273,-0.000455,-0.043256,-0.002785,0.003154,-0.049467,-0.006905,11465224
0,-0.066644,0.068971,-0.001054,-0.024719,0.004286,-0.000992,-0.007656,0.031489,0.025537,-0.025766,...,-0.034858,-0.016276,-0.029664,0.027106,0.027174,0.013064,0.008896,-0.007064,0.022437,47862


In [None]:
target = get_target(df,"OpenStatus")
target.head()
target.to_hdf(
    "../input/tiny_data/target.h5",
    key='dataset',
    mode ='w',
    index= False)

#### Merge the two embeddings

In [143]:
tt=pd.read_hdf("../input/tiny_data/title.h5",key='dataset',mode ='r')
tb=pd.read_hdf("../input/tiny_data/BodyMarkDown.h5",key='dataset',mode ='r')
tb.head()

Unnamed: 0,0_BodyMarkdown,1_BodyMarkdown,2_BodyMarkdown,3_BodyMarkdown,4_BodyMarkdown,5_BodyMarkdown,6_BodyMarkdown,7_BodyMarkdown,8_BodyMarkdown,9_BodyMarkdown,...,291_BodyMarkdown,292_BodyMarkdown,293_BodyMarkdown,294_BodyMarkdown,295_BodyMarkdown,296_BodyMarkdown,297_BodyMarkdown,298_BodyMarkdown,299_BodyMarkdown,id
0,-0.071879,0.034939,-0.053535,-0.007968,-0.009871,0.05164,0.041489,-0.004621,0.119302,-0.023557,...,0.011671,0.00709,-0.123104,0.063124,0.013623,-0.011971,-0.023992,-0.087915,-0.033759,8844866
0,-0.06044,0.09338,-0.036407,-0.027118,0.000425,-0.01746,-0.04628,0.049422,-0.012093,-0.037352,...,0.06034,-0.01793,-0.062969,0.022482,0.038964,0.009898,-0.00185,-0.022312,-0.021957,2659366
0,-0.034124,0.024924,-0.047072,-0.00871,-0.034748,0.027694,-0.05712,0.001856,0.018673,-0.010602,...,-0.028228,0.048713,-0.047364,-0.011032,0.004505,0.007254,-0.006658,-0.04266,0.009221,10940049
0,-0.079857,0.089469,-0.05247,0.026044,0.003003,0.059799,0.007606,-0.020565,0.032908,-0.027515,...,0.011755,0.029449,-0.011121,0.084332,0.031011,-0.034971,0.01247,-0.028939,0.046105,7776743
0,-0.054652,0.078058,-0.055095,-0.041828,-0.006715,0.019636,-0.039402,0.015707,-0.003959,0.019515,...,0.053439,0.027172,-0.026234,0.051671,-0.019979,-0.011625,-0.018657,-0.035646,0.025895,679979


In [144]:
tt.head()

Unnamed: 0,0_Title,1_Title,2_Title,3_Title,4_Title,5_Title,6_Title,7_Title,8_Title,9_Title,...,291_Title,292_Title,293_Title,294_Title,295_Title,296_Title,297_Title,298_Title,299_Title,id
0,-0.065509,0.103647,-0.035116,-0.032617,-0.075866,0.025417,-0.054854,0.02088,0.068233,-0.022143,...,0.03255,0.006832,-0.035522,0.07696,0.012868,-0.033474,0.014736,-0.055689,-0.018701,8844866
0,-0.07235,0.148529,-0.025573,0.024086,-0.033714,0.041064,-0.040374,0.017315,-0.018273,0.017804,...,0.069499,0.008832,-0.038142,0.103691,0.055951,-0.002821,0.000312,0.005471,-0.022646,2659366
0,-0.066269,0.116832,-0.078549,-0.00029,-0.058342,0.018301,-0.042047,-0.020272,-0.002478,0.040167,...,0.023469,0.069197,-0.04151,0.042437,0.078289,-0.033847,0.0105,-0.080802,0.011414,10940049
0,0.004414,0.114957,-0.047366,-0.001067,-0.020975,0.029123,-0.010478,0.007674,-0.014058,0.028262,...,0.038866,0.062495,-0.007892,0.1014,0.088833,-0.005941,-0.007884,0.004912,0.022956,7776743
0,-0.053857,0.123921,-0.050572,-0.027107,-0.054316,0.014609,-0.058074,-0.003075,0.004396,0.042438,...,0.062737,0.041506,-0.042645,0.079617,0.052966,0.005897,-0.002458,-0.034936,0.02533,679979


In [166]:
full_data=tt.merge(tb,how='left',left_on='id',right_on='id')
print(full_data.shape)
full_data=full_data.merge(target,how='left',left_on='id',right_on='id')
print(full_data.shape)

(7, 601)
(7, 602)


In [168]:
full_data.head()

Unnamed: 0,0_Title,1_Title,2_Title,3_Title,4_Title,5_Title,6_Title,7_Title,8_Title,9_Title,...,291_BodyMarkdown,292_BodyMarkdown,293_BodyMarkdown,294_BodyMarkdown,295_BodyMarkdown,296_BodyMarkdown,297_BodyMarkdown,298_BodyMarkdown,299_BodyMarkdown,target
0,-0.065509,0.103647,-0.035116,-0.032617,-0.075866,0.025417,-0.054854,0.02088,0.068233,-0.022143,...,0.011671,0.00709,-0.123104,0.063124,0.013623,-0.011971,-0.023992,-0.087915,-0.033759,open
1,-0.07235,0.148529,-0.025573,0.024086,-0.033714,0.041064,-0.040374,0.017315,-0.018273,0.017804,...,0.06034,-0.01793,-0.062969,0.022482,0.038964,0.009898,-0.00185,-0.022312,-0.021957,open
2,-0.066269,0.116832,-0.078549,-0.00029,-0.058342,0.018301,-0.042047,-0.020272,-0.002478,0.040167,...,-0.028228,0.048713,-0.047364,-0.011032,0.004505,0.007254,-0.006658,-0.04266,0.009221,open
3,0.004414,0.114957,-0.047366,-0.001067,-0.020975,0.029123,-0.010478,0.007674,-0.014058,0.028262,...,0.011755,0.029449,-0.011121,0.084332,0.031011,-0.034971,0.01247,-0.028939,0.046105,open
4,-0.053857,0.123921,-0.050572,-0.027107,-0.054316,0.014609,-0.058074,-0.003075,0.004396,0.042438,...,0.053439,0.027172,-0.026234,0.051671,-0.019979,-0.011625,-0.018657,-0.035646,0.025895,open


In [167]:
full_data.id.values

array([ 8844866,  2659366, 10940049,  7776743,   679979, 11465224,
          47862])

In [None]:
if __name__=="__main__":

    t0_total =time.time()

    # Read embeddings
    t0=time.time()
    print("Loading embeddings")
    embeddings = load_vectors("../input/crawl-300d-2M.vec")  
    
    t1 =time.time()
    total_time=t1-t0
    print("time to load", total_time)
    print(" ")
    
    t0=time.time()

    # Read trainig data
    df= pd.read_hdf('../input/tiny_data/train_tiny.h5',key='dataset')
    df = df[["PostId","Title","BodyMarkdown","OpenStatus"]]
    y = df.OpenStatus.values

    t1 =time.time()
    total_time=t1-t0
    print("time to read", total_time)
    print(" ")

    # create a new column called fold and fill it with -1
    df["kfold"] = -1

    t0=time.time()
    print("creating sentence embedding")

    # First column
    vector1 =[]

    print("the BodyMarkDown column embedding")
    
    vector1 = get_vectors(vector1,df.BodyMarkdown.values, embeddings)
    
    vector1 = np.array(vector1)    
    
    print("1st column shape",vector1.shape)

    # save vector1 and target
    save("../input/tiny_data/vector1.npy",vector1)  
    
    save("../input/tiny_data/target.npy",y)

    # second column

    del vector1
    vector1 =[]
    t1 =time.time()
    total_time=t1-t0
    print("time to BodyMarkDown", total_time)
    print(" ")
    print("the Title column embedding")
    t0=time.time()
    vector1 = get_vectors(vector1,df.Title.values,embeddings)
    vector1 = np.array(vector1)    
    print("2nd column shape",vector1.shape)
    save("../input/tiny_data/vector2.npy",vector1)
    t1 =time.time()
    total_time=t1-t0
    print("time to Title", total_time)
    print(" ")

    t1_total =time.time()
    print(f"total time for the process {df.shape},{t1_total-t0_total}")

