### Here we use the fastText embeddings to convert our sentences into 300-dimensional vectors

In [1]:
# create folds.py
# import pandas and model_selection module from scikit-learn
import numpy as np
import pandas as pd
from numpy import save
from numpy import load

from sklearn import model_selection

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')

import re, string
import io
import sys

import time

# English stop words from NLTK. tokenize() drops every token found in this
# collection, and get_vectors() forwards it to sentence_to_vec() as stop_words.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (see the commented-out nltk.download call above) - confirm on a fresh
# environment.
my_stopwords = nltk.corpus.stopwords.words('english')  # punctuation is not added to this list

# Matches a single punctuation character (ASCII punctuation plus a few
# typographic symbols) and captures it, so tokenize() can pad it with spaces
# and split it away from neighbouring words.
# string.punctuation goes through re.escape() because, pasted raw into [...],
# its "\]" pair is parsed as an escaped "]" and the backslash itself ends up
# missing from the character class.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def remove_links(tweet):
    '''Return `tweet` (coerced to str) with web links and "[link]" placeholders removed.

    tweet: any object; it is converted with str() first
    returns: the text with every "http..." run, every "bit.ly/..." run and
             every literal "[link]" deleted. Surrounding whitespace is kept.
    '''
    tweet = str(tweet)
    tweet = re.sub(r'http\S+', '', tweet)  # remove http/https links
    # The dot is escaped so only the literal host "bit.ly" matches.
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bit.ly short links
    # str.strip('[link]') would treat its argument as a set of characters and
    # eat any leading/trailing '[', 'l', 'i', 'n', 'k', ']' (e.g. "link" -> ""),
    # so the placeholder is removed as a whole substring instead.
    tweet = tweet.replace('[link]', '')
    return tweet

def tokenize(s):
    """
    Split a text into word tokens, dropping links, digits, short tokens and stop words.

    s: any object; it is converted with str() first
    returns: list of tokens longer than two characters that are not in
             my_stopwords (the comparison is case-sensitive)
    """
    # Links are removed first: stripping digits beforehand turns e.g.
    # "bit.ly/123" into "bit.ly/", which the link pattern no longer matches
    # and which then leaks the token "bit" into the output.
    text = remove_links(str(s))
    text = re.sub(r'\d+', '', text)  # remove numbers
    # Pad every punctuation character with spaces so split() isolates it.
    pieces = re_tok.sub(r' \1 ', text).split()
    return [word for word in pieces
            if len(word) > 2 and word not in my_stopwords]

# convert sentences to vectors from embedding, embedding is 300 dimensional
def sentence_to_vec(s,embedding_dict,stop_words,tokenizer,dim=300):
    """
    Turn a sentence into one L2-normalised vector by summing its word embeddings.

    s: sentence; passed unchanged to `tokenizer`
    embedding_dict: dictionary word: vector
    stop_words: list of stop words (accepted for compatibility; not used here,
                stop-word removal is left to the tokenizer)
    tokenizer: tokenizer function, called as tokenizer(s), returning tokens
    dim: length of the zero vector returned when no token has an embedding
    returns: 1-D numpy array; all zeros when nothing could be embedded or when
             the summed vector has zero length
    """
    words = tokenizer(s)
    # Keep purely alphabetic tokens that have an embedding.
    found = [embedding_dict[w] for w in words
             if w.isalpha() and w in embedding_dict]
    # if we don't have any vectors return zeros
    if not found:
        return np.zeros(dim)
    # Sum the word vectors component-wise.
    v = np.array(found).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    # Vectors that cancel out would otherwise give 0/0 = NaN in every component.
    if norm == 0:
        return np.zeros(v.shape)
    return v / norm


def load_vectors(fname):
    """
    Load word vectors from a text file into a dictionary.

    The first line must hold two integers (it is read and validated but not
    otherwise used); every following line is a token followed by its
    space-separated float components.

    fname: path to the vectors file
    returns: dict mapping token -> list of floats
    raises: ValueError if the header is not two integers or a component
            cannot be parsed as a float
    """
    # The context manager closes the file even if parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Consume the header so it is not parsed as a word entry.
        _n_words, _dim = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data


#create sentence embeddings
def get_vectors(vectors,column,embeddings):
    """
    Append one sentence embedding per entry of `column` to `vectors`.

    vectors: container with .append (a list in this notebook); it is
             modified in place and also returned
    column: iterable of texts, e.g. the values of a dataframe column
    embeddings: word -> vector dictionary handed to sentence_to_vec
    return: the same `vectors` object, now holding the embedding vectors
    """
    def embed(text):
        # Every row uses the module-level stop-word list and tokenizer.
        return sentence_to_vec(text, embeddings, my_stopwords, tokenize)

    for text in column:
        vectors.append(embed(text))
    return vectors

### Load embeddings

In [2]:
# Locations used by this cell (inputs are read from, and outputs written to, these).
EMBEDDING_PATH = "../input/crawl-300d-2M.vec"
TRAIN_DIR = "../input/train_tiny"


def _report_elapsed(label, started):
    """Print `label` with the seconds elapsed since `started`, then a spacer line."""
    print(label, time.time() - started)
    print(" ")


def _embed_column(texts, embedding_dict):
    """Embed every entry of `texts` and stack the results into one numpy array."""
    return np.array(get_vectors([], texts, embedding_dict))


t0 = time.time()
print("Loading embeddings")
embeddings = load_vectors(EMBEDDING_PATH)
_report_elapsed("time to load", t0)

# Read training data, keeping only the two text columns and the target.
t0 = time.time()
df = pd.read_csv(f"{TRAIN_DIR}/train.csv")
df = df[["Title","BodyMarkdown","OpenStatus"]]
y = df.OpenStatus.values
_report_elapsed("time to read", t0)

# create a new column called fold and fill it with -1
df["kfold"] = -1

t0 = time.time()
print("creating sentence embedding")

print("the BodyMarkDown column embedding")
vector1 = _embed_column(df.BodyMarkdown.values, embeddings)
print("1st column shape", vector1.shape)

# save the body embeddings and the target
save(f"{TRAIN_DIR}/vector1.npy", vector1)
save(f"{TRAIN_DIR}/target.npy", y)

# Drop the body array before building the title array so both are not
# held in memory at the same time.
del vector1
_report_elapsed("time to BodyMarkDown", t0)

print("the Title column embedding")
t0 = time.time()
vector1 = _embed_column(df.Title.values, embeddings)
print("2nd column shape", vector1.shape)
save(f"{TRAIN_DIR}/vector2.npy", vector1)
_report_elapsed("time to Title", t0)

# Size report. sys.getsizeof is shallow: for the embeddings dict it counts
# only the dict's own table, not the word strings or the vectors it refers to.
print("embeddings", sys.getsizeof(embeddings))
print("vector1", sys.getsizeof(vector1))

# clear up memory
print("cleaned ")
del embeddings

# Only vector1 (the Title embeddings) is still alive at this point.
print("vector1", sys.getsizeof(vector1))

Loading embeddings
time to load 93.55019402503967
 
time to read 30.620375394821167
 
creating sentence embedding
the BodyMarkDown column embedding
1st column shape (3370528, 300)
time to BodyMarkDown 2627.3974480628967
 
the Title column embedding
2nd column shape (3370528, 300)
time to Title 272.0837643146515
 
embeddings 83886168
vector1 8089267312
cleaned 
vector1 8089267312
