### Here we use the fastText embeddings to convert each sentence into a 300-dimensional vector

In [1]:
# create folds.py
# import pandas and model_selection module from scikit-learn
import numpy as np
import pandas as pd

from sklearn import model_selection

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')

import re, string
import io

import time

#punctuation = list(string.punctuation)

# English stop-word list from NLTK; tokenize() uses it to drop common words.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available
# locally (see the commented-out nltk.download call near the imports) - confirm.
my_stopwords = nltk.corpus.stopwords.words('english')# punctuation

# Matches a single punctuation character (ASCII punctuation plus a few
# typographic symbols) so that it can be split off as its own token.
# string.punctuation is escaped before being placed inside the character
# class: unescaped, its `\]` pair is read as an escaped `]`, which silently
# drops the backslash from the set, and its bare `[` triggers a
# "possible nested set" FutureWarning.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def remove_links(tweet):
    '''Takes a string and removes web links from it.

    Removes three kinds of link text:
      * anything starting with "http" up to the next whitespace,
      * "bit.ly/..." short links,
      * the literal "[link]" placeholder.

    tweet: input string
    return: the string with the link text removed (surrounding whitespace
            is left untouched)
    '''
    tweet = re.sub(r'http\S+', '', tweet)  # remove http links
    # the dot is escaped so that only a literal "bit.ly/" prefix matches
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links
    # str.strip('[link]') would strip the *characters* [, l, i, n, k, ] from
    # both ends of the text (eating real words); remove the literal
    # placeholder instead
    tweet = tweet.replace('[link]', '')  # remove [link] placeholders
    return tweet

def tokenize(s):
    """
    Split a raw string into cleaned word tokens.

    s: input string
    return: list of tokens longer than two characters that are not stop words

    Digits and web links are removed first, then every punctuation mark
    matched by re_tok is padded with spaces so it becomes its own token
    before splitting on whitespace.
    """
    without_digits = re.sub(r'\d+', '', s)
    cleaned = remove_links(without_digits)
    pieces = re_tok.sub(r' \1 ', cleaned).split()
    # drop very short tokens (which includes the isolated punctuation marks)
    # and stop words in a single pass
    return [
        piece for piece in pieces
        if len(piece) > 2 and piece not in my_stopwords
    ]

# convert sentences to vectors from embedding, embedding is 300 dimensional
def sentence_to_vec(s,embedding_dict,stop_words,tokenizer):
    """
    Convert a sentence into a single unit-length embedding vector.

    s: sentence, string
    embedding_dict: dictionary word: vector
    stop_words: list of stop words (currently unused; stop-word removal is
                left to the tokenizer)
    tokenizer: tokenizer function, called as tokenizer(s), returns tokens

    return: numpy array holding the sum of the embeddings of all known,
            purely alphabetic tokens, scaled to unit L2 norm. A zero vector
            of length 300 is returned when no token has an embedding, and
            a zero vector is returned when the summed embedding has zero
            norm.
    """
    # tokenize the sentence and keep only purely alphabetic tokens
    tokens = [w for w in tokenizer(s) if w.isalpha()]
    # look up the embedding of every token the dictionary knows about
    found = [embedding_dict[w] for w in tokens if w in embedding_dict]
    # if we don't have any vectors return zeros
    if not found:
        return np.zeros(300)
    # sum the embeddings over the token axis
    v = np.array(found).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    # embeddings can cancel out exactly; dividing by a zero norm would
    # fill the result with NaNs, so return zeros instead
    if norm == 0:
        return np.zeros_like(v, dtype=float)
    return v / norm


def load_vectors(fname):
    """
    Load word vectors from a text file in the .vec layout.

    fname: path to the vector file. The first line holds two integers
           (parsed and then ignored); every following line is a word
           followed by its space-separated float components.

    return: dictionary word: list of floats

    Raises ValueError when the header line does not contain exactly two
    integers or when a component cannot be parsed as a float.
    """
    data = {}
    # the context manager guarantees the file is closed, even if parsing fails
    with io.open(
        fname, 'r', encoding='utf-8',
        newline='\n',
        errors='ignore'
    ) as fin:
        # header line: two integers; the values themselves are not needed
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data


#create sentence embeddings
def get_vectors(vectors,column,embeddings):
    """
    Append one sentence embedding per entry of a column.

    vectors: list to fill in (extended in place)
    column: iterable of sentences, e.g. the values of a dataframe column
    embeddings: dictionary word: vector
    return: the same `vectors` list, now holding the embedding vectors
    """
    vectors.extend(
        sentence_to_vec(
            s=text,
            embedding_dict=embeddings,
            stop_words=my_stopwords,
            tokenizer=tokenize,
        )
        for text in column
    )
    return vectors


### Load embeddings

In [2]:
print("Loading embeddings")
embeddings = load_vectors("../input/crawl-300d-2M.vec")

# Read the training data and keep only the two text columns and the label
df = pd.read_csv("../input/train_tiny.csv")
df = df[["Title","BodyMarkdown","OpenStatus"]]

# add a fold column, filled with -1 for now
df["kfold"] = -1

# start timing the embedding step
t0 = time.time()
print("creating sentence embedding")

# one sentence vector per question body
print("the BodyMarkDown column embedding")
vector1 = np.array(get_vectors([], df.BodyMarkdown.values, embeddings))
print("1st column shape", vector1.shape)

# one sentence vector per question title
print("the Title column embedding")
vector2 = np.array(get_vectors([], df.Title.values, embeddings))
print("2nd column shape", vector2.shape)

# place body and title embeddings side by side, one row per sample
vectors = np.concatenate([vector1, vector2], axis=1)
print("vectors shape", vectors.shape)

print("clean up")
# rebind the large intermediates so their memory can be released
vector1 = vector2 = embeddings = 0

Loading embeddings
creating sentence embedding
the BodyMarkDown column embedding
1st column shape (10112, 300)
the Title column embedding
2nd column shape (10112, 300)
vectors shape (10112, 600)
clean up


### Create a dataframe with kfold column

In [3]:
# Wrap the embedding matrix in a dataframe, then add the fold placeholder
# (-1) and the label column taken from the original training frame
dataset = pd.DataFrame(vectors).assign(kfold=-1, target=df["OpenStatus"])
print("Train column shape", vectors.shape)

# report how long the embedding step took
t1 = time.time()
total_time = t1 - t0
print("time embedding", total_time)

# shuffle the rows and renumber the index from zero
dataset = dataset.sample(frac=1)
dataset = dataset.reset_index(drop=True)

Train column shape (10112, 600)
time embedding 12.888197660446167


In [4]:
# Preview the first rows of the embedded dataset (features, kfold, target)
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,592,593,594,595,596,597,598,599,kfold,target
0,-0.030117,0.079741,-0.050383,0.01384,-0.022336,0.021947,0.013439,-0.024548,-0.001428,0.008173,...,0.02446,-0.055709,0.019366,0.064444,-0.049266,-0.003446,0.031894,0.003222,-1,open
1,-0.085136,0.089537,-0.093383,0.035254,-0.020439,0.04343,0.025257,0.032859,-0.009814,-0.091184,...,0.020189,-0.044247,0.035233,-0.057533,0.010098,-0.020951,-0.03311,-0.047752,-1,open
2,-0.014082,0.075218,-0.07551,0.044907,0.008949,0.041168,-0.024563,-0.017974,-0.045072,-0.049093,...,0.01565,-0.011171,0.008453,0.073911,0.047225,-0.009997,0.003017,-0.020692,-1,open
3,-0.08447,0.099616,-0.020309,0.037161,0.046117,0.073468,0.069002,-0.052852,0.008283,-0.029121,...,-0.002327,-0.052714,0.100147,-0.005999,-0.008256,-0.073581,0.015348,0.016701,-1,open
4,-0.107833,0.042954,-0.0198,-0.066842,-0.05285,0.003682,0.05267,-0.025304,-0.027217,-0.028778,...,0.0429,-0.039086,-0.001311,0.060713,-0.05932,-0.050993,0.060624,-0.010577,-1,open


### Create the folds and save it

In [5]:
# labels used to stratify the folds
y = dataset.target.values

# initialize the kfold class from model_selection module
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the kfold column: every row gets the index of the fold in which it
# belongs to the validation split
for f, (t_, v_) in enumerate(kf.split(X=dataset, y=y)):
    dataset.loc[v_, 'kfold'] = f

# save the new csv with kfold column once, after all folds are assigned
# (writing inside the loop rewrote the file on every fold, with incomplete
# fold ids on all but the last write)
dataset.to_csv("../input/_embedded_train_tiny_folds.csv", index=False)

### In training this is how we read the data

In [6]:
# Feature matrix: every column except the fold id and the label
feature = dataset.drop(columns=['kfold', 'target'])

In [7]:
# Preview the first rows of the feature-only dataframe
feature.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,590,591,592,593,594,595,596,597,598,599
0,-0.030117,0.079741,-0.050383,0.01384,-0.022336,0.021947,0.013439,-0.024548,-0.001428,0.008173,...,0.059593,0.060918,0.02446,-0.055709,0.019366,0.064444,-0.049266,-0.003446,0.031894,0.003222
1,-0.085136,0.089537,-0.093383,0.035254,-0.020439,0.04343,0.025257,0.032859,-0.009814,-0.091184,...,0.005027,0.022059,0.020189,-0.044247,0.035233,-0.057533,0.010098,-0.020951,-0.03311,-0.047752
2,-0.014082,0.075218,-0.07551,0.044907,0.008949,0.041168,-0.024563,-0.017974,-0.045072,-0.049093,...,-0.0001,0.009521,0.01565,-0.011171,0.008453,0.073911,0.047225,-0.009997,0.003017,-0.020692
3,-0.08447,0.099616,-0.020309,0.037161,0.046117,0.073468,0.069002,-0.052852,0.008283,-0.029121,...,0.02736,0.06986,-0.002327,-0.052714,0.100147,-0.005999,-0.008256,-0.073581,0.015348,0.016701
4,-0.107833,0.042954,-0.0198,-0.066842,-0.05285,0.003682,0.05267,-0.025304,-0.027217,-0.028778,...,0.037573,0.027545,0.0429,-0.039086,-0.001311,0.060713,-0.05932,-0.050993,0.060624,-0.010577
