# Word2Vec Modified Data
This file translates each word in the description and title column of the [Avito Demand Prediction Challenge](Avito Demand Prediction Challenge) training and test data into numpy arrays of word vectors. The embedding space for the vectors is 300 features. 

In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from gensim.models import KeyedVectors
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle
print(os.listdir("input"))

['avito-demand-prediction', 'fasttext-russian-2m']


In [2]:
trainDF = pd.read_csv("input/avito-demand-prediction/train.csv")
testDF = pd.read_csv("input/avito-demand-prediction/test.csv")

trainDF.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [3]:
ru_model = KeyedVectors.load_word2vec_format('input/fasttext-russian-2m/wiki.ru.vec')

In [14]:
max_features = len(ru_model.vocab)
max_len = 200
emeddingDim = 300
tokenizer = Tokenizer(num_words=max_features)

# Description 
tokenizer.fit_on_texts(trainDF["description"].dropna(axis=0, how='any'))

# Title
train_desc_tokens = pad_sequences(tokenizer.texts_to_sequences(trainDF["description"].dropna(axis=0, how='any'))
                                  ,maxlen=max_len)
test_desc_tokens =  pad_sequences(tokenizer.texts_to_sequences(testDF["description"]),maxlen=max_len)



train_title_tokens = pad_sequences(tokenizer.texts_to_sequences(trainDF["title"].dropna(axis=0, how='any')),maxlen=10)
test_title_tokens =  pad_sequences(tokenizer.texts_to_sequences(testDF["title"]),maxlen=10)

In [15]:
def getEmbedding(x):
    if x in ru_model:
        return ru_model[x]
    else:
        return None
    
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, emeddingDim))
for word, i in word_index.items():
    embedding_vector = getEmbedding(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [16]:
out = [(train_desc_tokens,test_desc_tokens),(train_title_tokens,test_title_tokens),embedding_matrix]

with open('desc_tit_embeddings.p', 'wb') as fp:
    pickle.dump(out, fp)

In [6]:
token_lookup = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

def tokenReplacer(x):
    ls = []
    for token in x:
        ls.append(tokenReplace(token))
        
    return np.array(ls)

def tokenReplace(z):
    if z == 0:
        return np.zeros(300)
    else:
        return ru_model[token_lookup[z]]


train_desc_tokens = np.apply_along_axis(tokenReplacer,1,train_desc_tokens )
#train_desc_tokens = tokensToVects(ru_model,tokenizer,train_desc_tokens)
#test_desc_tokens = tokensToVects(ru_model,tokenizer,test_desc_tokens)

np.save("train_desc_vects.npy",train_desc_tokens)
#np.save("train_desc_vects.npy",test_desc_tokens )

#train_title_tokens = tokensToVects(ru_model,tokenizer,train_title_tokens)
#test_title_tokens = tokensToVects(ru_model,tokenizer,test_title_tokens)

#np.save("train_title_vects.npy",train_title_tokens)
#np.save("train_title_vects.npy",test_title_tokens )

MemoryError: 

In [20]:
train_desc_tokens.shape

(1387148, 200)

In [None]:
def tokensToVects(vecModel,tokenizerObj,tokenLists,emeddingDim=300):
    token_lookup = dict(zip(tokenizerObj.word_index.values(), tokenizerObj.word_index.keys()))
    rowVects = []
    for tokenLs in tokenLists:
        wordVects = []
        for token in tokenLs: 
            if token == 0:
                wordVects.append(np.zeros(emeddingDim))
            else:
                tokenVal = token_lookup[token]
                if tokenVal in vecModel:
                    wordVects.append(vecModel[tokenVal])
        rowVects.append(np.array(wordVects))
        
    return np.array(rowVects)

In [13]:
tst = np.load("train_desc_vects.npy")

In [None]:
 print(x)
    ls = []
    for idx in range(200):
        print(x[idx])
        ls.append(tokenReplace(x[idx]))
    print("D")
    return ls