![img](https://www.kdnuggets.com/wp-content/uploads/text-preprocessing-framework-2.png)

In [None]:
! pip install contractions

In [None]:
! pip install inflect

In [86]:
import pandas as pd
import numpy as np
import nltk.data
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText
import contractions
import inflect
import re, unicodedata
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.stem import SnowballStemmer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

pd.set_option('max_colwidth', 250)

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Enric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
STOPWORDS = set(stopwords.words('spanish'))

In [4]:
data = pd.read_csv('../02_Comprension de Datos/Descripciones175k_20190311.csv', sep = ';')

In [5]:
data.head(2)

Unnamed: 0,id,text
0,1060651400131,Woman_Limited_El_Corte_Inglés Moda Mujer Abrigos Abrigo masculino con textura de mujer
1,1060651400180,Woman_Limited_El_Corte_Inglés Moda Mujer Abrigos Abrigo doble faz de mujer con cinturón a tono


In [6]:
bag_of_words = nltk.word_tokenize(data['text'].to_string())
print(bag_of_words[:10])
print(len(bag_of_words)) 
#126k productos = 2_063_845 palabras
#175k productos ) 2_838_097

['0', 'Woman_Limited_El_Corte_Inglés', 'Moda', 'Mujer', 'Abrigos', 'Abrigo', 'masculino', 'con', 'textura', 'de']
2838097


## Normalizacion

Con la normalización vamos a realizar una serie de tareas destinadas a poner todo el texto en un campo de juego nivelado: convertir todo el texto a la misma mayúscula o minúscula, eliminar la puntuación, convertir las cifras a sus equivalentes en palabras, etc. La normalización pone todas las palabras en pie de igualdad y permite que el procesamiento se realice de manera uniforme. Algunas de las técnicas que vamos a aplicar son:

- Borrar caracteres extraños
- Pasar todo el texto a minusculas
- Quitar simbolos de puntuacion ( . , &, !, ?, ¿, /, etc)
- Pasar numeros en digito a texto
- Quitar stopwords
- Stemming
- Lexematizar

**Importante:** Despues de la Nomralizacion trabajaremos a nivel de palabra o token en vez de a nivel de texto

In [7]:
a = 'aaaa sdfer ewkrnmwejrn qwemwe er'
print (a.strip().split())

['aaaa', 'sdfer', 'ewkrnmwejrn', 'qwemwe', 'er']


In [18]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in STOPWORDS:
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    #if pandas == True:
    words  = words.strip().split()
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    #if pandas == True:
    words = " ".join(words)
    return words


In [12]:
%%time

words = normalize(bag_of_words)
print(len(words))

2288058
Wall time: 11.6 s


In [13]:
print(words[:10])

['zero', 'woman_limited_el_corte_ingles', 'moda', 'mujer', 'abrigos', 'abrigo', 'masculino', 'textura', 'mujer', 'one']


### Remove duplicates

In [14]:
p = list(dict.fromkeys(words))
len(p)

228942

#### V1 - 162.327

#### V2 - 228942 

In [15]:
p[:10]

['zero',
 'woman_limited_el_corte_ingles',
 'moda',
 'mujer',
 'abrigos',
 'abrigo',
 'masculino',
 'textura',
 'one',
 'doble']

## Vectorize Sentences

- Initialize tokenizer with num_words = MAX_NB_WORDS (200K). i.e. The tokenizer will perform a word count, sorted by number of occurences in descending order and pick top N words, 200K in this case 
- Use tokenizer's texts_to_sequences method to convert text to array of integers.
- The arrays obtained from previous step might not be of uniform length, use pad_sequences method to obtain arrays with length equal to MAX_SEQUENCE_LENGTH (30)

In [19]:
data_copy = data.copy()

In [20]:
data_copy['text'] = data_copy['text'].apply(normalize)
data_copy.head()

Unnamed: 0,id,text
0,1060651400131,woman_limited_el_corte_ingles moda mujer abrigos abrigo masculino textura mujer
1,1060651400180,woman_limited_el_corte_ingles moda mujer abrigos abrigo doble faz mujer cinturon tono
2,1051056400107,woman_el_corte_ingles moda mujer abrigos abrigo largo antelina mujer woman corte ingles
3,1019350401147,lloyds moda mujer abrigos chaqueta termica mujer lloyds efecto cortavientos
4,1019353400229,lloyds moda mujer abrigos parka one hundred algodon mujer lloyds capucha


In [22]:
MAX_NB_WORDS = 200_000 #200k
MAX_SEQUENCE_LENGTH = 30

In [29]:
all_text = data_copy['text']
all_text = all_text.drop_duplicates (keep = False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(all_text)

data_sequences = tokenizer.texts_to_sequences(data_copy['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

A word_index has a unique ID assigned to each word in the data. Example:

In [30]:
word_index = tokenizer.word_index
test_string = "ropa deporte abrigo raqueta bebe"
print("word\t\tid")
print("-" * 20)
for word in test_string.split():
    print("%s\t\t%s" % (word, word_index[word]))

word		id
--------------------
ropa		42
deporte		1451
abrigo		494
raqueta		1985
bebe		11


In [106]:
sentences = [word.split() for word in all_text.values]
sentences[:3]

[['woman_limited_el_corte_ingles',
  'moda',
  'mujer',
  'abrigos',
  'abrigo',
  'masculino',
  'textura',
  'mujer'],
 ['woman_limited_el_corte_ingles',
  'moda',
  'mujer',
  'abrigos',
  'abrigo',
  'doble',
  'faz',
  'mujer',
  'cinturon',
  'tono'],
 ['lloyds',
  'moda',
  'mujer',
  'abrigos',
  'chaqueta',
  'termica',
  'mujer',
  'lloyds',
  'efecto',
  'cortavientos']]

## Word2Vec


In [99]:
model = Word2Vec(sentences, workers = 3, min_count=2, window = 5, size = 156)

In [100]:
model.train(sentences, total_examples=len(sentences), epochs=20)

(27315517, 31966400)

In [105]:
wl = 'el_corte_ingles'
model.wv.most_similar (positive = wl)

[('room__el_corte_ingles', 0.637089192867279),
 ('jo__mr_joe', 0.6152732968330383),
 ('hogar', 0.6144698858261108),
 ('nuestro_mejor_precio__el_corte_ingles', 0.5881239175796509),
 ('gloria_ortiz', 0.5839834213256836),
 ('southern_cotton', 0.5773452520370483),
 ('vera_wang', 0.5635312795639038),
 ('mini_home__el_corte_ingles', 0.5372772812843323),
 ('folli_follie', 0.5093116164207458),
 ('hugo_boss', 0.5072426199913025)]

## FastText

In [102]:
model_ft = FastText(sentences, workers = 3, min_count=2, window = 5, size = 156 )

In [103]:
model_ft.wv.most_similar("El_Corte_Ingles")

[('singles', 0.9083899855613708),
 ('room__el_corte_ingles', 0.9079532027244568),
 ('woman_el_corte_ingles', 0.9072155356407166),
 ('el_corte_ingles', 0.9068277478218079),
 ('woman_plus_el_corte_ingles', 0.9064659476280212),
 ('diamonds_el_corte_ingles', 0.9032517671585083),
 ('basics_el_corte_ingles', 0.9020206928253174),
 ('mini_home__el_corte_ingles', 0.8954898715019226),
 ('woman_limited_el_corte_ingles', 0.8943397402763367),
 ('singlestep', 0.8869022130966187)]