In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('apple_products.csv')
df.shape

(250, 5)

In [3]:
df.sample(5)

Unnamed: 0,name,original_price,sale_price,discount,product_link
87,Adaptador Apple Lightning a Jack para Audífono...,S/ 110 un,S/ 59.99 un,-45%,https://www.plazavea.com.pe//adaptador-apple-l...
156,Cable de Carga Apple USB-C a USB-C Blanco 2 Me...,S/ 150 un,S/ 119 un,-20%,https://www.plazavea.com.pe//cable-de-carga-ap...
35,iPhone XR 64GB 3GB Negro,"S/ 4,736 un","S/ 2,673 un",-43%,https://www.plazavea.com.pe//iphone-xr-64gb-3g...
217,Adaptador APPLE USB-C de 20 Watts,S/ 195 un,S/ 179 un,-8%,https://www.plazavea.com.pe//adaptador-apple-u...
197,iPhone 12 6.1' 128GB 12MPx Morado,"S/ 4,999 un","S/ 4,399 un",-12%,https://www.plazavea.com.pe//iphone-12-6-1--12...


# NLTK: Natural Language Toolkit

In [4]:
import nltk

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### 1. Word Tokenization

In [6]:
from nltk import word_tokenize

In [8]:
text = 'Hola, cómo estas?'

In [9]:
text.split()

['Hola,', 'cómo', 'estas?']

In [10]:
word_tokenize(text)

['Hola', ',', 'cómo', 'estas', '?']

In [11]:
df.iloc[5]['name']

'Smartphone APPLE iPhone 11 Pro 5.8" 64GB 12MP +12MP+12MP'

In [12]:
word_tokenize(df.iloc[5]['name'])

['Smartphone',
 'APPLE',
 'iPhone',
 '11',
 'Pro',
 '5.8',
 "''",
 '64GB',
 '12MP',
 '+12MP+12MP']

In [13]:
from nltk import FreqDist

In [14]:
freq = FreqDist(word_tokenize(' '.join(df['name'])))

In [16]:
freq.most_common(30)

[('Apple', 154),
 ('iPhone', 115),
 ('12', 59),
 ('Blanco', 58),
 ('Cable', 55),
 ('Cargador', 52),
 ('Pro', 46),
 ('iPad', 42),
 ('Lightning', 40),
 ('USB', 39),
 ('para', 36),
 ('C', 35),
 ('64GB', 30),
 ("''", 29),
 ('Original', 29),
 ('Watch', 29),
 ('de', 29),
 ('a', 28),
 ('128GB', 26),
 ('+', 23),
 ('APPLE', 22),
 ('Max', 22),
 ('Iphone', 22),
 ('Case', 22),
 ('11', 21),
 ('1', 21),
 ('12MP', 20),
 ('Metro', 20),
 ('Adaptador', 20),
 ('Tipo', 20)]

### 2. Dígitos y signos de puntuación

Funciones

In [22]:
'maria123'.isalpha()

False

In [20]:
'123'.isdigit()

False

In [24]:
'5.8'.isalnum()

False

Regular expressions

In [25]:
import re

In [26]:
re.search('[a-zA-Z]', 'hola')

<re.Match object; span=(0, 1), match='h'>

In [27]:
re.search('[a-zA-Z]', '4....5')

Signos de puntuación

In [28]:
import string

In [30]:
type(string.punctuation)

str

In [36]:
punct = string.punctuation + '¡¿' 

In [39]:
df.sample(5)

Unnamed: 0,name,original_price,sale_price,discount,product_link,name_prep
5,"Smartphone APPLE iPhone 11 Pro 5.8"" 64GB 12MP ...","S/ 5,299 un","S/ 5,099 un",-3%,https://www.plazavea.com.pe//smartphone-apple-...,+12MP+12MP
136,Cable Datos iPhone Original Lightning Tipo C,S/ 75 un,S/ 55 un,-26%,https://www.plazavea.com.pe//cable-datos-iphon...,C
108,iPhone 12 Mini 128GB Green,"S/ 6,213 un","S/ 4,099 un",-34%,https://www.plazavea.com.pe//iphone-12-mini-12...,Green
230,"MacBook Pro 13.3"" Chip M1 16GB RAM 1TB SSD Spa...",0,"S/ 9,999 un",0,https://www.plazavea.com.pe//macbook-pro-13-3-...,2020
59,iPhone 12 Pro Max 128GB Azul,"S/ 7,800 un","S/ 5,699 un",-26%,https://www.plazavea.com.pe//iphone-12-pro-max...,Azul


In [45]:
name_prep = []
for name in df['name']:
  tokens = []
  name = name.replace('\'\'', '')
  for word in word_tokenize(name):   
    if not (word in punct): tokens.append(word)
  name_prep.append(' '.join(tokens))

df['name_prep'] = name_prep

### 3. Operaciones con strings

In [48]:
'Apple'.lower()

'apple'

In [49]:
'Apple'.upper()

'APPLE'

In [51]:
'productos-tec-123-p'.split('-')

['productos', 'tec', '123', 'p']

In [56]:
'    Apple.    '.strip()

'Apple.'

In [57]:
txt = 'M3sajeALaN4ción'

In [59]:
txt.replace('3', 'e').replace('4', 'a')

'MesajeALaNación'

In [64]:
nombre = 'Pickachu'
apellido = 'Sagasti'

In [61]:
nombre + ' ' + apellido

'Francisco Sagasti'

In [62]:
f'{nombre} {apellido}'

'Francisco Sagasti'

In [65]:
f'Hola {nombre}, te saluda...'

'Hola Pickachu, te saluda...'

In [75]:
df.sample(5)

Unnamed: 0,name,original_price,sale_price,discount,product_link,name_prep
15,Smartphone APPLE Iphone 12 6.1'' 4GB 128GB 12M...,4899.0,"S/ 4,699 un",-4%,https://www.plazavea.com.pe//smartphone-apple-...,smartphone apple iphone 12 6.1 4gb 128gb 12mp ...
169,Funda Clear Case MagSafe iPhone 12 Mini Transp...,99.0,S/ 79.99 un,-19%,https://www.plazavea.com.pe//funda-clear-case-...,funda clear case magsafe iphone 12 mini transp...
212,"iPad Pro 2020 11"" Wifi 512GB Space Gray",6299.0,"S/ 5,719 un",-9%,https://www.plazavea.com.pe//ipad-pro-2020-11-...,ipad pro 2020 11 '' wifi 512gb space gray
210,iPhone 11 Pro 64GB Gris,4800.0,"S/ 4,399 un",-8%,https://www.plazavea.com.pe//iphone-11-pro-64g...,iphone 11 pro 64gb gris
202,"Apple Macbook Air (Late 2020) 13.3"" Apple 256...",6689.0,"S/ 5,989 un",-10%,https://www.plazavea.com.pe//apple-macbook-air...,apple macbook air late 2020 13.3 '' apple 256g...


In [67]:
df['name_prep'] = [name.lower() for name in df['name_prep']]

In [74]:
df['original_price'] = [float(price.replace('S/', '').replace('un', '').replace(',', '').strip()) for price in df['original_price']]

In [46]:
freq2 = FreqDist(word_tokenize(' '.join(df['name_prep'])))

In [47]:
freq2.most_common(30) 

[('Apple', 154),
 ('iPhone', 115),
 ('12', 59),
 ('Blanco', 58),
 ('Cable', 55),
 ('Cargador', 52),
 ('Pro', 46),
 ('iPad', 42),
 ('Lightning', 40),
 ('USB', 39),
 ('para', 36),
 ('C', 35),
 ('64GB', 30),
 ('Original', 29),
 ('Watch', 29),
 ('de', 29),
 ('a', 28),
 ('128GB', 26),
 ('APPLE', 22),
 ('Max', 22),
 ('Iphone', 22),
 ('Case', 22),
 ('11', 21),
 ('1', 21),
 ('12MP', 20),
 ('Metro', 20),
 ('Adaptador', 20),
 ('Tipo', 20),
 ('Magsafe', 19),
 ('GPS', 18)]

### 4. Stopwords

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords

### 5. Stemming and Lemmatization 

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
#...

In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wlm = WordNetLemmatizer()
#..

In [None]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('spanish')