In [1]:
# https://www.nltk.org/install.html
# https://spacy.io/usage/models

import nltk
import nltk.data
import spacy
from nltk import SnowballStemmer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
!python -m spacy download es_core_news_sm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\efren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\efren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\efren\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting es-core-news-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [2]:
spacy.load('es_core_news_sm')

<spacy.lang.es.Spanish at 0x2f951589340>

# Read Text

In [3]:
# text
text = "Hello, Everybody. Welcome to the NLP class, hope you enjoy it a lot. Remember to do all your activities and assigned homeworks. Have a nice day!!"

# texto
texto = "Hola a todos. Bienvenidos a la clase de LNP, espero lo disfruten mucho. Recuerden realizar todas sus actividades y tareas asignadas. Espero tengan un buen día!!"

# Tokens

In [4]:
# tokenize using sentences
sentence = nltk.sent_tokenize(text)
for sen in sentence:
  print(sen)

parrafo = nltk.sent_tokenize(texto)
for parr in parrafo:
  print(parr)


Hello, Everybody.
Welcome to the NLP class, hope you enjoy it a lot.
Remember to do all your activities and assigned homeworks.
Have a nice day!
!
Hola a todos.
Bienvenidos a la clase de LNP, espero lo disfruten mucho.
Recuerden realizar todas sus actividades y tareas asignadas.
Espero tengan un buen día!
!


In [5]:
# tokenize using words
word = nltk.word_tokenize(text)
for w in word:
  print(w)

palabra = nltk.word_tokenize(texto)
for p in palabra:
  print(p)

Hello
,
Everybody
.
Welcome
to
the
NLP
class
,
hope
you
enjoy
it
a
lot
.
Remember
to
do
all
your
activities
and
assigned
homeworks
.
Have
a
nice
day
!
!
Hola
a
todos
.
Bienvenidos
a
la
clase
de
LNP
,
espero
lo
disfruten
mucho
.
Recuerden
realizar
todas
sus
actividades
y
tareas
asignadas
.
Espero
tengan
un
buen
día
!
!


In [6]:
# tokenize using regulat expressions
word_regexp = nltk.regexp_tokenize(text, "[\w]+") # "[\w]+" read only words
for w in word_regexp:
  print(w)

Hello
Everybody
Welcome
to
the
NLP
class
hope
you
enjoy
it
a
lot
Remember
to
do
all
your
activities
and
assigned
homeworks
Have
a
nice
day


  word_regexp = nltk.regexp_tokenize(text, "[\w]+") # "[\w]+" read only words


In [7]:
## regular expressions for spanish, a good practice to keep in mind
palabra_regexp = nltk.regexp_tokenize(texto, "[\w]+") # "[\w]+" read only words
for p in palabra_regexp:
  print(p)

Hola
a
todos
Bienvenidos
a
la
clase
de
LNP
espero
lo
disfruten
mucho
Recuerden
realizar
todas
sus
actividades
y
tareas
asignadas
Espero
tengan
un
buen
día


  palabra_regexp = nltk.regexp_tokenize(texto, "[\w]+") # "[\w]+" read only words


In [8]:
tokens_palabras = nltk.word_tokenize(texto)
for w in tokens_palabras:
  print(w)

Hola
a
todos
.
Bienvenidos
a
la
clase
de
LNP
,
espero
lo
disfruten
mucho
.
Recuerden
realizar
todas
sus
actividades
y
tareas
asignadas
.
Espero
tengan
un
buen
día
!
!


# Stop Words

In [9]:
sw_english = set(nltk.corpus.stopwords.words('english'))
print(sw_english)

{'against', 'how', 'doesn', 'nor', "isn't", 'by', "wouldn't", 'y', 'shan', 'off', 'that', 'ours', 'than', 'had', 'been', 'being', "wasn't", 'an', 'their', 'have', "shouldn't", 'if', 'hadn', 'hers', 'don', "she's", 'where', 're', 'having', 'am', 'again', 'about', 'too', 'she', 'between', 'shouldn', 'i', 'has', 'own', 'once', "mightn't", 'after', 'my', 'more', 'no', 'ourselves', 'only', 'down', 'm', 'mightn', 'was', 'me', 'few', 'both', 'this', 've', 'he', 'can', 'just', 'very', 'or', 'why', 'we', 'while', 'now', 'they', 'aren', 'myself', 'its', 'do', 'are', 'will', 'above', 'through', 's', 'itself', 'such', 'be', 'when', 'some', 'each', "you'll", "needn't", "shan't", 'at', "didn't", 'you', 'isn', 'what', 'there', 'is', 'from', 'theirs', 'up', 'over', 'a', 'herself', 'for', 'couldn', 'these', 'other', 'those', 'it', "should've", "you'd", "that'll", "hadn't", 'did', 'until', 'before', 'her', 'who', 'during', 'in', "haven't", 'hasn', "weren't", 'didn', 'on', 'but', "hasn't", 'of', 'whom', 

In [10]:
sw_espanol = set(nltk.corpus.stopwords.words('spanish'))
print(sw_espanol)

{'antes', 'sobre', 'nos', 'otra', 'estaríais', 'estará', 'un', 'estuvieran', 'la', 'esta', 'estabais', 'fuésemos', 'tuvo', 'fueron', 'tuyo', 'fue', 'mías', 'tienes', 'estoy', 'se', 'estéis', 'será', 'nada', 'tú', 'me', 'estuvo', 'tuviese', 'esté', 'estén', 'habías', 'estuviste', 'os', 'estuvieses', 'tuyos', 'sintiendo', 'sentid', 'tendrías', 'estaréis', 'estadas', 'hayáis', 'hubiéramos', 'tuvieras', 'tengamos', 'hubierais', 'hubiese', 'tanto', 'ella', 'hubieses', 'sea', 'estemos', 'habían', 'nuestra', 'seremos', 'estuvieras', 'otros', 'estaríamos', 'hay', 'o', 'hubieron', 'fui', 'teníais', 'fueses', 'como', 'fuera', 'por', 'había', 'tenemos', 'muy', 'era', 'son', 'ha', 'durante', 'nosotros', 'lo', 'con', 'soy', 'estados', 'habida', 'tengáis', 'ante', 'sean', 'hubieras', 'tendrás', 'pero', 'hubiésemos', 'estuviese', 'hasta', 'algunos', 'una', 'estuviera', 'han', 'habríamos', 'tengas', 'estuviesen', 'cuando', 'fuese', 'vosotros', 'estaremos', 'has', 'nuestras', 'sí', 'algunas', 'tuvierai

In [11]:
my_list = {'a', 'b', 'c'}
my_list.update(['d', 'e'])
print(my_list)

{'d', 'c', 'b', 'e', 'a'}


In [12]:
for w in word:
  if w.lower() not in sw_english:
      print('Is not a SW =', w)

Is not a SW = Hello
Is not a SW = ,
Is not a SW = Everybody
Is not a SW = .
Is not a SW = Welcome
Is not a SW = NLP
Is not a SW = class
Is not a SW = ,
Is not a SW = hope
Is not a SW = enjoy
Is not a SW = lot
Is not a SW = .
Is not a SW = Remember
Is not a SW = activities
Is not a SW = assigned
Is not a SW = homeworks
Is not a SW = .
Is not a SW = nice
Is not a SW = day
Is not a SW = !
Is not a SW = !


In [13]:
for palabra in tokens_palabras:
  if palabra.lower() not in sw_espanol:
    print('No es SW =', palabra)

No es SW = Hola
No es SW = .
No es SW = Bienvenidos
No es SW = clase
No es SW = LNP
No es SW = ,
No es SW = espero
No es SW = disfruten
No es SW = .
No es SW = Recuerden
No es SW = realizar
No es SW = todas
No es SW = actividades
No es SW = tareas
No es SW = asignadas
No es SW = .
No es SW = Espero
No es SW = buen
No es SW = día
No es SW = !
No es SW = !
