## Text Vectorization

### Lemmatization

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer  = WordNetLemmatizer()

In [2]:
lemmatizer?

In [7]:
lemmatizer.lemmatize('heavens')

'heaven'

In [10]:
text = "All models are wrong, but some are useful."

tokens = word_tokenize(text.lower())

lemmas = [lemmatizer.lemmatize(token) for token in tokens]

In [11]:
print(lemmas)

['all', 'model', 'are', 'wrong', ',', 'but', 'some', 'are', 'useful', '.']


In [12]:
" ".join(lemmas)

'all model are wrong , but some are useful .'

In [14]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

doc = nlp(text.lower())

lemmas = [token.lemma_ for token in doc]

print(" ".join(lemmas))

ModuleNotFoundError: No module named 'spacy'

In [None]:
#lemmatization using spaCy
import pandas as pd
import random             # in order to select a random review
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data = pd.read_csv('/datasets/imdb_reviews_small.tsv', sep='\t')
corpus = data['review']

def lemmatize(text):

    # < write code here >
    text = nlp(text.lower())
    lemmas = [token.lemma_ for token in text]
    text_lemmas = " ".join(lemmas)
    return text_lemmas

# store the review index in the review_idx variable
# either as a random number or a fixed value, e.g. 2557 
#review_idx = random.randint(0, len(corpus)-1)
review_idx = 2557

review = corpus[review_idx]

print("The original text:", review)
print()
print("The lemmatized text:", lemmatize(review))

### Regular Expressions

In [15]:
import re

In [16]:
# pattern
# substitution — what each pattern match should be substituted with
# text — the text which the function scans for pattern matches
re.sub(pattern, substitution, text)

NameError: name 'pattern' is not defined

In [18]:
print("Hello!\n")

print(r"Hello!\n")

Hello!

Hello!\n


In [None]:
# a range of letters is indicated by a hyphen:
# a-z = abcdefghijklmnopqrstuvwxyz
r"[a-zA-Z]"

In [None]:
#find apostrophes as well
r"[a-zA-Z']"

In [19]:
# review text
text = """
I liked this show from the first episode I saw, which was the "Rhapsody in Blue" episode (for those that don't know what that is, the Zan going insane and becoming pau lvl 10 ep). Best visuals and special effects I've seen on a television series, nothing like it anywhere.
"""
re.sub(r"[^a-zA-Z']", " ", text)

" I liked this show from the first episode I saw  which was the  Rhapsody in Blue  episode  for those that don't know what that is  the Zan going insane and becoming pau lvl    ep   Best visuals and special effects I've seen on a television series  nothing like it anywhere  "

In [20]:
text = "            I   liked   this   show   "
text.split()

['I', 'liked', 'this', 'show']

In [21]:
" ".join(['I', 'liked', 'this', 'show'])

'I liked this show'

In [None]:
import random             # in order to select a random review
import pandas as pd

import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data = pd.read_csv('/datasets/imdb_reviews_small.tsv', sep='\t')
corpus = data['review']

def clear_text(text):
    
    # < write code here >
    text = re.sub(r"[^a-zA-Z']", " ", text)
    text = text.split()
    text = " ".join(text)
    return text

def lemmatize(text):

    doc = nlp(text.lower())
    
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
        
    return ' '.join(lemmas)

# store the review index in the review_idx variable
# either as a random number or a fixed value, e.g. 2557 
review_idx = random.randint(0, len(corpus)-1)
# review_idx = 2557

review = corpus[review_idx]

print("The original text:", review)
print()
print("The lemmatized text:", lemmatize(clear_text(review)))

### Bag of Words

In [None]:
import spacy
from collections import Counter

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

text = """For want of a nail the shoe was lost. For want of a shoe the horse was lost. For want of a horse the rider was lost."""

doc = nlp(text)

tokens = [token.lemma_ for token in doc if not token.is_punct]

bow = Counter(tokens)

vector = [bow[token] for token in sorted(bow)]

print(vector)

### N-Grams

In [25]:
import re

def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [23]:
s = """
    Natural-language processing (NLP) is an area of
    computer science and artificial intelligence
    concerned with the interactions between computers
    and human (natural) languages.
"""

In [26]:
generate_ngrams(s, 5)

['\n natural language processing nlp',
 'natural language processing nlp is',
 'language processing nlp is an',
 'processing nlp is an area',
 'nlp is an area of\n',
 'is an area of\n computer',
 'an area of\n computer science',
 'area of\n computer science and',
 'of\n computer science and artificial',
 'computer science and artificial intelligence\n',
 'science and artificial intelligence\n concerned',
 'and artificial intelligence\n concerned with',
 'artificial intelligence\n concerned with the',
 'intelligence\n concerned with the interactions',
 'concerned with the interactions between',
 'with the interactions between computers\n',
 'the interactions between computers\n and',
 'interactions between computers\n and human',
 'between computers\n and human natural',
 'computers\n and human natural languages',
 'and human natural languages \n']

In [29]:
# Sample sentence
s = "one two three four five"

tokens = s.split(" ")
# tokens = ["one", "two", "three", "four", "five"]
print(tokens)

sequences = [tokens[i:] for i in range(3)]
# The above will generate sequences of tokens starting
# from different elements of the list of tokens.
# The parameter in the range() function controls
# how many sequences to generate.
#
# sequences = [
#   ['one', 'two', 'three', 'four', 'five'],
#   ['two', 'three', 'four', 'five'],
#   ['three', 'four', 'five']]
print(sequences)

bigrams = zip(*sequences)
print(list(bigrams))
# The zip function takes the sequences as a list of inputs
# (using the * operator, this is equivalent to
# zip(sequences[0], sequences[1], sequences[2]).
# Each tuple it returns will contain one element from
# each of the sequences.
# 
# To inspect the content of bigrams, try:
# print(list(bigrams))
# which will give the following:
#
# [
#   ('one', 'two', 'three'),
#   ('two', 'three', 'four'),
#   ('three', 'four', 'five')
# ]
#
# Note: even though the first sequence has 5 elements,
# zip will stop after returning 3 tuples, because the
# last sequence only has 3 elements. In other words,
# the zip function automatically handles the ending of
# the n-gram generation.

['one', 'two', 'three', 'four', 'five']
[['one', 'two', 'three', 'four', 'five'], ['two', 'three', 'four', 'five'], ['three', 'four', 'five']]
[('one', 'two', 'three'), ('two', 'three', 'four'), ('three', 'four', 'five')]


### Using NLTK

In [32]:
import re
from nltk.util import ngrams

s = s.lower()
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
tokens = [token for token in s.split(" ") if token != ""]
output = list(ngrams(tokens, 3))

In [33]:
print(output)

[('one', 'two', 'three'), ('two', 'three', 'four'), ('three', 'four', 'five')]


### Creating a bag of words

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
count_vect = CountVectorizer()

In [37]:
corpus = [
    'for want of a nail the shoe be lose',
    'for want of a shoe the horse be lose',
    'for want of a horse the rider be lose',
    'for want of a rider the message be lose',
    'for want of a message the battle be lose',
    'for want of a battle the kingdom be lose',
    'and all for the want of a horseshoe nail'
]

In [38]:
#The counter extracts unique words from the corpus and counts how many times they appear in each text of the corpus. 
#The counter doesn't count separate letters.
# bow = bag of words
bow = count_vect.fit_transform(corpus)

In [39]:
bow.shape
#16 unique words, 7 rows

(7, 16)

In [41]:
print(bow.toarray())

[[0 0 0 1 1 0 0 0 1 0 1 1 0 1 1 1]
 [0 0 0 1 1 1 0 0 1 0 0 1 0 1 1 1]
 [0 0 0 1 1 1 0 0 1 0 0 1 1 0 1 1]
 [0 0 0 1 1 0 0 0 1 1 0 1 1 0 1 1]
 [0 0 1 1 1 0 0 0 1 1 0 1 0 0 1 1]
 [0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1]
 [1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 1]]


In [42]:
#The list of unique words in the bag is called a vocabulary. 
#It's stored in the counter and can be accessed by calling the get_feature_names() method:
count_vect.get_feature_names()

['all',
 'and',
 'battle',
 'be',
 'for',
 'horse',
 'horseshoe',
 'kingdom',
 'lose',
 'message',
 'nail',
 'of',
 'rider',
 'shoe',
 'the',
 'want']

In [43]:
#bigrams
count_vect = CountVectorizer(ngram_range=(2, 2))

In [46]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [47]:
#stopwords
from nltk.corpus import stopwords

In [48]:
stop_words = set(stopwords.words('english'))

In [50]:
count_vect = CountVectorizer(stop_words=stop_words)

In [54]:
bow2 = count_vect.fit_transform(corpus)


In [53]:
print(bow2.toarray())

[[0 0 0 0 1 0 1 0 1 1]
 [0 1 0 0 1 0 0 0 1 1]
 [0 1 0 0 1 0 0 1 0 1]
 [0 0 0 0 1 1 0 1 0 1]
 [1 0 0 0 1 1 0 0 0 1]
 [1 0 0 1 1 0 0 0 0 1]
 [0 0 1 0 0 0 1 0 0 1]]


In [55]:
count_vect.get_feature_names()

['battle',
 'horse',
 'horseshoe',
 'kingdom',
 'lose',
 'message',
 'nail',
 'rider',
 'shoe',
 'want']

In [None]:
import pandas as pd

# < write code here >
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

data = pd.read_csv('/datasets/imdb_reviews_small_lemm.tsv', sep='\t')
corpus = data['review_lemm']

# create a bag-of-words without checking for stop words
# < write code here >
count_vect = CountVectorizer()
bow = count_vect.fit_transform(corpus)
print("The BoW size with stop words:", bow.shape)

# create a bag-of-words with checking for stop words
# < write code here >
stop_words = set(stopwords.words('english'))
count_vect = CountVectorizer(stop_words=stop_words)
bow = count_vect.fit_transform(corpus)

print("The BoW size without stop words:", bow.shape)

In [None]:
import pandas as pd

# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer


data = pd.read_csv('/datasets/imdb_reviews_small_lemm.tsv', sep='\t')
corpus = data['review_lemm']

# create an n-gram with n=2 and store it in the n_gram variable

# < write code here >
count_vect = CountVectorizer(ngram_range=(2, 2))
n_gram = count_vect.fit_transform(corpus)

print("The size of 2-gram:", n_gram.shape)

### TF-IDF in sklearn

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
stop_words = set(stopwords.words('english'))
count_tf_idf = TfidfVectorizer(stop_words=stop_words)

In [58]:
tf_idf = count_tf_idf.fit_transform(corpus)

In [None]:
import pandas as pd

from nltk.corpus import stopwords as nltk_stopwords

# import TfidfVectorizer
# < write code here >
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv('/datasets/imdb_reviews_small_lemm.tsv', sep='\t')
corpus = data['review_lemm']

stop_words = set(nltk_stopwords.words('english'))
count_tf_idf = TfidfVectorizer(stop_words=stop_words)

tf_idf = count_tf_idf.fit_transform(corpus)
# < write code here >

print("The TF-IDF matrix size:", tf_idf.shape)