In [2]:
# Chapter 3: Feature extraction and preprocessing

# DictVectorizer: one-hot encode a categorical feature.
# Every distinct string value of 'city' becomes its own binary column
# (feature name 'city=<value>'); columns follow the sorted feature names.
from sklearn.feature_extraction import DictVectorizer

onehot_encoder = DictVectorizer()
instances = [
    {'city': 'New York'},
    {'city': 'San Francisco'},
    {'city': 'Chapel Hill'},
]
# fit_transform returns a SciPy sparse matrix by default; densify it for display.
print(onehot_encoder.fit_transform(instances).toarray())

[[ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]]


In [15]:
# Bag-of-words
#
# Encode each document as a vector of token counts, then compare documents
# by the Euclidean distance between their count vectors.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

# Basic corpus
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]
vectorizer = CountVectorizer()
# Fit once and reuse the sparse document-term matrix
# (one row per document, one column per vocabulary term).
counts = vectorizer.fit_transform(corpus)
print(counts.toarray())
print(vectorizer.vocabulary_)

# Compute distances
# counts[i] is a 1 x n_features sparse row, i.e. the 2-D input that
# euclidean_distances expects. Rows of the np.matrix returned by .todense()
# are rejected by recent scikit-learn releases, so they are not used here.
print('Distance between 1st and 2nd documents:', euclidean_distances(counts[0], counts[1]))
print('Distance between 1st and 3rd documents:', euclidean_distances(counts[0], counts[2]))
print('Distance between 2nd and 3rd documents:', euclidean_distances(counts[1], counts[2]))

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'basketball': 1, 'game': 3, 'ate': 0, 'sandwich': 7, 'the': 8, 'played': 6, 'duke': 2, 'in': 4, 'unc': 9, 'lost': 5}
Distance between 1st and 2nd documents: [[ 2.44948974]]
Distance between 1st and 3rd documents: [[ 2.64575131]]
Distance between 2nd and 3rd documents: [[ 2.64575131]]


In [16]:
# Stop-words
#
# Same bag-of-words comparison as before, but with scikit-learn's built-in
# English stop-word list removed from the vocabulary ('in', 'the', ...),
# which shrinks the feature space and changes the document distances.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

# Basic corpus
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]

vectorizer = CountVectorizer(stop_words='english')  # stop-words on
# Fit once and reuse the sparse document-term matrix
# (one row per document, one column per vocabulary term).
counts = vectorizer.fit_transform(corpus)
print(counts.toarray())
print(vectorizer.vocabulary_)

# Compute distances
# counts[i] is a 1 x n_features sparse row, i.e. the 2-D input that
# euclidean_distances expects. Rows of the np.matrix returned by .todense()
# are rejected by recent scikit-learn releases, so they are not used here.
print('Distance between 1st and 2nd documents:', euclidean_distances(counts[0], counts[1]))
print('Distance between 1st and 3rd documents:', euclidean_distances(counts[0], counts[2]))
print('Distance between 2nd and 3rd documents:', euclidean_distances(counts[1], counts[2]))

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'basketball': 1, 'game': 3, 'ate': 0, 'sandwich': 6, 'played': 5, 'duke': 2, 'unc': 7, 'lost': 4}
Distance between 1st and 2nd documents: [[ 2.]]
Distance between 1st and 3rd documents: [[ 2.44948974]]
Distance between 2nd and 3rd documents: [[ 2.44948974]]


In [14]:
# Stemming and lemmatization
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Baseline with neither stemming nor lemmatization: inflected forms of the
# same word ('sandwiches'/'sandwich', 'ate'/'eaten') end up as separate,
# unrelated features, so the two documents share no columns.
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him',
]

vectorizer = CountVectorizer(stop_words='english', binary=True)
print(vectorizer.fit_transform(corpus).toarray())
print(vectorizer.vocabulary_)

# Sentences that use 'gathering' first as a verb and then as a noun.
# They give context only: the calls below operate on the bare word.
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.',
]

# Lemmatization depends on the part of speech passed in:
# as a verb the lemma is the base form, as a noun the word is kept as is.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('gathering', pos='v'))
print(lemmatizer.lemmatize('gathering', pos='n'))

# Stemming is rule-based suffix stripping and takes no part of speech.
stemmer = PorterStemmer()
print(stemmer.stem('gathering'))


[[1 0 0 1]
 [0 1 1 0]]
{'sandwiches': 3, 'sandwich': 2, 'ate': 0, 'eaten': 1}
gather
gathering
gather


In [19]:
# Lemmatization example
#
# One-time setup: the NLTK tokenizer, POS-tagger and WordNet data must be
# installed before running this cell, e.g. `import nltk; nltk.download('all')`.
from nltk import pos_tag
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# WordNet part-of-speech codes that lemmatize() handles: nouns and verbs.
wordnet_tags = ['n', 'v']
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]

stemmer = PorterStemmer()
# Bound before lemmatize() is defined so the helper does not rely on a name
# that only appears further down the cell.
lemmatizer = WordNetLemmatizer()


def lemmatize(token, tag):
    """Return the WordNet lemma of ``token`` when ``tag`` marks a noun or verb.

    Args:
        token: The word to lemmatize.
        tag: Part-of-speech tag for ``token`` as returned by ``pos_tag``.
            Only its first character, lower-cased, is inspected.

    Returns:
        The lemma from ``lemmatizer`` if the first letter of ``tag`` is one of
        ``wordnet_tags``; otherwise ``token`` unchanged (including when
        ``tag`` is empty).
    """
    # Slicing instead of tag[0] keeps an empty tag from raising IndexError.
    pos = tag[:1].lower()
    if pos in wordnet_tags:
        return lemmatizer.lemmatize(token, pos)
    return token


print('Stemmed:', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus])

# Tag each token with its part of speech so the lemmatizer knows which
# WordNet category to look the word up in.
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
print('Lemmatized:', [[lemmatize(token, tag) for token, tag in document] for document in tagged_corpus])


Stemmed: [['He', 'ate', 'the', 'sandwich'], ['Everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]
Lemmatized: [['He', 'eat', 'the', 'sandwich'], ['Every', 'sandwich', 'be', 'eat', 'by', 'him']]
