In [1]:
# Chapter 3: Feature extraction and preprocessing

# One-hot encoding of a categorical feature with the DictVectorizer class
from sklearn.feature_extraction import DictVectorizer

# Every instance is a dict mapping a feature name to its categorical value.
instances = [
    {'city': city_name}
    for city_name in ('New York', 'San Fransisco', 'Chapel Hill')
]
onehot_encoder = DictVectorizer()
encoded_instances = onehot_encoder.fit_transform(instances)
# The encoder returns a sparse matrix; densify it only for display.
print(encoded_instances.toarray())

[[ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]]


In [2]:
# Bag-of-words

from sklearn.feature_extraction.text import CountVectorizer

# Basic corpus
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]
vectorizer = CountVectorizer()
# Vectorize the corpus once and reuse the result for both the printout and
# the distance computation. toarray() yields a plain ndarray instead of the
# np.matrix returned by todense(), which recent scikit-learn releases reject
# as estimator/metric input.
term_counts = vectorizer.fit_transform(corpus).toarray()
print(term_counts)
print(vectorizer.vocabulary_)

# Compute distances
from sklearn.metrics.pairwise import euclidean_distances
# euclidean_distances expects 2-D inputs, so keep every document as a
# (1, n_features) row vector.
counts = [row.reshape(1, -1) for row in term_counts]
print('Distance between 1st and 2nd documents:', euclidean_distances(counts[0], counts[1]))
print('Distance between 1st and 3rd documents:', euclidean_distances(counts[0], counts[2]))
print('Distance between 2nd and 3rd documents:', euclidean_distances(counts[1], counts[2]))

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'played': 6, 'in': 4, 'duke': 2, 'lost': 5, 'sandwich': 7, 'the': 8, 'game': 3, 'unc': 9, 'basketball': 1, 'ate': 0}
Distance between 1st and 2nd documents: [[ 2.44948974]]
Distance between 1st and 3rd documents: [[ 2.64575131]]
Distance between 2nd and 3rd documents: [[ 2.64575131]]


In [3]:
# Bag-of-words with stop-word filtering

from sklearn.feature_extraction.text import CountVectorizer

# Basic corpus
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]

vectorizer = CountVectorizer(stop_words='english') # stop-words on
# Vectorize the corpus once and reuse the result for both the printout and
# the distance computation. toarray() yields a plain ndarray instead of the
# np.matrix returned by todense(), which recent scikit-learn releases reject
# as estimator/metric input.
term_counts = vectorizer.fit_transform(corpus).toarray()
print(term_counts)
print(vectorizer.vocabulary_)

# Compute distances
from sklearn.metrics.pairwise import euclidean_distances
# euclidean_distances expects 2-D inputs, so keep every document as a
# (1, n_features) row vector.
counts = [row.reshape(1, -1) for row in term_counts]
print('Distance between 1st and 2nd documents:', euclidean_distances(counts[0], counts[1]))
print('Distance between 1st and 3rd documents:', euclidean_distances(counts[0], counts[2]))
print('Distance between 2nd and 3rd documents:', euclidean_distances(counts[1], counts[2]))

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'played': 5, 'duke': 2, 'lost': 4, 'sandwich': 6, 'game': 3, 'unc': 7, 'basketball': 1, 'ate': 0}
Distance between 1st and 2nd documents: [[ 2.]]
Distance between 1st and 3rd documents: [[ 2.44948974]]
Distance between 2nd and 3rd documents: [[ 2.44948974]]


In [4]:
# Stemming and lemmatization
from sklearn.feature_extraction.text import CountVectorizer

# Example without stemming or lemmatization: inflected forms of the same
# word ('sandwich'/'sandwiches', 'ate'/'eaten') end up in separate columns.
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]

vectorizer = CountVectorizer(binary=True, stop_words='english')
binary_counts = vectorizer.fit_transform(corpus)
print(binary_counts.todense())
print(vectorizer.vocabulary_)

# Example with stemming and lemmatization
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.'
]

# Lemmatization depends on the part of speech: the same surface form is
# looked up once as a verb and once as a noun.
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('gathering', pos='v'))
print(lemmatizer.lemmatize('gathering', pos='n'))

# Stemming takes no part-of-speech argument.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(stemmer.stem('gathering'))


[[1 0 0 1]
 [0 1 1 0]]
{'sandwiches': 3, 'sandwich': 2, 'ate': 0, 'eaten': 1}
gather
gathering
gather


In [7]:
# Lemmatization example
# Uncomment the next two lines if the required NLTK data is not installed yet.
#import nltk
#nltk.download('all')
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
wordnet_tags = ['n', 'v']
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]

# Stem every token of every document, keeping one token list per document.
stemmer = PorterStemmer()
stemmed_corpus = [
    [stemmer.stem(token) for token in word_tokenize(document)]
    for document in corpus
]
print('Stemmed:', stemmed_corpus)

def lemmatize(token, tag):
    """Lemmatize ``token`` when its POS tag denotes a noun or a verb.

    Only the first character of ``tag`` is inspected (case-insensitively).
    If it is 'n' or 'v', the token is passed to the module-level
    ``lemmatizer`` together with that one-letter part of speech; any other
    token is returned unchanged.

    Parameters:
        token: the word to lemmatize.
        tag: non-empty part-of-speech tag string for ``token``.

    Returns:
        The result of ``lemmatizer.lemmatize`` for noun/verb tags, otherwise
        ``token`` itself.
    """
    pos = tag[0].lower()
    if pos not in ['n', 'v']:
        return token
    return lemmatizer.lemmatize(token, pos)

lemmatizer = WordNetLemmatizer()
# Tokenize and POS-tag each document; lemmatize() needs the tag of every token.
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
lemmatized_corpus = [
    [lemmatize(token, tag) for token, tag in tagged_document]
    for tagged_document in tagged_corpus
]
print('Lemmatized:', lemmatized_corpus)

Stemmed: [['He', 'ate', 'the', 'sandwich'], ['Everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]
Lemmatized: [['He', 'eat', 'the', 'sandwich'], ['Every', 'sandwich', 'be', 'eat', 'by', 'him']]


In [9]:
# Bag-of-words with TF-IDF weights
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts for a single document (English stop words removed).
corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
vectorizer = CountVectorizer(stop_words='english')
term_counts = vectorizer.fit_transform(corpus)
print(term_counts.todense())
print(vectorizer.vocabulary_)

# TF-IDF weighted vectors for a two-document corpus.
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich'
]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_weights = vectorizer.fit_transform(corpus)
print(tfidf_weights.todense())

[[2 1 3 1 1]]
{'transfigured': 3, 'dog': 1, 'sandwich': 2, 'ate': 0, 'wizard': 4}
[[ 0.75458397  0.37729199  0.53689271  0.          0.        ]
 [ 0.          0.          0.44943642  0.6316672   0.6316672 ]]


In [11]:
# Hashing trick
from sklearn.feature_extraction.text import HashingVectorizer
corpus = ['the', 'ate', 'bacon', 'cat']
# Each document is mapped to a fixed-width vector of n_features columns.
vectorizer = HashingVectorizer(n_features=6)
hashed_features = vectorizer.fit_transform(corpus)
print(hashed_features.todense())

[[-1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.]]


In [15]:
# Computer vision example
from sklearn import datasets
digits = datasets.load_digits()
first_image = digits.images[0]
print('Digit: ', digits.target[0])
print(first_image)
# Flatten the image into a single row of 64 values to use it as a feature vector.
print('Feature vector:\n', first_image.reshape(-1, 64))

Digit:  0
[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]
Feature vector:
 [[  0.   0.   5.  13.   9.   1.   0.   0.   0.   0.  13.  15.  10.  15.
    5.   0.   0.   3.  15.   2.   0.  11.   8.   0.   0.   4.  12.   0.
    0.   8.   8.   0.   0.   5.   8.   0.   0.   9.   8.   0.   0.   4.
   11.   0.   1.  12.   7.   0.   0.   2.  14.   5.  10.  12.   0.   0.
    0.   0.   6.  13.  10.   0.   0.   0.]]


In [17]:
# Extracting points of interest as features
import numpy as np
from skimage.feature import corner_harris, corner_peaks
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
import skimage.io as io
from skimage.exposure import equalize_hist

def show_corners(corners, image):
    """Show ``image`` in grayscale with the given corner points marked in red.

    Parameters:
        corners: iterable of ``(row, column)`` pairs, one per corner. It may
            be empty, in which case only the image is shown.
        image: array-like image; ``image.shape[0]`` and ``image.shape[1]``
            are used as the height and width of the plotted area.

    The figure is enlarged to 1.5 times its default size and displayed with
    ``plt.show()``; nothing is returned.
    """
    fig = plt.figure()
    plt.gray()
    plt.imshow(image)
    # Materialize once so the emptiness check works for any iterable.
    corner_points = list(corners)
    # zip(*[]) produces nothing to unpack, so only draw the overlay when at
    # least one corner was supplied.
    if corner_points:
        y_corner, x_corner = zip(*corner_points)
        plt.plot(x_corner, y_corner, 'or')
    plt.xlim(0, image.shape[1])
    # Inverted y-limits keep row 0 at the top of the plot.
    plt.ylim(image.shape[0], 0)
    fig.set_size_inches(np.array(fig.get_size_inches()) * 1.5)
    plt.show()
    
####### TO BE COMPLETED #######

In [19]:
# Data standardization example
from sklearn import preprocessing
import numpy as np
# Three samples (rows) with six features (columns) each.
X = np.array([
    [0., 0., 5., 13., 9., 1.],
    [0., 0., 13., 15., 10., 15.],
    [0., 3., 15., 2., 0., 11.]
])
X_scaled = preprocessing.scale(X)
print(X_scaled)

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]
