# Extracting features from categorical variables

In [1]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
# One categorical feature ('city') per sample, expressed as a feature dict.
X = [
    {'city': city_name}
    for city_name in ('New York', 'San Francisco', 'Chapel Hill', 'Manipal')
]

# DictVectorizer turns each distinct 'city' value into its own indicator column.
onehot_encoder = DictVectorizer()
encoded_cities = onehot_encoder.fit_transform(X)
print(encoded_cities.toarray())

[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


# Standardizing features

In [6]:
from sklearn import preprocessing
import numpy as np

In [11]:
# Three samples (rows) described by six numeric features (columns).
sample_rows = [
    [0, 0, 5, 13, 9, 1],
    [0, 0, 13, 15, 10, 15],
    [0, 3, 15, 2, 0, 11],
]
X = np.array(sample_rows)

# Standardize the features and show the rescaled matrix.
X_scaled = preprocessing.scale(X)
print(X_scaled)

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]




# Extracting features from text

In [12]:
# Two short documents that share part of their vocabulary; later cells
# vectorize this list and append to it, so it must stay a mutable list.
corpus = [
    "UNC played Duke in basketball",
    "Duke lost the basketball game",
]

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Bag-of-words: one row per document, one column per vocabulary term.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(corpus)
print(term_counts.todense())
# Mapping from each term to its column index in the matrix above.
print(vectorizer.vocabulary_)

[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 1, 'in': 3, 'basketball': 0, 'lost': 4, 'the': 6, 'game': 2}


In [20]:
# Add a third document that shares no words with the first two.
# The membership guard keeps this cell idempotent: re-running it must not
# append the sentence again, which would silently change the counts and
# distances computed by the cells below.
third_document = 'I ate a sandwich'
if third_document not in corpus:
    corpus.append(third_document)

print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


In [21]:
from sklearn.metrics.pairwise import euclidean_distances

# Compare documents by the Euclidean distance between their count vectors.
# `.toarray()` gives a plain ndarray; `.todense()` would give an np.matrix,
# which scikit-learn's input validation rejects in recent releases.
X = vectorizer.fit_transform(corpus).toarray()

# Slice with ranges (X[i:i+1]) so each document stays a 2-D, single-row array,
# which is the shape euclidean_distances expects for its arguments.
print('Distance between 1st and 2nd documents:', euclidean_distances(X[0:1], X[1:2]))
print('Distance between 1st and 3rd documents:', euclidean_distances(X[0:1], X[2:3]))
print('Distance between 2nd and 3rd documents:', euclidean_distances(X[1:2], X[2:3]))

Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 2nd and 3rd documents: [[2.64575131]]


In [23]:
# Stop word filtering: drop common English function words before counting.
vectorizer = CountVectorizer(stop_words='english')
filtered_counts = vectorizer.fit_transform(corpus)
print(filtered_counts.todense())
print(vectorizer.vocabulary_)

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 2, 'basketball': 1, 'lost': 4, 'game': 3, 'ate': 0, 'sandwich': 6}


In [24]:
# Stemming and lemmatization
# Without stemming or lemmatization, inflected forms of the same word
# ('sandwich'/'sandwiches', 'ate'/'eaten') end up as separate features.
corpus = [
    "He ate the sandwiches",
    "Every sandwich was eaten by him",
]
vectorizer = CountVectorizer(stop_words='english', binary=True)
presence_matrix = vectorizer.fit_transform(corpus)
print(presence_matrix.todense())
print(vectorizer.vocabulary_)

[[1 0 0 1]
 [0 1 1 0]]
{'ate': 0, 'sandwiches': 3, 'sandwich': 2, 'eaten': 1}


In [25]:
# 'gathering' is used as a verb in the first sentence and as a noun in the
# second; the lemmatizer/stemmer cells that follow illustrate that contrast.
# (Fixed the 'gatheringg' typo, which broke the example.)
corpus = [
    'I am gathering ingredients for sandwich.',
    'There were many wizards at the gathering.'
]

In [32]:
from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatize the same surface form under two part-of-speech hints:
# first as a verb ('v'), then as a noun ('n').
lemmatizer = WordNetLemmatizer()
for pos_hint in ('v', 'n'):
    print(lemmatizer.lemmatize('gathering', pos_hint))

gather
gathering


In [33]:
from nltk.stem import PorterStemmer

# Porter stemming takes no part-of-speech hint; it works on the bare token.
stemmer = PorterStemmer()
gathering_stem = stemmer.stem('gathering')
print(gathering_stem)

gather


In [38]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag 

# Part-of-speech letters (noun, verb) relevant to the lemmatization step below.
wordnet_tags = ['n', 'v']

corpus = [
    "He ate the sandwiches",
    "Every sandwich was eaten by him",
]

# Tokenize every document first, then stem token by token.
stemmer = PorterStemmer()
tokenized_corpus = [word_tokenize(document) for document in corpus]
stemmed_corpus = [[stemmer.stem(token) for token in tokens] for tokens in tokenized_corpus]
print('Stemmed:', stemmed_corpus)

def lemmatize(token, tag):
    """Lemmatize ``token`` when its part-of-speech tag is a noun or verb tag.

    The first character of ``tag``, lower-cased, is used as the
    part-of-speech letter. If that letter is listed in the module-level
    ``wordnet_tags`` the token is passed to the module-level ``lemmatizer``
    with that letter; otherwise the token is returned unchanged.

    Args:
        token: The word to lemmatize.
        tag: The part-of-speech tag string for ``token`` (as paired with
            tokens by ``pos_tag`` in this notebook).

    Returns:
        The lemmatizer's result for supported tags, else ``token`` itself.
    """
    # tag[:1] instead of tag[0]: an empty tag falls through to "return token"
    # rather than raising IndexError.
    pos_letter = tag[:1].lower()
    # Use the shared wordnet_tags list instead of a duplicated literal.
    if pos_letter in wordnet_tags:
        return lemmatizer.lemmatize(token, pos_letter)
    return token

# Tag each token with its part of speech, then lemmatize using that tag.
lemmatizer = WordNetLemmatizer()
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
lemmatized_corpus = [
    [lemmatize(token, tag) for (token, tag) in tagged_document]
    for tagged_document in tagged_corpus
]
print('Lemmatized:', lemmatized_corpus)

Stemmed: [['He', 'ate', 'the', 'sandwich'], ['everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]
Lemmatized: [['He', 'eat', 'the', 'sandwich'], ['Every', 'sandwich', 'be', 'eat', 'by', 'him']]


In [42]:
#Extending bag-of-words with tf-idf vectorizer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# A single document in which some words repeat, to show raw term frequencies.
corpus = ['The dog ate a sandwich, the wizard transfigured  a sandwich, and I ate a sandwich']
vectorizer = CountVectorizer(stop_words='english')

# Only one document, so take the first (and only) row of the count matrix.
count_matrix = vectorizer.fit_transform(corpus)
frequencies = count_matrix.toarray()[0]
print(frequencies)

[2 1 3 1 1]


In [44]:
# Report the term -> column mapping, then look up each term's count by its column.
print('Token indices {}'.format(vectorizer.vocabulary_))
for token, index in vectorizer.vocabulary_.items():
    occurrences = frequencies[index]
    print('The token "{}" appears {} times'.format(token, occurrences))

Token indices {'dog': 1, 'ate': 0, 'sandwich': 2, 'wizard': 4, 'transfigured': 3}
The token "dog" appears 1 times
The token "ate" appears 2 times
The token "sandwich" appears 3 times
The token "wizard" appears 1 times
The token "transfigured" appears 1 times


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two documents that share only the term 'sandwich' once stop words are removed.
corpus = [
    "The dog ate a sandwich and I ate a sandwich",
    "The wizard transfigured a sandwich",
]

# Weight the term counts with tf-idf instead of using raw frequencies.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)
print(tfidf_matrix.todense())

[[0.75458397 0.37729199 0.53689271 0.         0.        ]
 [0.         0.         0.44943642 0.6316672  0.6316672 ]]


In [48]:
#Space-efficient feature vectorizing with the hashing trick
from sklearn.feature_extraction.text import HashingVectorizer
# Four single-word documents, hashed into a fixed six-column feature space.
corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer = HashingVectorizer(n_features=6)

# transform() is called directly here; no prior fit step is used in this cell.
hashed_features = vectorizer.transform(corpus)
print(hashed_features.todense())

[[-1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.]]
