# Defining applications for data science

http://scikit-learn.org/stable/developers/<BR>
http://scikit-learn.org/stable/faq.html<BR>

In [1]:
from sklearn.datasets import load_boston
# Load the Boston data set and split it into predictors (X) and target (y)
boston = load_boston()
X = boston.data
y = boston.target
# Show both shapes on one line, separated by a space
print('%s %s' % (X.shape, y.shape))

(506L, 13L) (506L,)


In [2]:
from sklearn.linear_model import LinearRegression
# Build the linear model and fit it on the full data set in one step;
# fit() hands back the estimator itself, so it can be bound directly.
hypothesis = LinearRegression(normalize=True).fit(X, y)
# Last expression of the cell: displays the fitted estimator's parameters
hypothesis

LinearRegression(copy_X=True, fit_intercept=True, normalize=True)

In [3]:
# One fitted coefficient per input feature
print(hypothesis.coef_)

[ -1.07170557e-01   4.63952195e-02   2.08602395e-02   2.68856140e+00
  -1.77957587e+01   3.80475246e+00   7.51061703e-04  -1.47575880e+00
   3.05655038e-01  -1.23293463e-02  -9.53463555e-01   9.39251272e-03
  -5.25466633e-01]


In [4]:
import numpy as np
# A single new observation: one value for each of the 13 features
new_observation = np.array([1,0,1,0,0.5,7,59,6,3,200,20,350,4],dtype=float)
# Estimators expect a 2-D (n_samples, n_features) array, and recent
# scikit-learn releases reject 1-D input. Reshape the lone sample into a
# single row and unwrap the one prediction that comes back.
print(hypothesis.predict(new_observation.reshape(1, -1))[0])

25.8972783977


In [5]:
# Score the fitted model on the same data it was trained on (in-sample fit)
hypothesis.score(X,y)

0.74060774286494291

In [6]:
#help(LinearRegression)
from sklearn.preprocessing import MinMaxScaler
# Learn per-feature minimum and maximum from X so each feature maps to [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
# Transformers expect a 2-D (n_samples, n_features) array, and recent
# scikit-learn releases reject 1-D input. Pass the single observation as
# one row and take that row back out so a flat vector is printed.
print(scaler.transform(new_observation.reshape(1, -1))[0])

[ 0.01116872  0.          0.01979472  0.          0.23662551  0.65893849
  0.57775489  0.44288845  0.08695652  0.02480916  0.78723404  0.88173887
  0.06263797]


# Performing the Hashing Trick

## Using hash functions

In [7]:
# The raw hash of a string is an arbitrary (possibly negative) integer
print(hash('Python'))
# Taking the absolute value modulo 1000 folds it into the range 0..999
print(abs(hash('Python')) % 1000)

-539294296
296


## Demonstrating the hashing trick

In [8]:
# Two short example documents that share the words "Python" and "for"
string_1, string_2 = 'Python for data science', 'Python for machine learning'

def hashing_trick(input_string, vector_size=20):
    """Encode a string as a fixed-length binary vector via the hashing trick.

    Every whitespace-separated word is hashed and reduced modulo
    ``vector_size`` to select a position, which is then set to 1. Different
    words may collide on the same position.

    Parameters
    ----------
    input_string : str
        Text to encode; words are separated by whitespace.
    vector_size : int, optional
        Length of the returned vector (default 20).

    Returns
    -------
    list of int
        ``vector_size`` entries: 1 where at least one word landed, else 0.
    """
    feature_vector = [0] * vector_size
    # split() with no argument discards the empty tokens that split(' ')
    # produces for empty input or repeated/leading/trailing spaces; each of
    # those empty strings would otherwise switch on a spurious feature.
    for word in input_string.split():
        # NOTE: built-in str hashes are only stable within one interpreter
        # run when hash randomization is active (PYTHONHASHSEED), so the
        # positions can differ between sessions.
        index = abs(hash(word)) % vector_size
        feature_vector[index] = 1
    return feature_vector

# Encode the two example sentences defined at the top of this cell instead
# of repeating their text as literals.
print(hashing_trick(input_string=string_1, vector_size=20))
print(hashing_trick(input_string=string_2, vector_size=20))

[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0]


In [9]:
from scipy.sparse import csc_matrix
# Build the sparse matrix from the hashed vector of the first example
# sentence rather than from a hand-copied literal, so this cell always
# agrees with what hashing_trick actually returns.
print(csc_matrix(hashing_trick(input_string=string_1, vector_size=20)))

  (0, 0)	1
  (0, 5)	1
  (0, 16)	1
  (0, 18)	1


In [10]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html
from sklearn.feature_extraction.text import HashingVectorizer
# 20 output columns, binary indicators, no row normalisation
sklearn_hashing_trick = HashingVectorizer(
    n_features=20, binary=True, norm=None)
# Only transform() is called: the vectorizer is used without any fit step
hashed_text = sklearn_hashing_trick.transform([
    'Python for data science',
    'Python for machine learning'])
# Last expression of the cell: displays the sparse matrix summary
hashed_text

<2x20 sparse matrix of type '<type 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn a vocabulary from the two sentences and encode them in one call
one_hot_enconder = CountVectorizer()
one_hot_enconded = one_hot_enconder.fit_transform([
    'Python for data science',
    'Python for machine learning'])

In [12]:
# Mapping from each learned word to its column index
print(one_hot_enconder.vocabulary_)

{u'machine': 3, u'learning': 2, u'for': 1, u'python': 4, u'science': 5, u'data': 0}


In [13]:
# The hashing vectorizer was never fit on a vocabulary, so previously unseen
# text is encoded into the same 20 columns as before.
sklearn_hashing_trick.transform(['New text has arrived'])

<1x20 sparse matrix of type '<type 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [14]:
# NOTE: fit_transform re-learns the vocabulary from this single text, replacing
# the six-word vocabulary fitted earlier -- hence the 1x4 result. Call
# transform() instead to encode against the existing vocabulary.
one_hot_enconder.fit_transform(['New text has arrived'])

<1x4 sparse matrix of type '<type 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

# Performance testing

In [15]:
# Time building a million-element list with a list comprehension;
# %timeit picks the loop and repeat counts on its own here.
%timeit l = [k for k in range(10**6)]

10 loops, best of 3: 96.1 ms per loop


In [16]:
# Same statement with explicit counts: -n 20 loops per run, -r 5 runs
%timeit -n 20 -r 5 l = [k for k in range(10**6)]

20 loops, best of 5: 95.7 ms per loop


In [17]:
%%timeit 
# Build the same million-element list with an explicit loop and append
l = list()
for k in range(10**6):
    l.append(k)

1 loops, best of 3: 163 ms per loop


In [18]:
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
# Shared corpus and fresh vectorizers for the timing cells that follow
texts = ['Python for data science', 'Python for machine learning']
one_hot_enconder = CountVectorizer()
sklearn_hashing_trick = HashingVectorizer(n_features=20, binary=True, norm=None)

In [19]:
# Time learning the vocabulary and encoding the texts with CountVectorizer
%timeit one_hot_enconded = one_hot_enconder.fit_transform(texts)

1000 loops, best of 3: 1.25 ms per loop


In [20]:
# Time encoding the same texts with the HashingVectorizer (transform only)
%timeit  hashing = sklearn_hashing_trick.transform(texts)

10000 loops, best of 3: 154 µs per loop


In [21]:
import timeit
# timeit.timeit returns the total time for all `number` executions; the
# setup string pulls the objects to be timed out of the notebook namespace.
cumulative_time = timeit.timeit(
    stmt="hashing = sklearn_hashing_trick.transform(texts)",
    setup="from __main__ import sklearn_hashing_trick, texts",
    number=10000)
# Average time per execution
print(cumulative_time / 10000.0)

0.000155533324034


## Memory profiler

In [22]:
# Installation procedures from the command line:
# pip install psutil
# pip install memory_profiler

In [23]:
# Initialization from IPython (to be repeated at every IPython start)
%load_ext memory_profiler

In [24]:
# Encode the texts into a sparse matrix first ...
hashing = sklearn_hashing_trick.transform(texts)
# ... then let %memit report memory usage while it is converted to a dense array
%memit dense_hashing = hashing.toarray()

peak memory: 81.66 MiB, increment: 0.11 MiB


In [25]:
%%writefile example_code.py
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
def comparison_test():
    """Encode two short texts with CountVectorizer and HashingVectorizer.

    Takes no arguments and returns nothing: both results are only bound to
    local names inside the function.
    """
    # Inputs and encoders first ...
    texts = ['Python for data science','Python for machine learning']
    one_hot_enconder = CountVectorizer()
    sklearn_hashing_trick = HashingVectorizer(n_features=20, binary=True, norm=None)
    # ... then the two encodings, one statement each
    one_hot_enconded = one_hot_enconder.fit_transform(texts)
    hashing = sklearn_hashing_trick.transform(texts)

Overwriting example_code.py


In [26]:
# Import the function from the file written by the %%writefile cell above,
# then execute it under %mprun; -f names the function to be profiled.
from example_code import comparison_test
%mprun -f comparison_test comparison_test()

('',)


# Demonstrating multiprocessing techniques

In [None]:
from sklearn.datasets import load_digits
from sklearn.svm import SVC
# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# the older sklearn.cross_validation module was deprecated and later removed.
# Try the current location first and fall back for older installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

digits = load_digits()
# X and y now refer to the digits data, replacing the Boston arrays used earlier
X, y = digits.data,digits.target

In [None]:
# Time a 20-fold cross-validation of an SVC restricted to one job (n_jobs=1)
%timeit single_core_learning = cross_val_score(SVC(), X, y, cv=20, n_jobs=1)

In [None]:
# Same 20-fold cross-validation with n_jobs=-1 (run the folds in parallel jobs)
%timeit multi_core_learning = cross_val_score(SVC(), X, y, cv=20, n_jobs=-1)