# Guessing the number: linear regression

## Using more variables

In [15]:
# Load the Boston housing data; the feature matrix is passed through scale()
# before modelling, while the target vector is used untouched.
from sklearn.datasets import load_boston
from sklearn.preprocessing import scale

boston = load_boston()
X = scale(boston.data)
y = boston.target
# Show the dimensions of the feature matrix and of the target vector.
print('%s %s' % (X.shape, y.shape))

(506L, 13L) (506L,)


In [16]:
# Fit a linear regression on all the (scaled) features at once.
from sklearn.linear_model import LinearRegression
regression = LinearRegression(normalize=True).fit(X, y)
# Bare expression: the notebook echoes the fitted estimator as the cell output.
regression

LinearRegression(copy_X=True, fit_intercept=True, normalize=True)

In [17]:
# score() is evaluated on the same X, y that were used for fitting,
# so this is an in-sample figure rather than a held-out estimate.
print(regression.score(X, y))

0.740607742865


In [25]:
# Pair every feature name with its fitted coefficient (rounded to one decimal)
# and print the resulting 'NAME:value' labels as a single list.
coef_labels = []
for name, weight in zip(boston.feature_names, regression.coef_):
    coef_labels.append(name + ':' + str(round(weight, 1)))
print(coef_labels)

['CRIM:-0.9', 'ZN:1.1', 'INDUS:0.1', 'CHAS:0.7', 'NOX:-2.1', 'RM:2.7', 'AGE:0.0', 'DIS:-3.1', 'RAD:2.7', 'TAX:-2.1', 'PTRATIO:-2.1', 'B:0.9', 'LSTAT:-3.7']


In [26]:
# Print the dataset's bundled description to see what each feature name means.
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

## Understanding limitations and potential problems

# Moving to Logistic Regression

## Applying logistic regression

In [89]:
# Load iris and keep every row except the very last one for training;
# the final sample is held out so it can be predicted afterwards.
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data[:-1, :]
y = iris.target[:-1]

In [97]:
# Fit a logistic regression on all iris samples but the last, then predict
# the class and the per-class probabilities of that held-out sample.
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X, y)
# Slice with -1: (not -1) so the held-out sample stays two-dimensional, i.e.
# one row by n_features columns. predict()/predict_proba() expect a 2-D array;
# passing a bare 1-D feature vector is deprecated and later rejected by
# newer scikit-learn releases. The printed result is unchanged.
held_out_sample = iris.data[-1:, :]
print('Predicted class %s, real class %s' % (logistic.predict(held_out_sample), iris.target[-1]))
# Wrap the array in a one-element tuple so % formats it as a single value.
print('Probabilities for each class from 0 to 2: %s' % (logistic.predict_proba(held_out_sample),))

Predicted class [2], real class 2
Probabilities for each class from 0 to 2: [[ 0.00168787  0.28720074  0.71111138]]


## Considering when classes are more than two

In [109]:
from sklearn.datasets import load_digits
digits = load_digits()
# The first 1700 samples are the training set; everything after is the test set.
X, tX = digits.data[:1700, :], digits.data[1700:, :]
y, ty = digits.target[:1700], digits.target[1700:]

In [110]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
# Both multiclass wrappers are built around the `logistic` estimator created
# in the earlier logistic-regression cell, and compared on the test split.
OVR = OneVsRestClassifier(logistic)
OVR.fit(X, y)
OVO = OneVsOneClassifier(logistic)
OVO.fit(X, y)
print('One vs rest accuracy: %.3f' % OVR.score(tX, ty))
print('One vs one accuracy: %.3f' % OVO.score(tX, ty))

One vs rest accuracy: 0.938
One vs one accuracy: 0.969




In [111]:
# A plain LogisticRegression fitted directly on the ten-class digits data,
# for comparison with the explicit wrappers above.
LR = LogisticRegression().fit(X, y)
print('One vs rest accuracy: %.3f' % LR.score(tX, ty))

One vs rest accuracy: 0.938


# Making Things as Simple as Naïve Bayes

## Predicting text classifications

In [98]:
from sklearn.datasets import fetch_20newsgroups
# The same parts of each post are removed from both subsets, so the train and
# test texts are prepared identically.
stripped_parts = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=stripped_parts)
newsgroups_test = fetch_20newsgroups(subset='test', remove=stripped_parts)

In [111]:
print('number of posts in training: %i' % len(newsgroups_train.data))
# Distinct tokens are collected as dictionary keys (every value is True).
# Posts are split on the single space character only, so a token may still
# contain newlines, tabs or punctuation.
D = dict.fromkeys(
    (word for post in newsgroups_train.data for word in post.split(' ')),
    True)
print('number of distinct words in training: %i' % len(D))
print('number of posts in test: %i' % len(newsgroups_test.data))

number of posts in training: 11314
number of distinct words in training: 300972
number of posts in test: 7532


In [42]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
# Both Naive Bayes variants are created with the same alpha value.
nb_params = {'alpha': 0.01}
Bernoulli = BernoulliNB(**nb_params)
Multinomial = MultinomialNB(**nb_params)

In [43]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
# The two vectorizers are configured identically except for `binary`:
# False for the multinomial model, True for the Bernoulli model.
shared_options = dict(stop_words='english', norm=None, non_negative=True)
multinomial_hashing_trick = HashingVectorizer(binary=False, **shared_options)
binary_hashing_trick = HashingVectorizer(binary=True, **shared_options)

In [103]:
from sklearn.metrics import accuracy_score
# Train each model on the training posts transformed by its own vectorizer.
Multinomial.fit(multinomial_hashing_trick.transform(newsgroups_train.data), newsgroups_train.target)
Bernoulli.fit(binary_hashing_trick.transform(newsgroups_train.data), newsgroups_train.target)
# Evaluate every (model, vectorizer) pair on the test posts.
for model, hasher in [(Bernoulli, binary_hashing_trick), (Multinomial, multinomial_hashing_trick)]:
    predicted = model.predict(hasher.transform(newsgroups_test.data))
    accuracy = accuracy_score(y_true=newsgroups_test.target, y_pred=predicted)
    print('Accuracy for %s: %.3f' % (model, accuracy))

Accuracy for BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True): 0.570
Accuracy for MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True): 0.651


In [51]:
# (Re)load the Boston housing dataset into `boston`.
from sklearn.datasets import load_boston
boston = load_boston()

In [95]:
# numpy and pandas are used below but are not imported in any cell shown in
# this notebook; import them here so the cell runs on a fresh kernel instead
# of failing with a NameError on `pd` / `np`.
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
Gaussian = GaussianNB()
# Turn the continuous target into 4 bins; labels=False makes pd.cut return
# integer bin codes, which serve as class labels for the classifier.
y_ord = pd.cut(boston.target, bins=4, labels=False)
Gaussian.fit(boston.data, y_ord)
# Correlation between the predicted bin and the original continuous target,
# computed on the same data the model was fitted on (in-sample).
print(np.corrcoef(Gaussian.predict(boston.data), boston.target)[0, 1])

0.734907024299


# Exploring Lazy Learning with K-nearest Neighbors

## Predicting after observing neighbors

In [98]:
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
digits = load_digits()
# Same 1700 / remainder split as before. The PCA is fitted on the training
# rows only and then applied to both splits, keeping 25 components.
train_rows, test_rows = digits.data[:1700, :], digits.data[1700:, :]
pca = PCA(n_components=25).fit(train_rows)
X, tX = pca.transform(train_rows), pca.transform(test_rows)
y, ty = digits.target[:1700], digits.target[1700:]

In [99]:
from sklearn.neighbors import KNeighborsClassifier
# Five neighbours with p=2, fitted on the PCA-reduced training digits.
kNN = KNeighborsClassifier(n_neighbors=5, p=2).fit(X, y)
# Bare expression: the notebook echoes the fitted estimator as the cell output.
kNN

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')

In [100]:
# Test-set accuracy, followed by a side-by-side look at the first ten
# predictions and their true labels.
print('Accuracy: %.3f' % kNN.score(tX, ty))
predicted = kNN.predict(tX[:10, :])
actual = ty[:10]
print('Prediction: %s actual: %s' % (predicted, actual))

Accuracy: 0.990
Prediction: [5 6 5 0 9 8 9 8 4 1] actual: [5 6 5 0 9 8 9 8 4 1]


## Choosing your k parameter wisely

In [78]:
# Refit the classifier for a range of neighbourhood sizes and report the
# test accuracy of each; `kNN` ends up bound to the last model (k=200).
for k in (1, 5, 10, 20, 50, 100, 200):
    kNN = KNeighborsClassifier(n_neighbors=k)
    kNN.fit(X, y)
    print('for k=%3i accuracy is %.3f' % (k, kNN.score(tX, ty)))

for k=  1 accuracy is 0.979
for k=  5 accuracy is 0.990
for k= 10 accuracy is 0.969
for k= 20 accuracy is 0.969
for k= 50 accuracy is 0.959
for k=100 accuracy is 0.959
for k=200 accuracy is 0.907
