In [None]:
#Classification using Scikit-Learn

In [13]:
import numpy as np

from bokeh.plotting import figure, show, save

from bokeh.io import output_notebook, output_file
import pandas as pd

In [14]:
output_notebook()

In [16]:
from sklearn.preprocessing import StandardScaler
from bokeh.charts import Scatter
from bokeh.layouts import column
from bokeh.models import Range1d

range_ = Range1d(-10, 10)

from sklearn.datasets import make_blobs
dims, labels = make_blobs(n_samples=100, n_features=2, centers=1, cluster_std=3, center_box=(2, 2), shuffle=True)

df_1 = pd.DataFrame.from_records(dims, columns=["first", "second"])
dims_scaler_1 = StandardScaler(copy=True, with_mean=True, with_std=False)

dims2 = dims_scaler_1.fit(dims).transform(dims)
df_2 = pd.DataFrame.from_records(dims2, columns=["first", "second"])
dims_scaler_2 = StandardScaler(copy=True, with_mean=True, with_std=True)
df_3 = pd.DataFrame.from_records(dims_scaler_2.fit(dims).transform(dims), columns=["first", "second"])
s1 = Scatter(df_1, x="first", y="second")
s1.y_range = range_
s1.x_range = range_
s2 = Scatter(df_2, x="first", y="second")
s2.y_range = range_
s2.x_range = range_
s3 = Scatter(df_3, x="first", y="second")
s3.y_range = range_
s3.x_range = range_
show(column(s1, s2, s3))


## Iris

In [17]:
from sklearn import datasets
iris = datasets.load_iris()

scaler = StandardScaler()
scaler = scaler.fit(iris.data)
iris.data = scaler.transform(iris.data)

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target)
clf = DummyClassifier()
clf.fit(X_train, Y_train)
clf.score(X_test,Y_test)


0.31578947368421051

In [18]:
from sklearn.svm import LinearSVC
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target)
clf = LinearSVC()
clf.fit(X_train, Y_train)
clf.score(X_test,Y_test)

0.97368421052631582

In [19]:
from sklearn.svm import SVC
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target)
clf = SVC()
clf.fit(X_train, Y_train)
clf.score(X_test,Y_test)

0.92105263157894735

### CROSS-VALIDATION


In [29]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier
iris = datasets.load_iris()

clf1 = SVC()
clf2 = KNeighborsClassifier()

score1 = cross_val_score(clf1, iris.data, iris.target, cv=3)
score2 = cross_val_score(clf2, iris.data, iris.target, cv=3)


In [27]:
print(score1)
print(score2)

[ 0.96        0.93333333]
[ 0.96  0.92]


## Cancer data

In [99]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X_train, X_test, Y_train, Y_test = train_test_split(cancer.data, cancer.target)
clf = DummyClassifier()
clf.fit(X_train, Y_train)
clf.score(X_test,Y_test)

0.49650349650349651

In [103]:
#example
X_train, X_test, Y_train, Y_test = train_test_split(cancer.data, cancer.target)
clf = LinearSVC(class_weight={1: 10, 0:1})
clf.fit(X_train, Y_train)
clf.score(X_test,Y_test)


0.90909090909090906

# 20 NEWSGROUPS

In [7]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
#cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train')#, categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test')#, categories=cats)
vectorizer = TfidfVectorizer()


In [2]:
newsgroups_train.data[0]

'From: bil@okcforum.osrhe.edu (Bill Conner)\nSubject: Re: Not the Omni!\nNntp-Posting-Host: okcforum.osrhe.edu\nOrganization: Okcforum Unix Users Group\nX-Newsreader: TIN [version 1.1 PL6]\nLines: 18\n\nCharley Wingate (mangoe@cs.umd.edu) wrote:\n: \n: >> Please enlighten me.  How is omnipotence contradictory?\n: \n: >By definition, all that can occur in the universe is governed by the rules\n: >of nature. Thus god cannot break them. Anything that god does must be allowed\n: >in the rules somewhere. Therefore, omnipotence CANNOT exist! It contradicts\n: >the rules of nature.\n: \n: Obviously, an omnipotent god can change the rules.\n\nWhen you say, "By definition", what exactly is being defined;\ncertainly not omnipotence. You seem to be saying that the "rules of\nnature" are pre-existant somehow, that they not only define nature but\nactually cause it. If that\'s what you mean I\'d like to hear your\nfurther thoughts on the question.\n\nBill\n'

In [8]:
vectorized = vectorizer.fit_transform(newsgroups_train.data)
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.01)
classifier.fit(vectorized, newsgroups_train.target)
predicted = classifier.predict(vectorizer.transform(newsgroups_test.data))
#from sklearn import metrics
#metrics.
classifier.score(vectorizer.transform(newsgroups_test.data), newsgroups_test.target)

0.83523632501327671