![logo](logo/DaftCode_logo_854x210.jpg)

# scikit-learn: pierwsze kroki

In [1]:
import sklearn

In [2]:
# Print the package-level docstring: a quick check that the import worked,
# plus a short overview of the library.
print(sklearn.__doc__)


Machine learning module for Python

sklearn is a Python module integrating classical machine
learning algorithms in the tightly-knit world of scientific Python
packages (numpy, scipy, matplotlib).

It aims to provide simple and efficient solutions to learning problems
that are accessible to everybody and reusable in various contexts:
machine-learning as a versatile tool for science and engineering.

See http://scikit-learn.org for complete documentation.



In [3]:
# Bare expression: the installed scikit-learn version is shown as the cell output.
sklearn.__version__

'0.18.1'

# Przykład zaczerpnięty z: http://scikit-learn.org/stable/auto_examples/missing_values.html

In [4]:
from sklearn.datasets import load_boston

In [5]:
# Load the Boston housing data through the loader imported in the previous cell.
# NOTE(review): written against scikit-learn 0.18.1 (see the version cell above);
# newer releases may no longer ship this loader -- confirm before upgrading.
dataset = load_boston()

In [12]:
from sklearn.datasets.base import Bunch

In [None]:
import pandas as pd

# Wrap the raw feature matrix in a DataFrame whose columns carry the
# dataset's feature names.
DF_data = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
)

In [68]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
DF_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.593761,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.596783,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.647423,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [70]:
# Baseline: ordinary least squares on the complete data set.
X_full = dataset.data
y_full = dataset.target

# Row and column counts; both are reused by later cells.
n_samples, n_features = X_full.shape

estimator = LinearRegression()

# 10-fold cross-validation. The scorer reports negated mean squared error,
# so scores closer to zero are better.
score = cross_val_score(
    estimator,
    X_full,
    y_full,
    scoring='neg_mean_squared_error',
    cv=10,
).mean()
print("Mean score = {}".format(score))

Mean score = -34.7630915054


In [71]:
from sklearn.ensemble import RandomForestRegressor

In [72]:
# Random forest on the complete data set, scored like the linear baseline
# (10-fold CV, negated MSE).
# random_state is fixed so that re-running the cell reproduces the same score;
# the output recorded below was produced without a seed and will differ slightly.
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full,
                        scoring='neg_mean_squared_error', cv=10).mean()
print("Mean score = {}".format(score))

Mean score = -22.0575713262


In [79]:
import numpy as np

# Simulate missing data: choose 75% of the rows at random and, for each of
# them, one feature column that will later be treated as missing.
rng = np.random.RandomState(0)

missing_rate = 0.75
# np.floor returns a float, but array sizes must be integers -- convert explicitly.
n_missing_samples = int(np.floor(n_samples * missing_rate))
# Boolean row mask with exactly n_missing_samples True entries, shuffled below.
# The builtin bool is used as dtype because the np.bool alias no longer exists
# in current NumPy releases.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
rng.shuffle(missing_samples)
# One column index per affected row.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Strategy 1: drop every affected row and train only on the untouched ones.
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]

# Fixed random_state so that the reported score is reproducible on re-run.
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered,
                        scoring='neg_mean_squared_error', cv=10).mean()
print("Mean score = {}".format(score))



Mean score = -28.2297257686


In [76]:
from sklearn.preprocessing import Imputer

In [78]:
# Strategy 2: keep every row, overwrite the selected entries with 0 (the
# "missing" marker) and replace them with the per-column mean.
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()

# NOTE(review): 0 is also a genuine value in some columns (the describe()
# output above shows a minimum of 0.0 for ZN and CHAS), so those real zeros
# are imputed as well.
imputer = Imputer(missing_values=0, strategy="mean", axis=0)
# NOTE: the imputer is fitted on all rows before cross-validation splits them,
# so the column means also include the rows of each held-out fold.
X_filled_out = imputer.fit_transform(X_missing)

# Fixed random_state so that the reported score is reproducible on re-run.
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filled_out, y_missing,
                        scoring='neg_mean_squared_error', cv=10).mean()
print("Mean score = {}".format(score))

Mean score = -22.8833819907


In [None]:
# Strategy 3: imputation as a Pipeline step. The imputer is re-fitted on the
# training part of every fold, so column means never include held-out rows.
# Pipeline is imported here because no other cell of the notebook imports it.
from sklearn.pipeline import Pipeline

estimator = Pipeline([("imputer", Imputer(missing_values=0, strategy="mean", axis=0)),
                      ("forest", RandomForestRegressor(random_state=0, n_estimators=100))])
# Same metric and fold count as the other experiments, so the scores are
# directly comparable.
score = cross_val_score(estimator, X_missing, y_missing,
                        scoring='neg_mean_squared_error', cv=10).mean()
print("Mean score = {}".format(score))