Python For Data Science Cheat Sheet: Scikit-learn
----
Scikit-learn is an open-source Python library that implements a range of machine learning, preprocessing, cross-validation and visualization algorithms using a unified interface.

A Basic Example

In [21]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the iris dataset and keep only its first two feature columns.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Split into train/test partitions; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Fit the scaler on the training partition only, then apply it to both.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Train a 5-nearest-neighbours classifier and score it on the test partition.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

0.631578947368421

Loading The Data
----
Your data needs to be numeric and stored as NumPy arrays or SciPy sparse matrices. Other types that are convertible to numeric arrays, such as pandas DataFrames, are also acceptable.

In [22]:
import numpy as np
# Synthetic example data: a 10x5 feature matrix plus one label per row.
# A seeded generator keeps the values identical on every "Restart & Run All".
rng = np.random.default_rng(0)
X = rng.random((10, 5))
# Exactly 10 labels so that len(y) matches the number of rows in X;
# estimators reject X/y pairs with differing sample counts.
y = np.array(['M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F'])
# Zero out every entry below 0.7 (in place).
X[X < 0.7] = 0

Preprocessing The Data
---
Standardization

In [23]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only, then reuse it for both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)

Normalization

In [24]:
from sklearn.preprocessing import Normalizer
# Note: this rebinds `scaler`, replacing whatever scaler object it held before.
scaler = Normalizer()
scaler.fit(X_train)
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)

Binarization

In [25]:
from sklearn.preprocessing import Binarizer
# Threshold X at 0.0 and keep the binarized copy under its own name.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binary_X = binarizer.transform(X)

Encoding Categorical Features

In [26]:
from sklearn.preprocessing import LabelEncoder
# Fit a LabelEncoder on the labels in y and overwrite y with the encoded
# result. `enc` is kept so the fitted encoder remains available afterwards.
enc = LabelEncoder()
y = enc.fit_transform(y)

Imputing Missing Values

In [27]:
from sklearn.impute import SimpleImputer
# Treat 0 as the "missing" marker and fill it using the 'mean' strategy.
imp = SimpleImputer(strategy='mean', missing_values=0)
imp.fit(X_train)
X_train_imputed = imp.transform(X_train)


Generating Polynomial Features

In [28]:
from sklearn.preprocessing import PolynomialFeatures
# Expand X with polynomial feature combinations up to degree 5.
poly = PolynomialFeatures(degree=5)
poly.fit(X)
X_poly = poly.transform(X)

Training And Test Data

In [None]:
from sklearn.model_selection import train_test_split

# Small numeric example data so the split below actually runs; plain strings
# are not valid feature/target inputs for train_test_split.
rng = np.random.default_rng(0)
X = rng.random((10, 5))      # 10 samples, 5 features
y = np.array([0, 1] * 5)     # one binary label per sample, both classes present

# Hold out 20% of the samples for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)


Create Your Model
----
----------------------------------------------------------

Supervised Learning Estimators
----

Linear Regression

In [33]:
from sklearn.linear_model import LinearRegression
# Construct a LinearRegression estimator with its default settings (not yet fitted).
lr = LinearRegression()

Support Vector Machines (SVM)

In [34]:
from sklearn.svm import SVC
# Construct a support vector classifier that uses the 'linear' kernel (not yet fitted).
svc = SVC(kernel='linear')

Naive Bayes

In [35]:
from sklearn.naive_bayes import GaussianNB
# Construct a Gaussian Naive Bayes estimator with its default settings (not yet fitted).
gnb = GaussianNB()

KNN

In [36]:
from sklearn import neighbors
# Construct a k-nearest-neighbours classifier with n_neighbors=5.
# Rebinds `knn`, so any previously fitted `knn` object is replaced by this unfitted one.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

Unsupervised Learning Estimators
----

Principal Component Analysis (PCA)

In [37]:
from sklearn.decomposition import PCA
# Construct a PCA estimator (not yet fitted).
# NOTE(review): per scikit-learn's PCA documentation a float n_components in
# (0, 1) is read as a target fraction of explained variance - confirm against
# the installed version.
pca = PCA(n_components=0.95)

K Means

In [38]:
from sklearn.cluster import KMeans
# Construct a KMeans estimator for 3 clusters; random_state=0 pins its random initialisation.
k_means = KMeans(n_clusters=3, random_state=0)

Model Fitting
----

Supervised learning

In [None]:
# Fit every supervised estimator on the training split only, so X_test stays
# unseen until the prediction step that evaluates these same models.
lr.fit(X_train, y_train)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

Unsupervised Learning

In [None]:
# Fit the clustering model on the training data (no labels are passed).
k_means.fit(X_train)
# Fit PCA on the training data and keep the value returned by fit_transform.
# NOTE(review): fit_transform presumably returns the projected data rather
# than an estimator, so `pca_model` likely holds an array - consider renaming.
pca_model = pca.fit_transform(X_train)

Prediction
----

Supervised Estimators

In [None]:
# Three independent prediction examples; each assignment replaces the
# previous value of y_pred, so only the last result remains bound afterwards.
# Predict for two random samples with 5 features each.
y_pred = svc.predict(np.random.random((2,5)))
# Predict for the held-out test samples.
y_pred = lr.predict(X_test)
# Class-probability estimates for the test samples (via predict_proba).
y_pred = knn.predict_proba(X_test)

Unsupervised Estimators

In [None]:
# Call the fitted KMeans model's predict on the test samples.
# Rebinds y_pred, replacing any earlier prediction stored under that name.
y_pred = k_means.predict(X_test)