In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# 1. Linear model

## $R^2$ (coefficient of determination) score function
URL https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html

## $R^2 = 1 - \frac{\frac{1}{n} \sum_{i=1}^n (y_i - \widehat{y}_i)^2}{\frac{1}{n} \sum_{i=1}^n (y_i - \bar{y})^2} = 1 - \frac{residual\_sum\_of\_squares}{total\_sum\_of\_squares} = 1 - \frac{RSS}{TSS}$

- An $R^2$ value of 100% means the model explains all the variation of the target variable.
- A value of 0% means the model has no predictive power (it does no better than always predicting the mean).
- **So, the higher the R-squared value, the better the model.**
- In the worst cases, the $R^2$ score can even be negative. There are cases where the computational definition of $R^2$ can yield negative values, depending on the definition used. $R^2$ is bounded above by 1.0, but it is not bounded below. The reason is evaluation (scoring) on unseen data, where the model can fit worse than simply predicting the mean, which leads to results outside $[0, 1]$. Scoring on the same data the model was fitted to (for least-squares linear regression with an intercept) produces a score within $[0, 1]$, but **don't do that**.

In [None]:
from sklearn.metrics import r2_score

# Each case is (y_true, y_pred, extra keyword arguments for r2_score).
r2_cases = [
    # generic single-output example
    ([3, -0.5, 2, 7], [2.5, 0.0, 2, 8], {}),
    # two-output example, outputs combined with variance weighting
    ([[0.5, 1], [-1, 1], [7, -6]],
     [[0, 2], [-1, 2], [8, -5]],
     {'multioutput': 'variance_weighted'}),
    # perfect prediction: RSS = 0, so R^2 = 1
    ([1, 2, 3], [1, 2, 3], {}),
    # always predicting the mean: RSS = TSS, so R^2 = 0
    ([1, 2, 3], [2, 2, 2], {}),
    # worse than predicting the mean: RSS = 8, TSS = 2, so R^2 = -3
    ([1, 2, 3], [3, 2, 1], {}),
]

for y_true, y_pred, r2_kwargs in r2_cases:
    print(r2_score(y_true, y_pred, **r2_kwargs))

## Diabetes linear modeling, MSE and $R^2$

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score

# Load the diabetes dataset as a (features, target) pair of arrays
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Use only one feature (column 2); np.newaxis keeps X two-dimensional,
# i.e. shape (n_samples, 1), which is what the estimator is given below
diabetes_X = diabetes_X[:, np.newaxis, 2]

# Split the data into training/testing sets: the last 20 samples are held out
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets (same 20-sample hold-out)
diabetes_y_train = diabetes_y[:-20]
diabetes_y_test = diabetes_y[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)

# The mean squared error (MSE)
print('Mean squared error: %.2f' % mean_squared_error(diabetes_y_test, diabetes_y_pred))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs: held-out points and the fitted line.
# The explicit Axes interface plus title/labels lets the figure stand alone,
# and plt.show() avoids echoing the Line2D repr as the cell's output.
fig, ax = plt.subplots()
ax.scatter(diabetes_X_test, diabetes_y_test, color='black', label='test data')
ax.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3, label='linear fit')
ax.set(
    title='Diabetes: one-feature linear regression (test set)',
    xlabel='feature 2 (BMI, scaled)',
    ylabel='disease progression',
)
ax.legend()
plt.show()

# 2. Linear Regression

## 2.1 Simple Linear Regression with scikit-learn

URL https://realpython.com/linear-regression-in-python/

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Six observations of a single feature, stored as a column (shape (6, 1))
x = np.array([5, 15, 25, 35, 45, 55])[:, np.newaxis]
y = np.array([5, 20, 14, 32, 22, 38])

# modelling: construct the estimator and fit it in one chained expression
model = LinearRegression().fit(x, y)
print('intercept:', model.intercept_)
print('slope:', model.coef_)

# scoring: R^2 computed on the same data the model was fitted to
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

**prediction**

In [None]:
# In-sample prediction: evaluate the fitted model on its own training inputs
y_pred = model.predict(x)
print('predicted response:')
print(y_pred)

**prediction on new data**

In [None]:
# New inputs 0..4 as a single-feature column (shape (5, 1))
x_new = np.arange(5)[:, np.newaxis]
print(x_new)

# Predict the response for the unseen inputs
y_new = model.predict(x_new)
print(y_new)

**two-dimensional y**

In [None]:
# Same fit as before, but with the target passed as a column (shape (n, 1));
# compare the printed intercept/slope with the 1-D fit above
model2 = LinearRegression()
model2.fit(x, y.reshape(-1, 1))
print('intercept:', model2.intercept_)
print('slope:', model2.coef_)

## 2.2 Multiple Linear Regression with scikit-learn

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Eight observations with two features each, and the matching responses
x = np.array([
    [0, 1], [5, 1], [15, 2], [25, 5],
    [35, 11], [45, 15], [55, 34], [60, 35],
])
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])
print(x)
print(y)

# modelling: one intercept plus one coefficient per feature column
model = LinearRegression()
model.fit(x, y)
print('intercept:', model.intercept_)
print('slope:', model.coef_)

# scoring: R^2 on the training data
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

**prediction**

In [None]:
# In-sample prediction with the two-feature model
y_pred = model.predict(x)
print('predicted response:')
print(y_pred)

**prediction on new data**

In [None]:
# Ten consecutive integers arranged as 5 rows of 2 features
x_new = np.arange(10).reshape(5, 2)
print(x_new)

# Predict the response for the unseen rows
y_new = model.predict(x_new)
print(y_new)

## 2.3 Polynomial Regression with scikit-learn

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Single-feature inputs as a column (shape (6, 1)) and their responses
x = np.array([5, 15, 25, 35, 45, 55])[:, np.newaxis]
y = np.array([15, 11, 2, 8, 25, 32])
print(x)

# Degree-2 feature expansion without the bias column.
# Long form of the same step: build the transformer, call .fit(x),
# then .transform(x); fit_transform does both at once.
x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(x)
print(x_)

# modelling: ordinary linear regression on the expanded features
model = LinearRegression()
model.fit(x_, y)
print('intercept:', model.intercept_)
print('coefficients:', model.coef_)

# Scoring: must use the transformed inputs, not the raw x
r_sq = model.score(x_, y)
print('coefficient of determination:', r_sq)

# prediction (again on the transformed inputs)
y_pred = model.predict(x_)
print('predicted response:')
print(y_pred)

## 2.4 Linear Regression with statsmodels

In [None]:
import numpy as np
import statsmodels.api as sm

# Two-feature inputs with a constant column added via sm.add_constant,
# plus the matching responses
x = sm.add_constant(np.array([
    [0, 1], [5, 1], [15, 2], [25, 5],
    [35, 11], [45, 15], [55, 34], [60, 35],
]))
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])
print(x)
print(y)

# modelling: note the argument order, response first and inputs second
model = sm.OLS(y, x)

# fitting and scoring: the fit result carries the statistics printed below
results = model.fit()
print('coefficient of determination:', results.rsquared)
print('adjusted coefficient of determination:', results.rsquared_adj)
print('regression coefficients:', results.params)

# In-sample predictions can be printed from results.fittedvalues
# or results.predict(x) (left disabled by the original author).

# predict on new data: new inputs get the same constant column
x_new = sm.add_constant(np.arange(10).reshape(5, 2))
print(x_new)
y_new = results.predict(x_new)
print(y_new)

# Full reports: print(results.summary()) or print(results.summary2())
# (left disabled by the original author).

# 3. Logistic Regression

URL https://realpython.com/logistic-regression-python/#classification

<!--
### $\hat{y}^{(i)}=\beta_{0}+\beta_{1}x^{(i)}_{1}+\ldots+\beta_{p}x^{(i)}_{p}$

### $ P(y^{(i)}=1)=\frac{1}{1+e^{-(\beta_{0}+\beta_{1}x^{(i)}_{1}+\ldots+\beta_{p}x^{(i)}_{p})}} $

### $ 𝑝(𝐱) = \frac{1}{1 + e^{−𝑓(𝐱)}} $

### $ 𝑓(𝐱) = log \left( \frac{𝑝(𝐱)}{1 − 𝑝(𝐱)} \right) $
//-->

## 3.1 scikit-learn: Logistic Regression
LogisticRegression(**C=1.0**, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
- 'liblinear' solver doesn’t work without regularization.
- 'newton-cg', 'sag', and 'lbfgs' don’t support L1 regularization; 'liblinear' and 'saga' do.
- 'saga' is the only solver that supports elastic-net regularization.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

def plot_confusion_matrix(y, y_pred):
    """Print and plot the confusion matrix for a binary classifier.

    Parameters
    ----------
    y : array-like
        Actual class labels.
    y_pred : array-like
        Predicted class labels, one per entry of ``y``.

    Notes
    -----
    The tick labels and the annotation loop are hard-coded for two classes
    (0 and 1), so the matrix is expected to be 2x2. Returns ``None``; the
    figure is drawn on a newly created 8x8 inch matplotlib Axes.
    """
    cm = confusion_matrix(y, y_pred)
    # Fixed: the original printed the undefined name `conf_m` (NameError).
    print(cm)
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
    ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
    # Explicit, inverted y-limits: row 0 at the top, both rows fully visible.
    ax.set_ylim(1.5, -0.5)
    # Write each count in the centre of its cell (x = column j, y = row i).
    for i in range(2):
        for j in range(2):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')

def eval_model(model, x, y):
    """Print a quick evaluation of a fitted classifier on ``(x, y)``.

    Printed, in order and separated by newlines: ``model.predict_proba(x)``,
    ``model.predict(x)``, ``model.score(x, y)`` and the text returned by
    ``classification_report`` for ``y`` versus the predictions.

    Returns ``None``.
    """
    probabilities = model.predict_proba(x)
    predictions = model.predict(x)
    score_value = model.score(x, y)
    print(
        probabilities, '\n',
        predictions, '\n',
        score_value, '\n',
        classification_report(y, predictions),
    )

In [None]:
# Inputs: the integers 0..9 as a single-feature column (shape (10, 1))
x = np.arange(10)[:, np.newaxis]
print(x)

# Labels: index 1 is deliberately 1, so the classes overlap.
# (Alternative with a clean split: [0, 0, 0, 0, 1, 1, 1, 1, 1, 1].)
y = np.array([0, 1, 0, 0, 1, 1, 1, 1, 1, 1])
print(y)

In [None]:
# Build the classifier (default C) and fit it in one chained expression
model = LogisticRegression(solver='liblinear', random_state=0).fit(x, y)

# Print probabilities, predictions, score and classification report
eval_model(model, x, y)
# Optional plot (disabled): plot_confusion_matrix(y, model.predict(x))

**Hyperparameter tuning: set C=10.0 for better prediction? default C=1.0**

In [None]:
# Same classifier as before but with C=10.0 instead of the default
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0).fit(x, y)

# Print probabilities, predictions, score and classification report
eval_model(model, x, y)
# Optional plot (disabled): plot_confusion_matrix(y, model.predict(x))

## 3.2 StatsModels: Logistic Regression

In [None]:
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import classification_report

# Inputs 0..9 as a column, with a constant column added via sm.add_constant
x = sm.add_constant(np.arange(10)[:, np.newaxis])
y = np.array([0, 1, 0, 0, 1, 1, 1, 1, 1, 1])

# Build the Logit model (response first, inputs second) and fit it
model = sm.Logit(y, x)
result = model.fit(method='newton')

# Evaluate the model
# NOTE: the return value of this call is not used; it is not the cell's
# last expression, so nothing is displayed for it.
result.predict(x)
print(result.pred_table())

**Report with StatsModels**

In [None]:
# Display the fitted Logit result's summary as the cell output
# (as the last expression of the cell it is rendered without print()).
result.summary()
# Alternative report (disabled): result.summary2()

**Report with scikit-learn**

In [None]:
# Turn predicted probabilities into class labels with a 0.5 cut-off,
# then summarise them with scikit-learn's classification report
y_pred = np.where(result.predict(x) >= 0.5, 1, 0)
report = classification_report(y, y_pred)
print(report)

# 4. Multiclass classification

## 4.1 One-vs-Rest

In [None]:
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
# Six 2-D points in three groups, two points per class label
X = np.array([
    [10, 10], [8, 10],        # class 0
    [-5, 5.5], [-5.4, 5.5],   # class 1
    [-20, -20], [-15, -20],   # class 2
])
y = np.array([0, 0, 1, 1, 2, 2])

# Wrap an SVC in the one-vs-rest strategy and fit it
clf = OneVsRestClassifier(SVC())
clf.fit(X, y)

# Classify one new point near each group (displayed as the cell output)
clf.predict([[-19, -20], [9, 9], [-5, 5]])

## 4.2 One-vs-One

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

# Iris data as (features, labels); hold out 33% with a fixed seed
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=0)

# Wrap a LinearSVC in the one-vs-one strategy and fit it on the training split
clf = OneVsOneClassifier(LinearSVC(max_iter=10000, random_state=0))
clf.fit(X_train, y_train)

# Predicted labels for the first ten held-out samples (cell output)
clf.predict(X_test[:10])