In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# 1. Linear model

## $R^2$ (coefficient of determination) score function
URL https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html

## $R^2 = 1 - \frac{\frac{1}{n} \sum_{i=1}^n (y_i - \hat{y}_i)^2}{\frac{1}{n} \sum_{i=1}^n (y_i - \bar{y})^2} = 1 - \frac{residual\_sum\_of\_squares}{total\_sum\_of\_squares} = 1 - \frac{RSS}{TSS}$

- An $R^2$ value of 100% means the model explains all the variation of the target variable.
- And a value of 0% measures zero predictive power of the model. 
- **So, the higher the R-squared value, the better the model.**
- In the worst cases, the $R^2$ score can even be negative. There are cases where the computational definition of $R^2$ can yield negative values, depending on the definition used. $R^2$ is bounded above by 1.0, but it is not bounded below. The reason is evaluation (scoring) on unseen data, which can lead to results outside $[0, 1]$. Computing $R^2$ on the same data the model was fitted to will produce a score within $[0, 1]$ (this holds for a least-squares linear fit with an intercept), but **don't do that**.

In [None]:
from sklearn.metrics import r2_score

# Each case is (true targets, predicted targets, extra keyword arguments
# forwarded to r2_score). The cases are scored and printed in this order.
r2_cases = [
    # close, but not exact, single-output predictions
    ([3, -0.5, 2, 7], [2.5, 0.0, 2, 8], {}),
    # two outputs per sample, aggregated with the 'variance_weighted' option
    ([[0.5, 1], [-1, 1], [7, -6]], [[0, 2], [-1, 2], [8, -5]],
     {'multioutput': 'variance_weighted'}),
    # predictions identical to the targets: RSS = 0, so R^2 = 1
    ([1, 2, 3], [1, 2, 3], {}),
    # always predicting the target mean (2): RSS = TSS, so R^2 = 0
    ([1, 2, 3], [2, 2, 2], {}),
    # reversed predictions: RSS (8) exceeds TSS (2), so R^2 is negative
    ([1, 2, 3], [3, 2, 1], {}),
]

for y_true, y_pred, extra_kwargs in r2_cases:
    print(r2_score(y_true, y_pred, **extra_kwargs))

## Diabetes linear modeling, MSE and $R^2$

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score

# Load the diabetes dataset as a (features, targets) pair
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Use only one feature: column 2. Indexing with np.newaxis keeps the result
# two-dimensional (a single column), which is what fit/predict receive below.
diabetes_X = diabetes_X[:, np.newaxis, 2]

# Number of trailing samples held out for testing (named once instead of
# repeating the literal in every slice)
N_TEST = 20

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-N_TEST]
diabetes_X_test = diabetes_X[-N_TEST:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes_y[:-N_TEST]
diabetes_y_test = diabetes_y[-N_TEST:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)

# The mean squared error (MSE), evaluated on the held-out test samples
print('Mean squared error: %.2f' % mean_squared_error(diabetes_y_test, diabetes_y_pred))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs: test samples as points, the fitted line on top. Labels, title
# and legend make the figure readable on its own; plt.show() ends the cell so
# no matplotlib object repr is echoed as cell output.
fig, ax = plt.subplots()
ax.scatter(diabetes_X_test, diabetes_y_test, color='black', label='test samples')
ax.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3, label='linear fit')
ax.set(xlabel='feature (column 2)',
       ylabel='target',
       title='Linear regression on the diabetes test split')
ax.legend()
plt.show()

# 2. Linear Regression

## 2.1 Simple Linear Regression with scikit-learn

URL https://realpython.com/linear-regression-in-python/

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Six observations of a single feature. The estimator is given X as a column
# of shape (n_samples, 1), so a trailing axis is added to the 1-D values.
x = np.array([5, 15, 25, 35, 45, 55])[:, np.newaxis]
y = np.array([5, 20, 14, 32, 22, 38])

# modeling: construct and fit in one chained call
model = LinearRegression().fit(x, y)
print('intercept:', model.intercept_)
print('slope:', model.coef_)

# scoring: evaluated on the same data the model was fitted on
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

**prediction**

In [None]:
# Predict the response for the inputs currently bound to `x`, then print the
# label and the predicted values on separate lines.
y_pred = model.predict(x)
print(
    'predicted response:',
    y_pred,
    sep='\n',
)

**prediction on new data**

In [None]:
# New inputs 0..4 laid out as a single-feature column of shape (5, 1)
x_new = np.arange(5)[:, np.newaxis]
print(x_new)

# Predictions of the current model for those unseen inputs
y_new = model.predict(x_new)
print(y_new)

**two-dimensional y**

In [None]:
# Fit again, this time with the response reshaped into a column (n_samples, 1);
# compare the printed intercept/slope with those of the 1-D fit.
y_column = y.reshape((-1, 1))
model2 = LinearRegression()
model2.fit(x, y_column)
print('intercept:', model2.intercept_)
print('slope:', model2.coef_)

## 2.2 Multiple Linear Regression with scikit-learn

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Eight observations with two features each, and the matching responses
x = np.array([[0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]])
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])
print(x)
print(y)

# modeling: create the estimator, then fit it on both features at once
model = LinearRegression()
model.fit(x, y)
print('intercept:', model.intercept_)
print('slope:', model.coef_)

# scoring: evaluated on the same data the model was fitted on
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

**prediction**

In [None]:
# Predict with whatever `model` and `x` the preceding cell left in the kernel,
# then print the label and the predicted values on separate lines.
y_pred = model.predict(x)
print(
    'predicted response:',
    y_pred,
    sep='\n',
)

**prediction on new data**

In [None]:
# New inputs: the integers 0..9 arranged as five rows of two features
x_new = np.arange(10).reshape(5, 2)
print(x_new)

# Predictions of the current model for those unseen rows
y_new = model.predict(x_new)
print(y_new)

## 2.3 Polynomial Regression with scikit-learn

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Single-feature inputs as a column, and the responses to fit
x = np.array([5, 15, 25, 35, 45, 55])[:, np.newaxis]
y = np.array([15, 11, 2, 8, 25, 32])
print(x)

# Degree-2 polynomial feature expansion without the bias column.
# Written as explicit fit + transform steps; the one-liner
#   PolynomialFeatures(degree=2, include_bias=False).fit_transform(x)
# has the same effect, but shorter.
transformer = PolynomialFeatures(degree=2, include_bias=False)
transformer.fit(x)
x_ = transformer.transform(x)
print(x_)

# modeling: an ordinary linear regression on the expanded features
model = LinearRegression()
model.fit(x_, y)
print('intercept:', model.intercept_)
print('coefficients:', model.coef_)

# Scoring: evaluated on the same (expanded) data used for fitting
r_sq = model.score(x_, y)
print('coefficient of determination:', r_sq)

# prediction: the model must be given the expanded features, not raw x
y_pred = model.predict(x_)
print('predicted response:', y_pred, sep='\n')

### Polynomial Regression visualization

In [None]:
# Author: Mathieu Blondel
# Jake Vanderplaas
# License: BSD 3 clause
# URL https://scikit-learn.org/stable/auto_examples/linear_model/plot_polynomial_interpolation.html

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
# Open the figure first, at an enlarged size, so the pyplot calls further down
# in this cell all draw onto it.
plt.figure(figsize=(15,10))

# function to approximate by polynomial interpolation
def f(x):
    """Evaluate the target function x * sin(x).

    This is the ground-truth curve that the polynomial models approximate.
    The sine is computed with ``np.sin``, so ``x`` may be a scalar or a
    NumPy array (evaluated element-wise).
    """
    sine = np.sin(x)
    return x * sine


# Dense, evenly spaced grid used only for drawing smooth curves
x_plot = np.linspace(0, 10, 100)

# Training inputs: shuffle the same grid with a fixed seed, keep the first 20
# points and sort them back into ascending order
x = np.linspace(0, 10, 100)
rng = np.random.RandomState(0)
rng.shuffle(x)
x = np.sort(x[:20])
y = f(x)

# Column-shaped (n, 1) versions of the 1-D arrays, as passed to the pipelines
X = x.reshape(-1, 1)
X_plot = x_plot.reshape(-1, 1)

colors = ['teal', 'yellowgreen', 'gold', 'red']
lw = 2

# Reference curve and the sampled training points
plt.plot(x_plot, f(x_plot), color='cornflowerblue', linewidth=lw, label="ground truth")
plt.scatter(x, y, color='navy', s=30, marker='o', label="training points")

# One Ridge regression on polynomial features per degree, each drawn in its
# own colour (colour picked by position in the degree list)
for count, degree in enumerate([3, 4, 5]):
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X, y)
    y_plot = model.predict(X_plot)
    plt.plot(x_plot, y_plot, color=colors[count], linewidth=lw, label="degree %d" % degree)

plt.legend(loc='lower left')
plt.show()

## 2.4 Linear Regression with statsmodels

In [None]:
import numpy as np
import statsmodels.api as sm

# Eight observations with two features each, and the matching responses
x = np.array([[0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]])
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])

# The constant column is added explicitly here (see the printed x)
x = sm.add_constant(x)
print(x)
print(y)

# modeling: note the argument order - response first, then the inputs
model = sm.OLS(y, x)

# fitting returns a separate results object that carries the statistics
results = model.fit()

# scoring
print('coefficient of determination:', results.rsquared)
print('adjusted coefficient of determination:', results.rsquared_adj)
print('regression coefficients:', results.params)

# predict (alternatives kept for reference)
# print('predicted response:', results.fittedvalues, sep='\n')
# print('predicted response:', results.predict(x), sep='\n')

# predict on new data: new inputs get the constant column added as well
new_inputs = np.arange(10).reshape((-1, 2))
x_new = sm.add_constant(new_inputs)
print(x_new)
y_new = results.predict(x_new)
print(y_new)

# summary (alternatives kept for reference)
# print(results.summary())
# print(results.summary2())

## Notes: Feature selection - Embedded

Embedded methods combine the advantages of filter and wrapper methods:
- The model that is being trained will directly choose the attributes that are best for it

Only a few models support it:
- Linear models penalized by L1 (Lasso) or L1+L2 (Elastic Net) regularization: SVM, Linear regression, Logistic regression ...

- Regularization adds a penalty on the number / size of the model's attribute weights to the training objective, so the model is no longer optimizing the prediction error alone. As a result, a simpler model is naturally preferred.