# Scikit learn

In [23]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# Load the advertising data; the first CSV column is used as the row index.
df = pd.read_csv("../Data/advertising.csv", index_col=0)

# One of the columns is the target, so it is not counted as a feature.
print(f"Number of features {len(df.columns)-1}")
print(f"Number of samples {len(df)}")

Number of features 3
Number of samples 200


In [24]:
# Feature matrix: every column except the target "sales"
X = df.drop(columns="sales")
# Target vector
y = df["sales"]
X.shape, y.shape

((200, 3), (200,))

## Scikit-learn "recipe"

**Steps**

1. train|test split or train|validation|test split
2. Scale dataset
   - Many algorithms require scaling, some don't
   - Which type of scaling method to use?
   - Fit the scaler on the training data only, then use it to transform both the training and the test data
3. Fit algorithm to training data
4. Predict on test data
5. Evaluation metrics on test data

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 33 % of the samples for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
# Show the shape of each split
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((134, 3), (66, 3), (134,), (66,))

### Feature scaling

Normalization (MinMaxScaling)

$$X' = \frac{X-X_{min}}{X_{max}-X_{min}}$$

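Compute $X_{min}$ and $X_{max}$ from the training data only, and use them to scale both the training and the test data. Scaled test values can therefore fall slightly outside $[0, 1]$.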

In [26]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate a MinMaxScaler and fit it on X_train only - NOT on X_test -
# so the min and max used for scaling come from the training data alone.
scaler = MinMaxScaler().fit(X_train)
print(type(scaler))

# Transform both splits with the scaler fitted on the training data
scaled_X_train, scaled_X_test = (
    scaler.transform(part) for part in (X_train, X_test)
)

# Overall min/max of the scaled training data
print(f"Min value in X_trian: {scaled_X_train.min()}")
print(f"Max value in X_train: {scaled_X_train.max()}")

# Overall min/max of the scaled test data (scaled with the training min/max)
print(f"Min value in X_test: {scaled_X_test.min()}")
print(f"Max value in X_test: {scaled_X_test.max()}")

    

<class 'sklearn.preprocessing._data.MinMaxScaler'>
Min value in X_trian: 0.0
Max value in X_train: 1.0
Min value in X_test: 0.005964214711729622
Max value in X_test: 1.1302186878727631


## Algorithm - linear regression

In [27]:
from sklearn.linear_model import LinearRegression

# SVD - Singular Value Decomposition, used for calculating the pseudoinverse
# in the OLS normal equation

# Instantiate the model and fit it on the scaled training data in one step
# (fit returns the fitted estimator itself)
model_SVD = LinearRegression().fit(scaled_X_train, y_train)

# Fitted parameters: the weights and the intercept
print(f"Weights (beta_hats) {model_SVD.coef_}")
print(f"Intercept {model_SVD.intercept_}")

Weights (beta_hats) [13.20747617  9.75285112  0.61108329]
Intercept 2.7911595196243653


## Stochastic gradient descent (SGD)

In [55]:
from sklearn.linear_model import SGDRegressor

# Note that SGD requires features to be scaled.
# SGD shuffles the training data between epochs, so without a fixed
# random_state the fitted weights change on every run. Pinning the seed makes
# this cell reproducible under Restart Kernel -> Run All.
model_SGD = SGDRegressor(
    loss="squared_error",
    learning_rate="invscaling",
    max_iter=100000,
    random_state=42,
)
model_SGD.fit(scaled_X_train, y_train)

# Fitted parameters, to compare against the LinearRegression solution
print(f"Weights (beta_hats) {model_SGD.coef_}")
print(f"Intercepts {model_SGD.intercept_}")

Weights (beta_hats) [12.02530224  8.94615174  1.34870211]
Intercepts [3.62816]


## Manual test

In [54]:
# Sanity check: pick the first test sample (scaled features and true target)
test_sample_features = scaled_X_test[0]
test_sample_target = y_test.values[0]

# Display the scaled features of the sample; a cell that ends in an
# assignment shows no output.
test_sample_features



array([0.54988164, 0.63709677, 0.52286282])

In [57]:
# Unscaled feature values of the first test sample
X_test.iloc[0, :]

TV           163.3
radio         31.6
newspaper     52.9
Name: 96, dtype: float64