# 1. Supervised Learning with scikit-learn

## (1) Classification

In [4]:
# Environment check: silence warnings and report the installed scikit-learn version.
import warnings
import sklearn

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Compatibility note: sklearn.neighbors.classification was renamed to
# sklearn.neighbors._classification in version 0.22.X. Downgrading to
# scikit-learn <= 0.21.3 works around the problem
# (https://github.com/ageitgey/face_recognition/issues/1262):
#
#   pip install --user --upgrade scikit-learn==0.21.3

# Last expression of the cell, so the version string is shown as the cell output.
sklearn.__version__

'0.21.3'

### **Classifying labels of unseen data**
1. Build a model
2. Model learns from the labeled data we pass to it (Labeled data = training data)
3. Pass unlabeled data to the model as input
4. Model predicts the labels of the unseen data

### k-Nearest Neighbors
* Predict the label of a data point by
  * Looking at the **k** closest labeled data points
  * Taking a majority vote

In [6]:
# How to use KNN
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Two charge columns are the features; the churn column is the target.
# NOTE(review): churn_df is not created in this cell; it must already be loaded.
feature_columns = ['total_day_charge', 'total_eve_charge']
X = churn_df[feature_columns].values
y = churn_df['churn'].values

# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=15).fit(X, y)

# Three new observations, one row each, in the same column order as X.
X_new = np.array([
    [56.8, 17.5],
    [24.4, 24.1],
    [50.1, 10.9],
])
print(X_new.shape)

# Predict a label for each new observation.
predictions = knn.predict(X_new)
print(f'Predictions: {predictions}')

### Measuring model performance

* How do we measure accuracy?
* Could compute accuracy on the data used to fit the classifier
* Not indicative of ability to generalize
* Split data -> Training set / Test set

In [None]:
# Train/test split
# Fixed module name: the function lives in sklearn.model_selection
# (the previous spelling "medel_selection" raised ModuleNotFoundError).
from sklearn.model_selection import train_test_split

# Every column except the target is a feature.
# NOTE(review): churn_df is not created in this cell; it must already be loaded.
X = churn_df.drop("churn", axis=1).values
y = churn_df["churn"].values

# Hold out 30% of the rows for testing; stratify=y keeps the class proportions
# of y in both splits, and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y
)

# Fit on the training split only, then report accuracy on the held-out split.
# KNeighborsClassifier is imported in the earlier KNN cell.
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))

* Model Complexity
  * Larger k = less complex model = can cause underfitting
  * Smaller k = more complex model = can lead to overfitting

In [None]:
# Model complexity and over/underfitting
import matplotlib.pyplot as plt

# Accuracy for each k, keyed by the number of neighbors.
train_accuracies = {}
test_accuracies = {}
neighbors = np.arange(1, 26)

# Fit one classifier per k and score it on both the training and the test split.
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    train_accuracies[neighbor] = knn.score(X_train, y_train)
    test_accuracies[neighbor] = knn.score(X_test, y_test)

print(neighbors, '\n', train_accuracies, '\n', test_accuracies)

plt.figure(figsize=(8, 6))
plt.title("KNN: Varying Number of Neighbors")
# Convert the dict views to lists: a dict_values object is not a sequence and
# is not reliably accepted by plt.plot across matplotlib versions.
plt.plot(neighbors, list(train_accuracies.values()), label="Training Accuracy")
plt.plot(neighbors, list(test_accuracies.values()), label="Testing Accuracy")
# The labels above are only rendered once a legend is requested.
plt.legend()
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.show()

## (2) Regression

### Introduction to regression

In [None]:
# Creating features

import numpy as np

# Target: the sales column as an array.
# NOTE(review): sales_df is not created in this cell; it must already be loaded.
y = sales_df['sales'].values

# Feature: the radio column, reshaped in one step into a single-column 2-D array.
X = sales_df['radio'].values.reshape(-1, 1)

# Show the resulting shapes of the features and the target.
print(X.shape, y.shape)

In [None]:
# Building a linear regression model

from sklearn.linear_model import LinearRegression

# Create the model and fit it to the data; fit() returns the estimator itself.
reg = LinearRegression().fit(X, y)

# Predict on the same observations the model was fitted on.
predictions = reg.predict(X)

# Show the first five predictions.
print(predictions[:5])

In [None]:
# Visualizing a linear regression model

import matplotlib.pyplot as plt

# Draw on an explicit figure/axes pair.
fig, ax = plt.subplots()

# Observed data as blue points.
ax.scatter(X, y, color="blue")

# Model predictions as a red line over the same X values.
ax.plot(X, predictions, color="red")
ax.set_xlabel("Radio Expenditure ($)")
ax.set_ylabel("Sales ($)")

# Render the figure.
plt.show()

In [None]:
# Fit and predict for regression
# Features are every column except "sales"; the target is the "sales" column.
y = sales_df["sales"].values
X = sales_df.drop("sales", axis=1).values

# Hold out 30% of the rows for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create the model and fit it on the training split (fit() returns the estimator).
reg = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and compare the first two values with the truth.
y_pred = reg.predict(X_test)
summary = "Predictions: {}, Actual Values: {}".format(y_pred[:2], y_test[:2])
print(summary)

In [None]:
# Regression performance
import numpy as np
from sklearn.metrics import mean_squared_error

# R-squared of the fitted model on the held-out split.
r_squared = reg.score(X_test, y_test)

# RMSE as the square root of the MSE. The `squared=False` keyword is avoided
# on purpose: it is not accepted by the scikit-learn 0.21.3 release this
# notebook pins, so taking the root explicitly works with that version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print the metrics
print("R^2: {}".format(r_squared))
print("RMSE: {}".format(rmse))

### Cross-validation

In [None]:
# Cross-validation for R-squared

from sklearn.model_selection import cross_val_score, KFold

# Six shuffled folds; the fixed seed makes the fold assignment reproducible.
kf = KFold(n_splits=6, shuffle=True, random_state=5)

# Fresh, unfitted estimator: cross_val_score fits it once per fold.
reg = LinearRegression()

# One score per fold, using the estimator's default scorer.
cv_scores = cross_val_score(estimator=reg, X=X, y=y, cv=kf)

# Show the per-fold scores.
print(cv_scores)

In [None]:
# Analyzing cross-validation metrics
# Uses cv_scores from the cross-validation cell; the previous name cv_results
# was never defined and raised a NameError.

# Print the mean
print(np.mean(cv_scores))

# Print the standard deviation
print(np.std(cv_scores))

# Print the 2.5% and 97.5% quantiles of the fold scores (a 95% interval)
print(np.quantile(cv_scores, [0.025, 0.975]))

### Regularized regression

* Ridge
  * Ridge regression performs regularization by adding a penalty to the loss function: alpha multiplied by the sum of the squared model coefficients.

In [None]:
# Regularized regression: Ridge

from sklearn.linear_model import Ridge

# Regularization strengths to compare.
alphas = [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]

# R-squared on the test split, one entry per alpha, in the same order.
ridge_scores = []

for alpha in alphas:
    # Build a Ridge model for this alpha and fit it on the training split
    # (fit() returns the estimator itself).
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)

    # Score on the held-out split and record the result.
    score = ridge.score(X_test, y_test)
    ridge_scores += [score]

print(ridge_scores)

* Lasso
  * Lasso can select the important features of a dataset, because it shrinks the coefficients of less important features to zero.

In [None]:
# Lasso regression for feature importance

from sklearn.linear_model import Lasso

# Feature names for the bar chart, derived the same way X is built in the
# "Fit and predict for regression" cell (every column except "sales"), so the
# names line up with the coefficients. sales_columns was not defined anywhere
# earlier in the notebook.
# NOTE(review): assumes X still holds sales_df.drop("sales", axis=1).values here.
sales_columns = sales_df.drop("sales", axis=1).columns

# Instantiate a lasso regression model
lasso = Lasso(alpha=0.3)

# Fit once (the model was previously fitted twice on the same data) and read
# the learned coefficients from the fitted estimator.
lasso.fit(X, y)
lasso_coef = lasso.coef_
print(lasso_coef)

# One bar per feature; rotate the tick labels so the names stay readable.
plt.bar(sales_columns, lasso_coef)
plt.xticks(rotation=45)
plt.show()

## (3) Fine-Tuning Your Model