# 1. Supervised Learning with scikit-learn

## (1) Classification

In [4]:
# Suppress all warnings so they do not clutter the cell outputs below.
import warnings
warnings.filterwarnings('ignore')

# NOTE: sklearn.neighbors.classification was renamed to
# sklearn.neighbors._classification in scikit-learn 0.22.x.
# Pinning scikit-learn to <= 0.21.3 avoids the problem
# (see https://github.com/ageitgey/face_recognition/issues/1262):
#
#   pip install --user --upgrade scikit-learn==0.21.3

# The last expression of the cell displays the installed scikit-learn version.
import sklearn
sklearn.__version__

'0.21.3'

### **Classifying labels of unseen data**
1. Build a model
2. Model learns from the labeled data we pass to it (Labeled data = training data)
3. Pass unlabeled data to the model as input
4. Model predicts the labels of the unseen data

### k-Nearest Neighbors
* Predict the label of a data point by
  * Looking at the **k** closest labeled data points
  * Taking a majority vote

In [6]:
# How to use KNN
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Two charge columns serve as features; the 'churn' column is the target.
y = churn_df['churn'].values
X = churn_df[['total_day_charge', 'total_eve_charge']].values

# Fit a 15-neighbor classifier on the full labeled data.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X, y)

# Three new observations, one row each, with the same two feature columns.
X_new = np.array([
    [56.8, 17.5],
    [24.4, 24.1],
    [50.1, 10.9],
])
print(X_new.shape)

# Predict a label for each new observation.
predictions = knn.predict(X_new)
print(f'Predictions: {predictions}')

### Measuring model performance

* How do we measure accuracy?
* Could compute accuracy on the data used to fit the classifier
* But accuracy on the training data is not indicative of the model's ability to generalize to unseen data
* Split data -> Training set / Test set

In [None]:
# Train/test split
# Fixed module name: the function lives in sklearn.model_selection
# (the previous "medel_selection" spelling raised ModuleNotFoundError).
from sklearn.model_selection import train_test_split

# Every column except the target is used as a feature.
X = churn_df.drop("churn", axis=1).values
y = churn_df["churn"].values

# Hold out 30% of the rows for testing. stratify=y keeps the class
# proportions of y in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y
)

# Fit on the training split only, then report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))

* Model Complexity
  * Larger k = less complex model = can cause underfitting
  * Smaller k = more complex model = can lead to overfitting

In [None]:
# Model complexity and over/underfitting
import matplotlib.pyplot as plt

# Accuracy per value of k, keyed by the number of neighbors.
train_accuracies = {}
test_accuracies = {}
neighbors = np.arange(1, 26)

for neighbor in neighbors:
    # Refit a fresh classifier for each k and score it on both splits.
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    train_accuracies[neighbor] = knn.score(X_train, y_train)
    test_accuracies[neighbor] = knn.score(X_test, y_test)

print(neighbors, '\n', train_accuracies, '\n', test_accuracies)

plt.figure(figsize=(8, 6))
plt.title("KNN: Varying Number of Neighbors")
# Dicts keep insertion order, so the values line up with `neighbors`.
# Convert the dict views to lists so plotting receives plain sequences.
plt.plot(neighbors, list(train_accuracies.values()), label="Training Accuracy")
plt.plot(neighbors, list(test_accuracies.values()), label="Testing Accuracy")
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
# The curve labels are only rendered once a legend is requested.
plt.legend()
plt.show()

## (2) Regression