In [None]:
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier as KNN
# Select matplotlib's "ggplot" style sheet for the figures drawn in this notebook.
style.use("ggplot")

In [None]:
# Load the iris "bunch" object (dataset plus its attributes) and list its keys.
iris = load_iris()
print(iris.keys())

X = iris.data
y = iris.target
# Two hand-written 4-feature samples; nothing in this cell reads them.
test = [[3,5,4,2],[5,4,3,2]]
print(X.shape)
print(y.shape)

# Scatter the first two feature columns, one colour per class.
# The colour codes go through `color`: `cmap` names a colormap and is ignored
# when no `c` data is supplied, so passing 'b'/'k'/'r' there never coloured
# the markers as intended.
for class_id, (class_name, class_color) in enumerate(zip(iris.target_names, ('b', 'k', 'r'))):
    in_class = y == class_id
    plt.scatter(X[in_class, 0], X[in_class, 1], marker='o', color=class_color,
                s=50, linewidth=1, label=class_name)
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.legend()
plt.show()

### 1. Evaluation Procedure: Train/Valid split
###### Note: the validation set is often called the "test" set, but it is not the same as the held-out test data used in the final testing stage
###### train_test_split( )
-> Provides a high-variance estimate of out-of-sample accuracy  
-> K-fold cross-validation overcomes this limitation  
-> train_test_split() is still used because of its flexibility and speed  
###### Validation accuracy: a better estimate than training accuracy of out-of-sample performance

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples for validation and train on the remaining 60%.
# The fixed random_state makes the shuffle, and therefore every score computed
# from this split, identical on each Restart & Run All.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=0)
print(" X training: ", x_train.shape)
print(" X valid: ", x_valid.shape)
print(" Y training: ", y_train.shape)
print(" Y valid: ", y_valid.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# max_iter is raised above the default of 100 because the default solver can
# hit that limit before converging on unscaled features (ConvergenceWarning).
Logreg = LogisticRegression(max_iter=1000)
Logreg.fit(x_train, y_train)

# Predict the validation samples and report the fraction labelled correctly.
# accuracy_score takes (y_true, y_pred); `metrics` is sklearn.metrics.
y_labels = Logreg.predict(x_valid)
print(metrics.accuracy_score(y_valid, y_labels))

In [None]:
import pandas as pd

# Advertising dataset; the first CSV column becomes the row index.
# Features: TV, Radio, Newspaper
# Response: Sales
data = pd.read_csv(filepath_or_buffer="Dataset/Advertising.csv", index_col=0)

In [None]:
import seaborn as sns
# allow plots to appear within the notebook
%matplotlib inline
# showing relationships between Tv, Radio, Newspaper and Sales respectively
sns.pairplot(data, x_vars=['TV','Radio','Newspaper'],y_vars='Sales', size=7,kind='reg')

In [None]:
# Design matrix: the three advertising-spend columns. Target: the Sales column.
feature_cols = ['TV', 'Radio', 'Newspaper']
X = data.loc[:, feature_cols]
y = data['Sales']

# No test_size is given, so train_test_split uses its default proportions;
# random_state=1 keeps the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split (fit returns the estimator).
linreg = LinearRegression().fit(x_train, y_train)

# Report the learned intercept and the raw coefficient vector ...
print(linreg.intercept_)
print(linreg.coef_)

# ... then each coefficient paired with the feature it belongs to.
for name, weight in zip(feature_cols, linreg.coef_):
    print((name, weight))

### 2. Model evaluation metrics for regression
Note: Evaluation metrics for classification problems, such as accuracy, are not useful for regression problems!  
-> Evaluation metrics should be designed for comparing continuous values  
1. Mean Absolute Error (MAE): the mean of the absolute value of the errors $$\frac{1}{n}\sum_{i=1}^n{|{y_i-\hat{y}_i}|}$$  
2. Mean Squared Error (MSE): the mean of the squared errors $$\frac{1}{n}\sum_{i=1}^n{(y_i-\hat{y}_i)}^2$$  
3. Root Mean Squared Error (RMSE): the square root of the mean of the squared errors $$\sqrt{\frac{1}{n}\sum_{i=1}^n{(y_i-\hat{y}_i)}^2}$$  
##### Comparison
-> MAE: the easiest to understand because it averages error  
-> MSE: more popular than MAE because MSE "punishes" larger errors  
-> RMSE: even more popular than MSE, because RMSE is interpretable in the y units

In [None]:
# Predict Sales for the validation rows and report the mean squared error.
# Arguments are passed in the documented (y_true, y_pred) order, matching the
# accuracy_score call used for the classifier earlier in the notebook.
y_labels = linreg.predict(x_valid)
print(metrics.mean_squared_error(y_valid, y_labels))

### 3. Feature Selection
Note: consider which columns have little effect on the final prediction, then remove those columns and evaluate again  


### 4.Selecting best model in scikit learn using " Cross Validation" 
###### by setting different values for random_state in "train_test_split()",  this will affect the accuracy in validation stage
Note: a single train/validation split is a high-variance estimator -> potential problem  
-> Solution: by setting a group of random_state for "train_test_split()", and average final accuracy results together -> essence of cross-validation


In [None]:
# An example to show the potential problem: the validation accuracy of the
# same model changes with the random_state used for the train/validation split.
import numpy as np

# Reload iris, because X and y were rebound to the advertising data above.
iris = load_iris()
X = iris.data
y = iris.target

accuracy = []
for i in range(100):
    # A different seed on each pass gives a different partition of the same data.
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, random_state=i)
    knn = KNN(n_neighbors=5)
    knn.fit(x_train, y_train)
    y_predict = knn.predict(x_valid)
    # accuracy_score takes (y_true, y_pred).
    accuracy.append(metrics.accuracy_score(y_valid, y_predict))

plt.plot(np.arange(100), accuracy)
plt.xlabel(" random_state ")
plt.ylabel(" accuracy ")  # label spelling corrected
plt.show()

###### 4.1 K-fold Cross Validation
![image](K.png)
###### 4.2 Comparison K-fold Cross Validation and Train/Test Split
-> Advantages of cross_validation:  
. More accurate estimate of out of sample accuracy  
. More "efficient" use of data -- [ every observation is used for both training and validation stage ]  
-> Advantages of Train/Test split:  
. Run K times faster than K-fold cross-validation  
. Simpler to examine the detailed results of testing process

In [None]:
from sklearn.model_selection import KFold
import numpy as np

# Toy data: the integers 0..24, split into 5 folds without shuffling.
X = np.arange(25)
kf = KFold(n_splits=5, shuffle=False)

# Print the train/test index arrays produced for every fold.
for fold_train, fold_test in kf.split(X):
    print('train_index:%s , test_index: %s ' % (fold_train, fold_test))

###### 4.3 Cross Validation Recommendation
1. K can be any number, but k=10 is greatly recommended  
2. For Classification problem, stratified sampling[分层抽样] is recommended for creating the folds  
. Each response class should be represented with equal proportions in each of the k folds  
. Scikit-learn's cross_val_score function does this by default
###### "cross_val_score()"  working principle is " Steps for K-fold cross-validation"
######  Relationship between "cross_val_score()" and "KFold()"
"cross_val_score()" covers what KFold() does and more: it also automatically passes each split to the model (knn, for example), fits it, and returns the scores as its result
![image](2.png)

###### 4.3.1 Cross-Validation example: parameter tuning
Note: Goal: select the best tuning parameters (aka " hyperparameters") for KNN on the iris dataset  
-> result: k=20 offers better scores

In [None]:
from sklearn.model_selection import cross_val_score

iris = load_iris()
X = iris.data
y = iris.target

# 10-fold cross-validation with k=5 neighbours for KNN.
# cv is 10 so the code agrees with this description and with every other
# cross_val_score call in the notebook (it was 5 before).
knn = KNN(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')

# Per-fold accuracies, how many there are, and their mean.
print(scores)
print(scores.shape)
print(scores.mean())

In [None]:
# Search for an optimal value of k for KNN: record the mean 10-fold accuracy
# for every neighbour count from 1 to 30.
k_range = range(1, 31)
k_scores = [
    cross_val_score(KNN(n_neighbors=k), X, y, cv=10, scoring='accuracy').mean()
    for k in k_range
]

# Mean accuracy as a function of k.
plt.plot(k_range, k_scores)
plt.xlabel('K range')
plt.ylabel('K scores')
plt.show()

###### 4.3.2 Cross-Validation example: model selection
Note: Compare the best KNN model with logistic regression on the iris dataset  
-> result: KNN has better score

In [None]:
# Mean 10-fold cross-validated accuracy of KNN with 20 neighbours.
knn = KNN(n_neighbors=20)
knn_fold_scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(knn_fold_scores.mean())

In [None]:
from sklearn.linear_model import LogisticRegression

# Mean 10-fold cross-validated accuracy of logistic regression on the same X, y.
# max_iter is raised above the default of 100 because the default solver can
# hit that limit before converging on unscaled features (ConvergenceWarning),
# which would make the comparison with KNN depend on an unconverged fit.
Logreg = LogisticRegression(max_iter=1000)
print(cross_val_score(Logreg, X, y, cv=10, scoring='accuracy').mean())

###### 4.3.3 Cross-Validation example: feature selection
Note: Decide whether the Newspaper feature should be included in the linear regression model on the advertising dataset  
-> result: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

data = pd.read_csv("Dataset/Advertising.csv", index_col=0)

# Every column except the response is a feature.
feature_cols = data.columns.drop('Sales').tolist()
X = data[feature_cols]
y = data['Sales']

# 10-fold CV; the scorer returns negated MSE, so flip the sign before the root.
ln = LinearRegression()
score = cross_val_score(ln, X, y, cv=10, scoring='neg_mean_squared_error')
rmse_per_fold = np.sqrt(-score)

# RMSE of each fold, then their average.
print(rmse_per_fold)
print(rmse_per_fold.mean())

In [None]:
# Repeat the 10-fold evaluation with the Newspaper column removed.
feature_cols = list(data.drop(['Newspaper','Sales'],axis=1).columns)
X = data[feature_cols]
# Take the root of each fold's MSE first and then average, i.e. the mean of
# the per-fold RMSE values. The all-features run reports its result as
# np.sqrt(-score).mean(); sqrt-of-the-mean-MSE is a different (never smaller)
# quantity, so using it here would bias the comparison against this model.
score = np.sqrt(-cross_val_score(ln, X, y, cv=10, scoring='neg_mean_squared_error')).mean()
print(score)

### 5. Bootstrap

In [None]:
import numpy
from pandas import read_csv
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
# load dataset (no header row; the last column is used as the label below)
# NOTE(review): this path is under "Datasets/" while the Advertising CSV is
# read from "Dataset/" — confirm which directory name is correct.
data = read_csv('./Datasets/pima-indians-diabetes.data', header=None)
values = data.values
# configure bootstrap
n_iterations = 1000
n_size = int(len(data) * 0.50)
# One seeded generator drives both the resampling and the tree construction,
# so the histogram and the interval are the same on every run.
rng = numpy.random.RandomState(0)
# run bootstrap
stats = list()
for i in range(n_iterations):
    # prepare train and test sets: train is drawn with replacement, test is
    # every row whose values do not appear anywhere in train (out-of-bag rows)
    train = resample(values, n_samples=n_size, random_state=rng)
    # Build the lookup once per iteration; the original rebuilt train.tolist()
    # for every candidate row, which made each iteration quadratic.
    sampled_rows = set(map(tuple, train.tolist()))
    out_of_bag = numpy.array([tuple(row) not in sampled_rows for row in values.tolist()], dtype=bool)
    # Boolean indexing keeps the 2-D shape and dtype even if no row is left.
    test = values[out_of_bag]
    # fit model: all columns but the last are features, the last is the label
    model = DecisionTreeClassifier(random_state=rng)
    model.fit(train[:,:-1], train[:,-1])
    # evaluate model
    predictions = model.predict(test[:,:-1])
    score = accuracy_score(test[:,-1], predictions)
    stats.append(score)
# plot scores
pyplot.hist(stats)
pyplot.show()
# confidence intervals: the central `alpha` mass of the bootstrap scores,
# i.e. the 2.5th and 97.5th percentiles for alpha = 0.95, clipped to [0, 1]
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, numpy.percentile(stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, numpy.percentile(stats, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))