# Introduction to Scikit-Learn (sklearn)

0. An end to end sklearn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predictions on our data
4. =>Evaluating the model 
5. Improve the model
6. Save and load trained model
7. Putting it all together!

## 4.2 Scoring Parameter

In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score

### 4.2.1 For classification problems

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Read the heart-disease table from the local dataset folder.
heart_disease = pd.read_csv("dataset/heart-disease.csv")

# Features are every column except "target"; "target" itself is the label.
X = heart_disease.drop(columns="target")
y = heart_disease["target"]

# Seed NumPy's global RNG right before the first randomised step so the
# split and the forest below are repeatable from run to run.
np.random.seed(42)

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

# Build a default random-forest classifier and fit it on the training split
# (fit() returns the estimator itself, so `rfc` is the fitted model).
rfc = RandomForestClassifier().fit(X_train, y_train)

In [27]:
# 5-fold cross-validation with scoring=None, i.e. no explicit metric is passed.
# The printed mean matches the scoring="accuracy" run, which is what the
# original note (default = "accuracy") refers to.
np.random.seed(42)
cv_accuracy = cross_val_score(
    rfc, X, y, cv=5, scoring=None
)
cv_accuracy.mean()

0.8248087431693989

In [28]:
# 5-fold cross-validated accuracy, requested explicitly this time.
np.random.seed(42)
cv_accuracy = cross_val_score(
    rfc, X, y, cv=5, scoring="accuracy"
)
cv_accuracy.mean()

0.8248087431693989

In [29]:
# 5-fold cross-validated precision, averaged over the folds.
np.random.seed(42)
cv_precision = cross_val_score(
    rfc, X, y, cv=5, scoring="precision"
)
cv_precision.mean()

0.8329547346025924

In [30]:
# 5-fold cross-validated recall, averaged over the folds.
np.random.seed(42)
cv_recall = cross_val_score(
    rfc, X, y, cv=5, scoring="recall"
)
cv_recall.mean()

0.8545454545454545

In [31]:
# 5-fold cross-validated F1 score, averaged over the folds.
np.random.seed(42)
cv_f1 = cross_val_score(
    rfc, X, y, cv=5, scoring="f1"
)
cv_f1.mean()

0.8426854603423346

### 4.2.2 For regression problems

In [34]:
# Estimator and splitting helper used in the regression example
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the Boston housing data and put it in a DataFrame, one column per
# feature plus a "target" column.
# NOTE(review): load_boston is deprecated/removed in recent scikit-learn
# releases (reportedly removed in 1.2) — confirm the installed version.
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df["target"] = boston.target

# Features are every column except "target"; "target" is the value to predict.
X = boston_df.drop(columns="target")
y = boston_df["target"]

# Seed NumPy's global RNG so the split and the forest are repeatable.
np.random.seed(42)

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

# Build a default random-forest regressor and fit it on the training split
# (fit() returns the estimator itself, so `rfr` is the fitted model).
rfr = RandomForestRegressor().fit(X_train, y_train)

In [38]:
# Cross-validation with scoring=None, i.e. no explicit metric is passed.
# The printed mean matches the scoring='r2' run, which is what the original
# note (default scoring method = 'r2') refers to.
np.random.seed(42)
cv_default = cross_val_score(
    rfr, X, y, scoring=None
)
cv_default.mean()

0.6243870737930857

In [37]:
# Cross-validated R^2 (coefficient of determination), requested explicitly
# and averaged over the folds.
np.random.seed(42)
cv_r2 = cross_val_score(
    rfr, X, y, scoring='r2'
)
cv_r2.mean()

0.6243870737930857

In [40]:
# Cross-validated mean absolute error, one value per fold.
# The 'neg_' scorer reports the error negated, hence the negative numbers.
np.random.seed(42)
cv_mean_abs_error = cross_val_score(
    rfr, X, y, scoring='neg_mean_absolute_error'
)
cv_mean_abs_error

array([-2.11419608, -2.58716832, -3.33976238, -3.78563366, -3.32941584])

In [41]:
# Cross-validated mean squared error, one value per fold.
# The 'neg_' scorer reports the error negated, hence the negative numbers.
np.random.seed(42)
cv_mean_squared_error = cross_val_score(
    rfr, X, y, scoring='neg_mean_squared_error'
)
cv_mean_squared_error

array([ -7.86326927, -12.71243178, -20.29089194, -45.93287403,
       -19.50774726])