# Scikit-Learn 
- built on numpy and matplotlib
- has many built-in ML models
- methods to evaluate ML models
- well designed API


**Workflow**:

    1.) Get Data Ready
    2.) Pick an Estimator/Algorithm for the problem Model
    3.) Fit the Model/Algorithm
    4.) Evaluate the Model
    5.) Experiment/Compare/Improve Model
    6.) Save and Load Model to an Application

## Heart Disease

In [3]:
import pandas as pd
import numpy as np

In [None]:
heart_disease = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv")
heart_disease

In [None]:
# Labels: the "target" column on its own
y = heart_disease["target"]

# Features matrix: every column except "target"
X = heart_disease.drop(columns="target")

### Splitting the Data

    1. Split data into features and labels
    2. Filling/imputing/disregarding missing values
    3. Convert non-numerical values to numerical values (a.k.a. feature encoding)

In [None]:
# 1. Split the features and labels into train and test portions.
# test_size=0.2 holds out 20% of the rows for testing; the rest is used for training.
# NOTE: only a train/test split is made here -- no separate validation (dev) set is carved out.
# No random_state is passed, so the rows chosen can differ from run to run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# 2.


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier()

# Keep default hyperparams for now
clf.get_params()

### Fit the Model/Algorithm

In [None]:
clf.fit(X_train, y_train)

In [None]:
# Make a prediction
y_preds = clf.predict(X_test)
y_preds

In [None]:
y_test

### Evaluate the Model

In [None]:
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

In [None]:
confusion_matrix(y_test, y_preds)

In [None]:
accuracy_score(y_test, y_preds)

### Improve Model

In [None]:
# Tweak hyperparams
np.random.seed(42)

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:0.2f} %");
    print("")

### Save and Load Model

In [None]:
import pickle

# Write the fitted classifier to disk. The context manager guarantees the file
# is flushed and closed even if pickling raises, instead of leaving the handle
# open until garbage collection.
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Read the classifier back from disk and check it scores like the in-memory one.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created or trust.
with open("random_forest_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(X_test, y_test)

## Car Sales (OHE)

In [None]:
car_sales = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended.csv")
car_sales

In [None]:
len(car_sales)

In [None]:
car_sales.dtypes

#### Get Features/Label and Split Dataset

In [None]:
# Features are every column except "Price"; "Price" is the value to predict.
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

# Hold out 20% of the rows for testing.
# NOTE(review): this split is made on the un-encoded X; the same four names are
# reassigned later in this section by a split of the one-hot-encoded matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### Preprocess (fill NAs and turn categorical values to numerical)

In [None]:
 # OHE Method 1
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode; remainder="passthrough" keeps every other column as it is.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
encoding_step = ("one_hot", one_hot, categorical_features)
transformer = ColumnTransformer(transformers=[encoding_step],
                                remainder="passthrough")

# Fit the encoder on X and encode X in one call.
transformed_X = transformer.fit_transform(X)
transformed_X

In [None]:
pd.DataFrame(transformed_X)

In [None]:
# OHE Method 2
# By default get_dummies only encodes object/string/category columns, so a numeric
# "Doors" column would be returned untouched. Naming the columns explicitly makes
# all three get one-hot encoded regardless of their dtype.
encoded_columns = ["Make", "Colour", "Doors"]
dummies = pd.get_dummies(car_sales[encoded_columns], columns=encoded_columns)
dummies

#### Choose an Estimator/Algorithm

In [None]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model = RandomForestRegressor()

#### Fit the Algorithm

In [None]:
model.fit(X_train, y_train)

#### Evaluate the Algorithm

In [None]:
model.score(X_test, y_test)

## Car Sales (Handle Missing Data)

    1. Fill missing data with some value (imputation)
    2. Remove samples with missing data altogether

In [None]:
# import car sales missing data
car_sales_missing = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv")
car_sales_missing

#### Check and Fill Missing Values

In [None]:
car_sales_missing.isna().sum()

##### Fill Na Option 1: with Pandas

In [None]:
# Each filled column is assigned back to the frame. Calling
# fillna(..., inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and, with Copy-on-Write enabled, it no longer
# modifies car_sales_missing at all.

# fill "Make" column
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")

# fill "Colour" column
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

# fill "Odometer (KM)" column with the mean of the values that are present
odometer_mean = car_sales_missing["Odometer (KM)"].mean()
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(odometer_mean)

# fill "Doors" column
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)


In [None]:
car_sales_missing.isna().sum()

In [None]:
# remove rows with missing "Price" value
# subset=["Price"] restricts the check to the label column, so rows are dropped
# only when the value to predict is missing.
car_sales_missing.dropna(subset=["Price"], inplace=True)

In [None]:
car_sales_missing.isna().sum()

#### Create Features/Label

In [None]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# OHE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the three categorical columns; all other columns pass through unchanged.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
encoding_step = ("one_hot", one_hot, categorical_features)
transformer = ColumnTransformer(transformers=[encoding_step],
                                remainder="passthrough")

# Fit on X and encode it in one call.
transformed_X = transformer.fit_transform(X)
transformed_X

##### Fill na with Option 2: with Scikit-learn


In [None]:
car_sales_missing = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv")
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
car_sales_missing.dropna(subset=["Price"], inplace=True) # drop nan values that are in the subset of price column
car_sales_missing.isna().sum()

#### Create Features/Label

In [None]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

#### Fill Missing Values

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Column groups, one per fill strategy
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Categorical gaps become the string "missing", door gaps become 4,
# and numerical gaps become the column mean.
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Pair each imputer with the columns it is responsible for
imputation_steps = [
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features),
]
imputer = ColumnTransformer(transformers=imputation_steps)

# Learn the fill values from X and apply them in one call
filled_X = imputer.fit_transform(X)
filled_X

In [None]:
# Wrap the imputed values in a DataFrame again so columns can be referred to by name.
# The names are listed in the order the imputer's column groups were declared
# (Make, Colour, Doors, Odometer (KM)) -- presumably filled_X carries no labels of
# its own, so this list must be kept in sync with that order by hand.
car_sales_filled = pd.DataFrame(filled_X, 
                               columns=["Make", "Colour", "Doors", "Odometer (KM)"])
car_sales_filled.head()

In [None]:
car_sales_filled.isna().sum()

#### Convert Categorical into Numbers

In [None]:
# OHE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the three categorical columns; all other columns pass through unchanged.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
encoding_step = ("one_hot", one_hot, categorical_features)
transformer = ColumnTransformer(transformers=[encoding_step],
                                remainder="passthrough")

# Fit on the imputed frame and encode it in one call.
transformed_X = transformer.fit_transform(car_sales_filled)
transformed_X

#### Split Dataset

In [None]:
np.random.seed(42)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

#### Choosing the Right Algorithm/Model

In [None]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

## CA Housing Dataset (Choosing a Regression Estimator/Algorithm/ML Model)

- Classification problem: tasked with predicting a category (e.g. heart disease or not)
- Regression problem: tasked with predicting a number (e.g. price of a car)

Choosing your estimator roadmap:

![image.png](attachment:image.png)

In [1]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])

In [5]:
housing_df['target'] = housing['target']
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


#### Split Dataset

In [7]:
from sklearn.model_selection import train_test_split
np.random.seed(42)

X = housing_df.drop("target", axis=1)
y = housing_df["target"] # median house price in 100,000s

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### Choosing the Model (Regression)

Follow that roadmap pic from above to decide which models to consider. 
Since we don't know which model is best, and what features are most important, try different estimators/models/algorithms to see which ones perform the best.

##### Try Ridge Regression (Fit, Evaluate)

In [8]:
from sklearn.linear_model import Ridge

model = Ridge()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.5758549611440126

##### Try Ensemble Regressors

An Ensemble is a combination of smaller models that tries to make one prediction. This is supposedly better than just one single model making a prediction.

In [9]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8065734772187598

##### Try Lasso

In [10]:
from sklearn import linear_model
model = linear_model.Lasso(alpha=0.1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.5318167610318161

#### Try ElasticNet

In [11]:
model = linear_model.ElasticNet(random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.41655189098028234

##### Try SGD Regressor

In [12]:
# Fit a linear model trained by stochastic gradient descent and report its score on the test set.
# NOTE(review): the features are passed in unscaled, and the score recorded for this
# cell is a hugely negative number -- presumably the optimiser diverged. Standardising
# the features first (e.g. StandardScaler) would likely fix it; TODO confirm.
model = linear_model.SGDRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

-7.457601471698282e+28

## Heart Disease (Choosing a Classification Estimator)

In [16]:
heart_disease = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [17]:
len(heart_disease)

303

### Create Features/Label and Split Dataset

In [19]:
from sklearn.model_selection import train_test_split

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Choose an Estimator (Fit, Evaluate)

#### Try Linear SVC

In [24]:
from sklearn.svm import LinearSVC
np.random.seed(42)

model = LinearSVC()
model.fit(X_train, y_train)
model.score(X_test, y_test)



0.8524590163934426

#### Try Ensemble Classifiers

In [27]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8032786885245902

**If you have structured data, use ensemble methods. If you have unstructured data, use deep learning or transfer learning.**

## Heart Disease (Fitting Model to Data)

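You know what to call on scikit-learn to fit the data, but you should be able to describe/showcase what is under the hood of `fit()`. 

Make sure you understand the concept of forward pass and backpropagation (these describe how neural networks are trained; a random forest's `fit()` instead grows decision trees on bootstrapped samples of the data). Be prepared to write it out from scratch, and do some math. 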

```
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

model = RandomForestClassifier()
```

**model.fit(X_train, y_train)**
```
model.score(X_test, y_test)
```

### Make Classification Predictions with the Model

#### Option 1: `predict()`

In [30]:
X_test.head(), X_test.shape

(     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak   
 233   64    1   0       120   246    0        0       96      1      2.2  \
 30    41    0   1       105   198    0        1      168      0      0.0   
 279   61    1   0       138   166    0        0      125      1      3.6   
 81    45    1   1       128   308    0        0      170      0      0.0   
 229   64    1   2       125   309    0        1      131      1      1.8   
 
      slope  ca  thal  
 233      0   1     2  
 30       2   1     2  
 279      1   1     2  
 81       2   0     2  
 229      1   0     3  ,
 (61, 13))

In [31]:
model.predict(X_test) # this is our prediction for X_test array

array([0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1], dtype=int64)

In [34]:
# Compare predictions to truth labels
y_preds = model.predict(X_test)

# the following is the equivalent of: 
# model.score(X_test, y_test)
np.mean(y_preds == y_test) # get the average of correct predictions between y_preds and y_test

0.8032786885245902

In [35]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_preds) # still the equivalent. 

0.8032786885245902

#### Option 2: `predict_proba()`

This returns probabilities of a classification label.
Useful to use if you only want to know which labels have a probability higher than X%. Maybe good for debugging/experimenting/tweaking the model.

In [39]:
model.predict_proba(X_test[:5]) # gives you the probability percentage for each label (no heart disease, yes heart disease)

array([[0.89, 0.11],
       [0.16, 0.84],
       [0.99, 0.01],
       [0.05, 0.95],
       [0.67, 0.33]])

In [38]:
model.predict(X_test[:5]) # just gives you the prediction label 

array([0, 1, 0, 1, 0], dtype=int64)

## CA Housing Market (Making Regression Predictions with Our Model)

In [41]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [43]:
housing_df = pd.DataFrame(housing['data'], columns=housing["feature_names"])
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [44]:
housing_df["target"] = housing["target"]
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG before anything random happens
np.random.seed(42)

# "target" is the label; every other column is a feature
y = housing_df["target"]
X = housing_df.drop(columns="target")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit() returns the estimator itself, so construction and training are chained
model = RandomForestRegressor().fit(X_train, y_train)
model.score(X_test, y_test)

0.8065734772187598

In [51]:
model.predict(X_test)

array([0.49384  , 0.75494  , 4.9285964, ..., 4.8363785, 0.71782  ,
       1.67901  ])

In [54]:
# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error

# Recompute y_preds from the current regressor. Without this line y_preds is
# whatever an earlier cell left behind (the heart-disease class labels), which
# has a different length and meaning from this y_test.
y_preds = model.predict(X_test)

# Mean absolute error: the average absolute gap between a prediction and the true value.
mean_absolute_error(y_test, y_preds) # on average, each prediction is about 0.33 away from the actual test value.

0.32659871732073664

## Evaluating ML Model Score

The 3 ways to evaluate an ML model:

    1. Estimator's built-in `score()` method
    2. The `scoring` parameter
    3. Problem-specific metric functions

####  built-in score() for classification

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG before anything random happens
np.random.seed(42)

# "target" is the label; every other column is a feature
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit() returns the estimator itself, so construction and training are chained
model = RandomForestClassifier().fit(X_train, y_train)
model.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [58]:
model.score(X_train, y_train) # The highest value is 1.0, lowest value is 0.0

1.0

In [59]:
model.score(X_test, y_test)

0.8524590163934426

#### built-in score() method for regression problem

In [60]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X = housing_df.drop("target", axis=1)
y = housing_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.predict(X_test)

array([1.7471   , 1.23883  , 2.3511001, ..., 2.8758201, 1.60219  ,
       1.84951  ])

In [61]:
model.score(X_train, y_train)

0.9729067522564939

In [62]:
model.score(X_test, y_test)

0.8095334567787089

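The default `score()` evaluation metric for regression estimators is R² (the coefficient of determination).

Highest = 1.0. There is no fixed lowest value: a model that always predicts the mean of the targets scores 0.0, and a model that does worse than that scores below 0.0 (see the SGD regressor score earlier in these notes).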

#### Scoring Parameters

![image.png](attachment:image.png)

In [63]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestClassifier()
model.fit(X_train, y_train);


In [64]:
model.score(X_test, y_test)

0.8524590163934426

In [65]:
cross_val_score(model, X, y) # 5 different scores for 5 different versions of the training data

array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])

In [66]:
np.random.seed(42)

# get single training and test split score
model_single_score = model.score(X_test, y_test)

# take the mean of 5-fold CV
model_cross_val_score = np.mean(cross_val_score(model, X, y)) # default scoring parameter = mean accuracy

# compare the two
model_single_score, model_cross_val_score # you should prefer the CV metric over the single score

(0.8524590163934426, 0.8248087431693989)

### Classification Evaluation Metrics -- Accuracy
1. Accuracy
2. Area under ROC curve
3. Confusion Matrix
4. Classification Report




#### Accuracy

In [68]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

model = RandomForestClassifier()
cross_val_score = cross_val_score(model, X, y)

In [71]:
print(f"Heart Disease Classifier Cross-Validated Accuracy: {np.mean(cross_val_score) * 100} %")
print(f"this score shows that-- given a sample the model hasn't seen before, it is {np.mean(cross_val_score) * 100}% likely to predict the right label.")

Heart Disease Classifier Cross-Validated Accuracy: 82.48087431693989 %
this score shows that-- given a sample the model hasn't seen before, it is 82.48087431693989% likely to predict the right label.


#### ROC Curve and AUC

AUC = area under the receiver operating characteristic (ROC) curve

ROC is a comparison of the model's true positive rate (TPR) Vs false positive rate (FPR). 

True Positive = model predicts 1 when truth is 1
True Negative = model predicts 0 when truth is 0
False Positive = model predicts 1 when truth is 0
False Negative = model predicts 0 when truth is 1

In [None]:
from sklearn.metrics import roc_curve
