# Scikit-Learn course 4
# III Making predictions with a model

reminder : 
* `X` = features, features variables, data
* `y` = labels, target, target variables

## 0. Standards import

In [3]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline 

In [4]:
# ---> y (axis=1)
# |
# |
# x (axis=0)

## 1. Making predictions with a model : Classifier

### 1.1 Fitting a model to the data

In [5]:
# Import the RandomForestClassifier model class from the ensemble module
from sklearn.ensemble import RandomForestClassifier

# Importing data :
heart_disease = pd.read_csv("data/heart-disease.csv")

# Split the data into X (features/data) and y (target/labels)
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the model (on the training set) (call clf)
clf = RandomForestClassifier()

# Call the fit method on the model and pass it training data
clf.fit(X_train, y_train)

# Check the score of the model (on the test set)
clf.score(X_test, y_test)

0.7704918032786885

### 1.2 Predictions
2 ways to make predictions :
1. `predict()`
2. `predict_proba()`

In [6]:
y_preds = clf.predict(X_test)
y_preds

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0])

In [7]:
np.array(y_test)

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0])

In [8]:
# compare prediction to truth labels to evaluate the model
np.mean(y_preds == y_test) # same as clf.score for a classification probleme

0.7704918032786885

In [9]:
clf.score(X_test, y_test)

0.7704918032786885

In [10]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.7704918032786885

Make prediction using `predict_proba()`
<br>
`predict_proba()` returns probabilities of a classification label

In [11]:
clf.predict_proba(X_test)[:10]

array([[0.12, 0.88],
       [0.22, 0.78],
       [0.3 , 0.7 ],
       [0.14, 0.86],
       [0.  , 1.  ],
       [0.93, 0.07],
       [0.22, 0.78],
       [0.98, 0.02],
       [0.98, 0.02],
       [0.  , 1.  ]])

In [12]:
# Let's predict() on the same data 
clf.predict(X_test[:10])

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 1])

if the first one on `predict_proba` is [0.66, 0.34]
<br>
then the first one on `predict` is 0
<br><br>
then we understand `predict_proba` return the probabilities for ech prediction to be 1 AND 0 (for this data)
<br> 
for the first one, the proba to be 0 is 66% and the proba to be 1 is 34%(`predict_proba`) ; then `predict` return us 0

## 2. Making predictions with a model : Regressor

### 2.1 Fitting a model to the data

In [13]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df["target"] = pd.Series(housing["target"])

X = housing_df.drop("target", axis=1)
y = housing_df.target # median house price in $100k

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

model = RandomForestRegressor(n_estimators=100)
# n_estimators is the number of different model that RandomForestRegressor is gonna use

model.fit(X_train,y_train)
model.score(X_test,y_test)

0.807352481049985

### 2.2 Predictions

In [14]:
y_preds = model.predict(X_test)
y_preds[:10]

array([1.01404  , 0.952    , 1.30071  , 1.78206  , 2.6553905, 1.12759  ,
       0.85905  , 2.01749  , 1.09058  , 1.9647802])

In [15]:
np.array(y_test)[:10]

array([0.949, 0.722, 0.965, 1.688, 3.238, 1.064, 1.125, 1.   , 1.188,
       2.298])

In [16]:
# calculate the mean difference between the prediction (y_preds) and truth (y_test)
# lower is best
np.mean(np.abs(y_preds[:10] - np.array(y_test)[:10]))

0.3085089299999996

In [17]:
# there exists a function in sklearn that already does that :
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_preds[:10], np.array(y_test)[:10])

0.3085089299999996

In [18]:
percentage_error_array = y_preds[:10]*100/np.array(y_test)[:10]
percentage_error_array

array([106.85353003, 131.85595568, 134.78860104, 105.57227488,
        82.00711859, 105.97650376,  76.36      , 201.749     ,
        91.7996633 ,  85.49957354])

In [19]:
np.mean(percentage_error_array)

112.24622208210641

In [20]:
# 3h27