# An end to end scikit learn workflow 

In [1]:
import pandas as pd
df=pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv")
##This the the Github raw link of the data

In [2]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


**Getting ready with data**

In [3]:
#Create X feature Matrix
x=df.drop('target',axis=1)

#Create Y (labels)
y=df['target']

**Choose the right estimator/algorithm for our problems**

In [4]:
#Choose the right model and hyperparameters

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

**Fit the RF model to the data**

In [9]:
#Fit the model to the data
from sklearn.model_selection import train_test_split as tt
xtrain,xtest,ytrain,ytest=tt(x,y,test_size=0.2)

In [10]:
rf.fit(xtrain,ytrain)

RandomForestClassifier()

**Making predictions**

In [11]:
#Make a prediction
import numpy as np
ypreds=rf.predict(xtest)
ypreds

array([1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0], dtype=int64)

In [14]:
print(type(ypreds))
print(type(ytest))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


**Evaluating the model**

In [16]:
#Evaluate the model on train data and test data
rf.score(xtrain,ytrain)

1.0

In [18]:
rf.score(xtest,ytest)

0.8360655737704918

In [20]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error

In [22]:
print(classification_report(ytest,ypreds))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        28
           1       0.85      0.85      0.85        33

    accuracy                           0.84        61
   macro avg       0.83      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61



In [23]:
confusion_matrix(ytest,ypreds)

array([[23,  5],
       [ 5, 28]], dtype=int64)

In [25]:
accuracy_score(ytest,ypreds)

0.8360655737704918

In [28]:
mean_squared_error(ytest,ypreds)

0.16393442622950818

So our model is 83% efficient

**Improve the model**

In [35]:
#trying with different hyperparameter (n_estimators in this case)
np.random.seed(42)
for i in range(10,100,10):
    print(f"Trying with model {i} estimator")
    rf=RandomForestClassifier(n_estimators=i).fit(xtrain,ytrain)
    print(f"Model accuracy here is : {rf.score(xtest,ytest)*100:.2f} % ")

Trying with model 10 estimator
Model accuracy here is : 83.61 % 
Trying with model 20 estimator
Model accuracy here is : 85.25 % 
Trying with model 30 estimator
Model accuracy here is : 83.61 % 
Trying with model 40 estimator
Model accuracy here is : 81.97 % 
Trying with model 50 estimator
Model accuracy here is : 80.33 % 
Trying with model 60 estimator
Model accuracy here is : 81.97 % 
Trying with model 70 estimator
Model accuracy here is : 81.97 % 
Trying with model 80 estimator
Model accuracy here is : 81.97 % 
Trying with model 90 estimator
Model accuracy here is : 85.25 % 


**Saving the model and loading it**

In [36]:
import pickle 

In [37]:
pickle.dump(rf,open("random_forest_model_1","wb"))

In [39]:
loaded_model=pickle.load(open("random_forest_model_1","rb"))

In [40]:
loaded_model

RandomForestClassifier(n_estimators=90)

In [41]:
loaded_model.score(xtest,ytest)

0.8524590163934426