#Introduction to Scikit-Learn (sklearn)

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [6]:
# Split data into X & y
X = heart_disease.drop("target", axis=1) # use all columns except target
y = heart_disease["target"] # we want to predict y using X

In [7]:
# Random Forest Classifier (for classification problems)
from sklearn.ensemble import RandomForestClassifier
# Instantiating a Random Forest Classifier (clf short for classifier)
clf = RandomForestClassifier()

In [8]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [10]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
# Example use case (requires X & y)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [12]:
# All models/estimators have the fit() function built-in
clf.fit(X_train, y_train)
# Once fit is called, you can make predictions using predict()
y_preds = clf.predict(X_test)

# You can also predict with probabilities (on classification models)
y_probs = clf.predict_proba(X_test)

# View preds/probabilities
y_preds, y_probs

(array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
        1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1]),
 array([[0.98, 0.02],
        [0.83, 0.17],
        [0.89, 0.11],
        [0.99, 0.01],
        [0.63, 0.37],
        [0.79, 0.21],
        [0.06, 0.94],
        [0.88, 0.12],
        [0.04, 0.96],
        [0.62, 0.38],
        [0.43, 0.57],
        [0.59, 0.41],
        [0.86, 0.14],
        [0.23, 0.77],
        [0.96, 0.04],
        [0.79, 0.21],
        [0.13, 0.87],
        [0.18, 0.82],
        [0.07, 0.93],
        [0.11, 0.89],
        [0.24, 0.76],
        [0.55, 0.45],
        [0.39, 0.61],
        [0.72, 0.28],
        [0.25, 0.75],
        [0.56, 0.44],
        [0.1 , 0.9 ],
        [0.34, 0.66],
        [0.93, 0.07],
        [0.96, 0.04],
        [0.38, 0.62],
        [0.67, 0.33],
        [0.04, 0.96],
        [0.04, 0.96],
        [0.4 , 0.6 ],
        [0.34, 0.66],

In [13]:
# All models/estimators have a score() function
clf.score(X_test, y_test)

0.7868852459016393

In [15]:
clf.score(X_train, y_train)

1.0

In [14]:
# Different classification metrics

# Accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_preds))

# Reciver Operating Characteristic (ROC curve)/Area under curve (AUC)
from sklearn.metrics import roc_curve, roc_auc_score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_probs[:, 1])
print(roc_auc_score(y_test, y_preds))

# Confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_preds))

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))

0.7868852459016393
0.7922077922077922
[[24  4]
 [ 9 24]]
              precision    recall  f1-score   support

           0       0.73      0.86      0.79        28
           1       0.86      0.73      0.79        33

    accuracy                           0.79        61
   macro avg       0.79      0.79      0.79        61
weighted avg       0.80      0.79      0.79        61



In [19]:
np.random.seed(42)
for i in range(10,100,10):
    clf=RandomForestClassifier(n_estimators=i).fit(X_train,y_train)
    print(f"Model accuracy on test set: {clf.score(X_test,y_test)*100}")
    print("")

Model accuracy on test set: 75.40983606557377

Model accuracy on test set: 77.04918032786885

Model accuracy on test set: 80.32786885245902

Model accuracy on test set: 86.88524590163934

Model accuracy on test set: 80.32786885245902

Model accuracy on test set: 78.68852459016394

Model accuracy on test set: 85.24590163934425

Model accuracy on test set: 75.40983606557377

Model accuracy on test set: 80.32786885245902



In [20]:
# Saving a model with pickle
import pickle

# Save an existing model to file
pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))
# Load a saved pickle model
loaded_pickle_model = pickle.load(open("random_forest_model_1.pkl", "rb"))

# Evaluate loaded model
loaded_pickle_model.score(X_test, y_test)

0.8032786885245902

In [23]:
import sklearn
sklearn.show_versions()


System:
    python: 3.14.2 | packaged by Anaconda, Inc. | (main, Dec 19 2025, 11:42:52) [Clang 20.1.8 ]
executable: /Users/aravindsama/Desktop/data_science_project/DataScienceProjects/sample-project/env/bin/python
   machine: macOS-26.2-arm64-arm-64bit-Mach-O

Python dependencies:
      sklearn: 1.8.0
          pip: 26.0.1
   setuptools: 80.10.2
        numpy: 2.4.2
        scipy: 1.16.3
       Cython: None
       pandas: 3.0.0
   matplotlib: 3.10.8
       joblib: 1.5.3
threadpoolctl: 3.5.0

Built with OpenMP: True

threadpoolctl info:
       user_api: blas
   internal_api: openblas
    num_threads: 8
         prefix: libopenblas
       filepath: /Users/aravindsama/Desktop/data_science_project/DataScienceProjects/sample-project/env/lib/libopenblasp-r0.3.31.dylib
        version: 0.3.31
threading_layer: pthreads
   architecture: neoversen1

       user_api: openmp
   internal_api: openmp
    num_threads: 8
         prefix: libomp
       filepath: /Users/aravindsama/Desktop/data_science