In [10]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [11]:
df = pd.read_csv('heart.csv')

In [12]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [13]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [24]:
rf = RandomForestClassifier()
lr = LogisticRegression()
svc = SVC()

In [25]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8852459016393442

In [26]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8852459016393442

In [27]:
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.6721311475409836

In [28]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(), X, y, cv = 10, scoring='accuracy'))

np.float64(0.8479569892473119)

**GRID SEARCH CV (For Hyperparameter tuning) (tests all combination of parameters)(in this case : 4 * 3 * 3 * 3 = 108 random forest classifiers)**

In [45]:
n_estimators = [20, 60, 100, 120]
max_features = [0.2, 0.6, 1.0]
max_depth = [2, 8, None]
max_samples = [0.5, 0.75, 1.0]

In [46]:
param_grid = { 'n_estimators' : n_estimators,
                'max_features' : max_features,
                'max_depth' : max_depth,
                'max_samples' : max_samples
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [47]:
rf  = RandomForestClassifier()

In [34]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator = rf, param_grid= param_grid, cv = 5, verbose= 2, n_jobs = -1)

In [35]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [36]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 120}

In [37]:
rf_grid.best_score_

np.float64(0.8387755102040817)

**RANDOM SEARCH CV (Randomly selects the parameters values)(Used on large datasets and for too many hyperparameters, so as to reduce the time!!!)**

In [48]:
n_estimators = [20, 60, 100, 120]
max_features = [0.2, 0.6, 1.0]
max_depth = [2, 8, None]
bootstrap = [True, False]
max_samples = [0.5, 0.75, 1.0]
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]

In [49]:
param_grid = { 'n_estimators' : n_estimators,
                'max_features' : max_features,
                'max_depth' : max_depth,
                'max_samples' : max_samples,
                'bootstrap': bootstrap,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf 
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [50]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv = 5, verbose = 2, n_jobs=-1)

In [51]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\ensemble\_forest.py", line 431, in fit
    raise ValueError(
    ...<3 lines>..

In [52]:
rf_grid.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_samples': 1.0,
 'max_features': 0.2,
 'max_depth': 8,
 'bootstrap': True}

In [54]:
rf_grid.best_score_

np.float64(0.8101190476190476)