In [64]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

Load the data

In [66]:
df = pd.read_csv('heart.csv')
df.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
186,60,1,0,130,253,0,1,144,1,1.4,2,1,3,0
47,47,1,2,138,257,0,0,156,0,0.0,2,0,2,1
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2,0
170,56,1,2,130,256,1,0,142,1,0.6,1,1,1,0
226,62,1,1,120,281,0,0,103,0,1.4,1,1,3,0


EDA

In [68]:
df.shape

(303, 14)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


##### select features and label

In [71]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [72]:
# split the data into training and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

In [73]:
# initialize models
rafc = RandomForestClassifier()
svc = SVC()
lor = LogisticRegression()

In [74]:
# training and accuracy of random forest classifier
rafc.fit(X_train, y_train)
pred1 = rafc.predict(X_test)
accuracy_score(y_test, pred1)

0.8524590163934426

In [75]:
# training and accuracy of support vector classification
svc.fit(X_train, y_train)
pred2 = svc.predict(X_test)
accuracy_score(y_test, pred2)

0.639344262295082

In [76]:
# training and accuracy of logistic regression
lor.fit(X_train, y_train)
pred3 = lor.predict(X_test)
accuracy_score(y_test, pred3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7868852459016393

In [77]:
# since our model only splits the data once, and trains, but we want to train and validate our models on all of the available data, 
# so we use cross-validation
cv1 = np.mean(cross_val_score(rafc, X, y, cv=10, scoring='accuracy'))
cv2 = np.mean(cross_val_score(svc, X, y, cv=10, scoring='accuracy'))
cv3 = np.mean(cross_val_score(lor, X, y, cv=10, scoring='accuracy'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [78]:
print(f"Cross_val_score of RandomForestClassifier: {cv1}")
print(f"Cross_val_score of SupportVectorClassification: {cv2}")
print(f"Cross_val_score of LogisticRegression: {cv3}")

Cross_val_score of RandomForestClassifier: 0.831505376344086
Cross_val_score of SupportVectorClassification: 0.6604301075268817
Cross_val_score of LogisticRegression: 0.8282795698924732


#### HyperParameter Tuning for Random Forest Classifier

In [117]:
# Number of trees in random forest
n_estimators = [20, 60, 100, 120]
# Maximum number of levels in tree
max_depth = [2, 8, 10, None]

# Number of features to consider at every split
max_features = [0.25, 0.50, 0.75, 1.0]

# Number of samples
max_samples = [0.25, 0.50, 0.75, 1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]


In [119]:
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
    'max_samples': max_samples,
    'bootstrap': bootstrap,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_depth': [2, 8, 10, None], 'max_features': [0.25, 0.5, 0.75, 1.0], 'max_samples': [0.25, 0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [82]:
grid_cv = GridSearchCV(
    estimator = rafc,
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1
)

In [103]:
# to train the model again with different parameters, to find the best model
grid_cv.fit(X_train, y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


In [135]:
grid_cv.best_score_

0.8592687074829932

In [111]:
rafc2 = RandomForestClassifier(max_features=0.5, max_samples=0.25, n_estimators=120)

In [127]:
# randomizedSearchCV, faster because randomly chooses the parameters to test, can be helpful for larger datasets which can give best closest result

random_cv = RandomizedSearchCV(
    estimator=rafc,
    param_distributions = param_grid, 
                       cv = 5, 
                       verbose=2, 
                       n_jobs = -1)


In [131]:
random_cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\12368\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\12368\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\12368\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 433, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `ma

In [133]:
random_cv.best_params_

{'n_estimators': 60,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_samples': 1.0,
 'max_features': 0.5,
 'max_depth': 2,
 'bootstrap': True}