In [7]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("/content/heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [3]:
df.shape

(1025, 14)

In [4]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(X_train.shape)
print(X_test.shape)

(820, 13)
(205, 13)


In [8]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svc = SVC()
lr = LogisticRegression()

In [9]:
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Gradient Booting without GridSearchCV is: ", accuracy)

Accuracy of Gradient Booting without GridSearchCV is:  0.9317073170731708


In [10]:
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of SVC without GridSearchCV is: ", accuracy)

Accuracy of SVC without GridSearchCV is:  0.6829268292682927


In [11]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of LogisticRegression without GridSearchCV is: ", accuracy)

Accuracy of LogisticRegression without GridSearchCV is:  0.7853658536585366


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy without GridSearchCV is: ", accuracy)

Accuracy without GridSearchCV is:  0.9853658536585366


In [13]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(), X, y, cv = 10, scoring = 'accuracy'))

0.9970588235294118

# **`GridSearchCV`**

In [14]:
n_estimators = [20, 60, 100, 120]
max_features = [0.2, 0.6, 1.0]
max_depth = [2, 8, None]
max_samples = [ 0.5, 0.75, 1.0]

In [22]:
search_space = {
    "n_estimators":n_estimators,
    "max_features":max_features,
    "max_depth": max_depth,
    "max_samples":max_samples
}

In [23]:
rf = RandomForestClassifier()

In [24]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator = rf, 
                       param_grid = search_space,
                       cv = 5,
                       verbose = 4, 
                       n_jobs = 1)

In [25]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END max_depth=2, max_features=0.2, max_samples=0.5, n_estimators=20;, score=0.866 total time=   0.0s
[CV 2/5] END max_depth=2, max_features=0.2, max_samples=0.5, n_estimators=20;, score=0.866 total time=   0.0s
[CV 3/5] END max_depth=2, max_features=0.2, max_samples=0.5, n_estimators=20;, score=0.848 total time=   0.0s
[CV 4/5] END max_depth=2, max_features=0.2, max_samples=0.5, n_estimators=20;, score=0.890 total time=   0.0s
[CV 5/5] END max_depth=2, max_features=0.2, max_samples=0.5, n_estimators=20;, score=0.811 total time=   0.0s
[CV 1/5] END max_depth=2, max_features=0.2, max_samples=0.5, n_estimators=60;, score=0.860 total time=   0.1s
[CV 2/5] END max_depth=2, max_features=0.2, max_samples=0.5, n_estimators=60;, score=0.872 total time=   0.1s
[CV 3/5] END max_depth=2, max_features=0.2, max_samples=0.5, n_estimators=60;, score=0.872 total time=   0.1s
[CV 4/5] END max_depth=2, max_features=0.2, max_samples=0

In [27]:
print("Best estimators: ", rf_grid.best_estimator_, end = "\n----------------------------------\n")
print("Best parameters: ", rf_grid.best_params_, end = "\n----------------------------------\n")
print("Best score: ", rf_grid.best_score_)


Best estimators:  RandomForestClassifier(max_features=0.2, max_samples=0.75, n_estimators=60)
----------------------------------
Best parameters:  {'max_depth': None, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 60}
----------------------------------
Best score:  0.9829268292682926
