In [1]:
# Loading Libraries

import pandas as pd 
from matplotlib import pyplot as plt 
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the dataset from the CSV in the working directory.
data = pd.read_csv("breast_cancer_data.csv")

# Remove the two columns that are not used as features. `drop` returns a new
# frame, so no in-place mutation is needed.
# NOTE(review): presumably 'id' is a row identifier and 'Unnamed: 32' an empty
# trailing column -- confirm against the CSV.
data = data.drop(columns=['id', 'Unnamed: 32'])

# Encode the target: benign ('B') -> 0, malignant ('M') -> 1.
data['diagnosis'] = data['diagnosis'].map({'B': 0, 'M': 1})

# `Series.map` turns every label missing from the mapping into NaN. Fail here
# with a clear message rather than later inside an estimator's `fit`.
assert data['diagnosis'].notna().all(), (
    "diagnosis column contains values other than 'B'/'M'"
)

# Feature matrix (every remaining column except the target) and target vector.
X = data.drop(columns=['diagnosis'])
y = data['diagnosis']

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [3]:
# Sanity check of the training split: (number of rows, number of feature columns).
X_train.shape

(381, 30)

In [None]:
# Exhaustive grid search over random-forest hyperparameters, scored by
# cross-validated accuracy on the training split.

# Estimators with default settings. Only `rnd` is tuned in this cell; the
# others are instantiated here but not used below.
log = LogisticRegression()
# Fixed seed so the forest's internal randomness -- and therefore the CV
# scores and the selected parameters -- are reproducible across runs. The
# value matches the seed used for the train/test split.
rnd = RandomForestClassifier(random_state=42)
adaboost = AdaBoostClassifier()
gBoost = GradientBoostingClassifier()
svm = SVC()

# random forest

# forest_hyperparams is a list holding one dictionary: each key is a
# RandomForestClassifier parameter name and each value is the list of
# candidate settings to try for it.
#   n_estimators : number of trees in the forest (library default is 100).
#   criterion    : split-quality measure.
#                  NOTE(review): 'log_loss' is only accepted by newer
#                  scikit-learn releases -- confirm the installed version.
#   max_depth    : maximum depth of each tree; range(10, 15) gives 10..14.
#   max_features : number of features considered when looking for the best
#                  split at each node (not per tree). It must not exceed the
#                  number of feature columns, so check X_train.shape[1] before
#                  changing this range.
forest_hyperparams = [{
    'n_estimators': [50, 80, 100, 150, 200, 300, 400, 500],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': list(range(10, 15)),
    'max_features': list(range(5, 20)),
}]

# cv=10 requests 10-fold cross-validation; if cv is omitted, GridSearchCV
# falls back to 5-fold.
# scoring selects the metric used to rank the parameter combinations.
# 'accuracy', 'precision' and 'recall' are classification metrics; a
# regression problem would use e.g. 'r2' or 'neg_mean_squared_error' instead.
# The grid has 8 * 3 * 5 * 15 = 1,800 combinations, i.e. 18,000 model fits
# with 10 folds, so n_jobs=-1 spreads the fits over all available CPU cores.
grid_clf = GridSearchCV(
    rnd,
    forest_hyperparams,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
)

grid_clf.fit(X_train, y_train)


In [4]:
# Scratch check of the max_depth candidates used in the random-forest grid:
# range(10, 15) excludes the upper bound, so 15 is not tried.
list(range(10,15))

[10, 11, 12, 13, 14]

In [None]:
# Report the outcome of the grid search, one item per line:
#   1. best_params_ -- the winning value for each key in forest_hyperparams
#   2. best_score_  -- the mean cross-validated score obtained with those values
print(grid_clf.best_params_, grid_clf.best_score_, sep="\n")