In [2]:
import pandas as pd
data = pd.read_csv('breast_cancer.csv')
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
# removing id column
data.drop("id", axis = 'columns', inplace = True)

# removing Unnamed: 32 column
data.drop("Unnamed: 32", axis = 'columns', inplace = True)

In [4]:
# converting categorical target variable to numerical i.e. diagnosis
data.replace({"diagnosis" : {"M":1,"B":0}}, inplace = True)
data.diagnosis

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64

In [5]:
# checking for relevant independent variables 
data.groupby("diagnosis").mean()

# based on the mean we remove:
# 1.) symmetry_mean
# 2.) fractal_dimension_mean
# 3.) smoothness_worst
# 4.) fractal_dimension_worst
# from our analysis

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442
1,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153


In [6]:
# dropping these variables from our data
data.drop(['symmetry_mean', 'fractal_dimension_mean', 'smoothness_worst', 'fractal_dimension_worst'], axis = 'columns', 
         inplace = True)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [9]:
x = data.drop('diagnosis', axis = 'columns')

In [10]:
y = data.diagnosis

In [33]:
from sklearn.model_selection import cross_val_score
lr_score = cross_val_score(LogisticRegression(max_iter = 3500), x,y, cv = 5)
lr_score

array([0.93859649, 0.94736842, 0.98245614, 0.92982456, 0.96460177])

In [34]:
rf_score = cross_val_score(RandomForestClassifier(n_estimators = 100), x,y, cv = 5)
rf_score

array([0.92105263, 0.93859649, 0.98245614, 0.97368421, 0.99115044])

In [35]:
dt_score = cross_val_score(DecisionTreeClassifier(), x,y, cv = 5)
dt_score

array([0.90350877, 0.9122807 , 0.92105263, 0.95614035, 0.92035398])

In [36]:
svm_score = cross_val_score(SVC(), x,y, cv = 5)
svm_score

array([0.85087719, 0.89473684, 0.92982456, 0.93859649, 0.9380531 ])

In [38]:
# taking mean of all scores and selecting the best algorithm
import numpy as np
print("Mean logistic regression score is: ",np.mean(lr_score))
print("Mean random forest score: ",np.mean(rf_score))
print("Mean decision tree score: ",np.mean(dt_score))
print("Mean support vector machine score: ",np.mean(svm_score))

Mean logistic regression score is:  0.9525694767893184
Mean random forest score:  0.9613879832324173
Mean decision tree score:  0.9226672876882471
Mean support vector machine score:  0.9104176370128861


# So the best algorithm for our dataset is Random Forest