In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [42]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix
%matplotlib inline

## Importing the Dataset

In [3]:
# Load the breast-cancer CSV into a DataFrame and preview its first rows.
df = pd.read_csv('../input/breast-cancer-dataset/breast-cancer.csv')
df.head()

## Exploratory Data Analysis

In [4]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

### Checking correlation graph

In [5]:
import matplotlib 
plt.figure(figsize=(20, 18))
# Correlate numeric columns only: `diagnosis` still holds string labels at this
# point, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions silently dropped such columns, which this reproduces).
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')

### Plotting variation between Mean Radius and Mean Texture 

In [6]:
# Mean radius against mean texture.
sns.scatterplot(data=df, x='radius_mean', y='texture_mean', color='orange')

### Plotting variation between Mean Concavity and Mean Concave Points

In [7]:
# Mean concavity against mean concave points (seaborn's default colour).
sns.scatterplot(data=df, x='concavity_mean', y='concave points_mean')

### Plotting variation of Mean Compactness and Mean Concavity

In [8]:
# Mean compactness against mean concavity.
sns.scatterplot(data=df, x='compactness_mean', y='concavity_mean', color='green')

### Plotting variation between Mean Smoothness and Mean Symmetry

In [9]:
# Mean smoothness against mean symmetry.
sns.scatterplot(data=df, x='smoothness_mean', y='symmetry_mean', color='red')

### Plotting Variation of Mean Compactness and Mean Fractal Dimension

In [10]:
# Mean compactness against mean fractal dimension.
sns.scatterplot(data=df, x='compactness_mean', y='fractal_dimension_mean', color='purple')

### Plotting variation between Mean Area and Mean Concave Points

In [11]:
# Mean area against mean concave points.
sns.scatterplot(data=df, x='area_mean', y='concave points_mean', color='dodgerblue')

### Plotting variation of Mean smoothness and Mean Fractal Dimension

In [12]:
# Mean smoothness against mean fractal dimension.
sns.scatterplot(data=df, x='smoothness_mean', y='fractal_dimension_mean', color='brown')

### Plotting variation between Mean Symmetry and Mean Concave Points

In [13]:
# Mean symmetry against mean concave points.
sns.scatterplot(data=df, x='symmetry_mean', y='concave points_mean', color='pink')

## Feature Engineering

### Checking Null Values

In [14]:
# Count the missing values in each column.
df.isnull().sum()

### Encoding Categorical Variable Diagnosis

In [16]:
# List the distinct values present in the diagnosis column.
df['diagnosis'].unique()

In [18]:
def change(col):
    """Encode one diagnosis label as an integer.

    Returns 1 when ``col`` equals 'M' and 0 for every other value.
    """
    return 1 if col == 'M' else 0
# Replace each diagnosis label with its integer code from change().
# NOTE: not idempotent. change() returns 0 for anything other than 'M', so
# re-running this on already-encoded values turns every row into 0.
df['diagnosis'] = df['diagnosis'].apply(change)

In [19]:
# Preview the frame after encoding the diagnosis column.
df.head()

### Dropping id Column

In [21]:
# Rebind instead of mutating in place, and tolerate a missing column, so that
# re-running this cell does not raise KeyError once `id` is already gone.
df = df.drop(columns=['id'], errors='ignore')

In [22]:
# Preview the frame after removing the id column.
df.head()

## Dividing Dataset into X and Y

In [23]:
# Select the target and the features by column name rather than by position, so
# the result does not depend on `id` having been dropped or on `diagnosis` being
# the first column. `id` is excluded here as well so it can never leak into X.
X = df.drop(columns=['diagnosis', 'id'], errors='ignore').values
y = df['diagnosis'].values

## Dividing Dataset into Training and Test set

In [24]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [25]:
# Learn the scaling statistics from the training split only, then apply the same
# transform to both splits so no test-set information reaches the scaler.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Training Logistic Regression

In [26]:
# Logistic regression with a fixed seed, fitted on the training split.
classifier1 = LogisticRegression(random_state=0)
classifier1.fit(X=X_train, y=y_train)

In [27]:
# Score the logistic-regression model on the test split:
# confusion matrix first, then overall accuracy.
y_pred1 = classifier1.predict(X_test)
cm1 = confusion_matrix(y_true=y_test, y_pred=y_pred1)
print(cm1, accuracy_score(y_true=y_test, y_pred=y_pred1), sep='\n')

## Training KNN

In [28]:
# k-nearest neighbours with k=5; Minkowski metric with p=2 is Euclidean distance.
classifier2 = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
classifier2.fit(X=X_train, y=y_train)

In [29]:
# Score the KNN model on the test split:
# confusion matrix first, then overall accuracy.
y_pred2 = classifier2.predict(X_test)
cm2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)
print(cm2, accuracy_score(y_true=y_test, y_pred=y_pred2), sep='\n')

## Training Naive Bayes

In [30]:
# Gaussian naive Bayes with the library's default settings.
classifier3 = GaussianNB()
classifier3.fit(X=X_train, y=y_train)

In [31]:
# Score the naive Bayes model on the test split:
# confusion matrix first, then overall accuracy.
y_pred3 = classifier3.predict(X_test)
cm3 = confusion_matrix(y_true=y_test, y_pred=y_pred3)
print(cm3, accuracy_score(y_true=y_test, y_pred=y_pred3), sep='\n')

## Training Kernel SVM

In [32]:
# Support vector classifier with an RBF kernel and a fixed seed.
classifier4 = SVC(random_state=0, kernel='rbf')
classifier4.fit(X=X_train, y=y_train)

In [33]:
# Score the kernel SVM on the test split:
# confusion matrix first, then overall accuracy.
y_pred4 = classifier4.predict(X_test)
cm4 = confusion_matrix(y_true=y_test, y_pred=y_pred4)
print(cm4, accuracy_score(y_true=y_test, y_pred=y_pred4), sep='\n')

## Training Decision Tree 

In [34]:
# Decision tree split on entropy, with a fixed seed.
classifier5 = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier5.fit(X=X_train, y=y_train)

In [35]:
# Score the decision tree on the test split:
# confusion matrix first, then overall accuracy.
y_pred5 = classifier5.predict(X_test)
cm5 = confusion_matrix(y_true=y_test, y_pred=y_pred5)
print(cm5, accuracy_score(y_true=y_test, y_pred=y_pred5), sep='\n')

## Training Random Forest 

In [36]:
# Random forest of 10 entropy-split trees, with a fixed seed.
classifier6 = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier6.fit(X=X_train, y=y_train)

In [37]:
# Score the random forest on the test split:
# confusion matrix first, then overall accuracy.
y_pred6 = classifier6.predict(X_test)
cm6 = confusion_matrix(y_true=y_test, y_pred=y_pred6)
print(cm6, accuracy_score(y_true=y_test, y_pred=y_pred6), sep='\n')

## Training XGBoost Classifier

In [38]:
# XGBoost classifier with the library's default hyper-parameters.
classifier7 = XGBClassifier()
classifier7.fit(X=X_train, y=y_train)

In [39]:
# Score the XGBoost model on the test split:
# confusion matrix first, then overall accuracy.
y_pred7 = classifier7.predict(X_test)
cm7 = confusion_matrix(y_true=y_test, y_pred=y_pred7)
print(cm7, accuracy_score(y_true=y_test, y_pred=y_pred7), sep='\n')


## Performing K-Fold Cross Validation

In [41]:
# Cross-validate the XGBoost model on the training split with 50 folds and report
# the mean and spread of the per-fold accuracies.
# NOTE(review): 50 folds presumably leaves very few samples per validation fold,
# which inflates the reported spread; confirm 50 is intended (10 is more common).
accuracies = cross_val_score(classifier7, X_train, y_train, cv=50)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))


## Performing Grid Search CV

In [44]:
# Candidate SVM settings: a linear kernel over four C values, and an RBF kernel
# over the same C values crossed with nine gamma values.
parameters = [
    {'C': [0.25, 0.5, 0.75, 1.0], 'kernel': ['linear']},
    {'C': [0.25, 0.5, 0.75, 1.0], 'kernel': ['rbf'],
     'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
]
# Exhaustive search scored by 10-fold cross-validated accuracy, using all CPU cores.
grid_search = GridSearchCV(estimator=classifier4,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
# The original labelled this line "Standard Deviation", but the value printed is
# the winning parameter combination.
print("Best Parameters:", best_parameters)

# Therefore, we can see that we get the best accuracy on Kernel SVM & XGBoost

## Our best accuracy is 98.24%

### Please upvote my notebook if you like it. It would mean a lot!!!