# Import  Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree

# Load the Datasets

In [2]:
data=pd.read_csv("breast_cancer_survival.csv")

In [3]:
data.head()

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,42,FEMALE,0.95256,2.15,0.007972,-0.04834,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,20-May-18,26-Aug-18,Alive
1,54,FEMALE,0.0,1.3802,-0.49803,-0.50732,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,26-Apr-18,25-Jan-19,Dead
2,63,FEMALE,-0.52303,1.764,-0.37019,0.010815,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,24-Aug-18,08-Apr-20,Alive
3,78,FEMALE,-0.87618,0.12943,-0.37038,0.13219,I,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,16-Nov-18,28-Jul-20,Alive
4,42,FEMALE,0.22611,1.7491,-0.54397,-0.39021,II,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,12-Dec-18,05-Jan-19,Alive


# Check the null value present or not

In [4]:
data.isnull().sum()

Age                    0
Gender                 0
Protein1               0
Protein2               0
Protein3               0
Protein4               0
Tumour_Stage           0
Histology              0
ER status              0
PR status              0
HER2 status            0
Surgery_type           0
Date_of_Surgery        0
Date_of_Last_Visit    17
Patient_Status        13
dtype: int64

In [5]:
data=data.dropna()

In [6]:
data.isnull().sum()

Age                   0
Gender                0
Protein1              0
Protein2              0
Protein3              0
Protein4              0
Tumour_Stage          0
Histology             0
ER status             0
PR status             0
HER2 status           0
Surgery_type          0
Date_of_Surgery       0
Date_of_Last_Visit    0
Patient_Status        0
dtype: int64

# Converting Categorical to Numerical values

In [7]:
data["Tumour_Stage"]=data["Tumour_Stage"].replace("I",1)
data["Tumour_Stage"]=data["Tumour_Stage"].replace("II",2)
data["Tumour_Stage"]=data["Tumour_Stage"].replace("III",3)

In [8]:
data

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,42,FEMALE,0.952560,2.15000,0.007972,-0.048340,2,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,20-May-18,26-Aug-18,Alive
1,54,FEMALE,0.000000,1.38020,-0.498030,-0.507320,2,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,26-Apr-18,25-Jan-19,Dead
2,63,FEMALE,-0.523030,1.76400,-0.370190,0.010815,2,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,24-Aug-18,08-Apr-20,Alive
3,78,FEMALE,-0.876180,0.12943,-0.370380,0.132190,1,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,16-Nov-18,28-Jul-20,Alive
4,42,FEMALE,0.226110,1.74910,-0.543970,-0.390210,2,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,12-Dec-18,05-Jan-19,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,59,FEMALE,0.024598,1.40050,0.024751,0.280320,2,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,15-Jan-19,27-Mar-20,Alive
330,41,FEMALE,0.100120,-0.46547,0.472370,-0.523870,1,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Modified Radical Mastectomy,25-Jul-18,23-Apr-19,Alive
331,54,FEMALE,0.753820,1.64250,-0.332850,0.857860,2,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Simple Mastectomy,26-Mar-19,11-Oct-19,Dead
332,74,FEMALE,0.972510,1.42680,-0.366570,-0.107820,2,Infiltrating Lobular Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Nov-18,05-Dec-18,Alive


In [9]:
data["ER status"]=data["ER status"].replace("Positive",1)
data["ER status"]=data["ER status"].replace("Negative",2)


In [10]:
data["PR status"]=data["PR status"].replace("Positive",1)
data["PR status"]=data["PR status"].replace("Negative",2)


In [11]:
data["HER2 status"]=data["HER2 status"].replace("Positive",1)
data["HER2 status"]=data["HER2 status"].replace("Negative",2)


In [12]:
data["Patient_Status"]=data["Patient_Status"].replace("Alive",0)
data["Patient_Status"]=data["Patient_Status"].replace("Dead",1)


In [13]:
data.head()

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,42,FEMALE,0.95256,2.15,0.007972,-0.04834,2,Infiltrating Ductal Carcinoma,1,1,2,Other,20-May-18,26-Aug-18,0
1,54,FEMALE,0.0,1.3802,-0.49803,-0.50732,2,Infiltrating Ductal Carcinoma,1,1,2,Other,26-Apr-18,25-Jan-19,1
2,63,FEMALE,-0.52303,1.764,-0.37019,0.010815,2,Infiltrating Ductal Carcinoma,1,1,2,Lumpectomy,24-Aug-18,08-Apr-20,0
3,78,FEMALE,-0.87618,0.12943,-0.37038,0.13219,1,Infiltrating Ductal Carcinoma,1,1,2,Other,16-Nov-18,28-Jul-20,0
4,42,FEMALE,0.22611,1.7491,-0.54397,-0.39021,2,Infiltrating Ductal Carcinoma,1,1,1,Lumpectomy,12-Dec-18,05-Jan-19,0


In [14]:
data["Patient_Status"].value_counts()

0    255
1     62
Name: Patient_Status, dtype: int64

# Feature Selection

In [15]:
def correlation(data,threshold):
    col_corr=set()
    cor_matrix=data.corr()
    for i in range(len(cor_matrix.columns)):
        for j in range(i):
            if abs(cor_matrix.iloc[i,j])>threshold:
                colname=cor_matrix.columns[i]
                col_corr.add(colname)
    return col_corr
                

In [16]:
x=data[["Age","Protein1","Protein2","Protein3","Protein4","Tumour_Stage","ER status","PR status","HER2 status"]]

In [17]:
y=data[["Patient_Status"]]

In [18]:
y

Unnamed: 0,Patient_Status
0,0
1,1
2,0
3,0
4,0
...,...
329,0
330,0
331,1
332,0


In [19]:
x

Unnamed: 0,Age,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,ER status,PR status,HER2 status
0,42,0.952560,2.15000,0.007972,-0.048340,2,1,1,2
1,54,0.000000,1.38020,-0.498030,-0.507320,2,1,1,2
2,63,-0.523030,1.76400,-0.370190,0.010815,2,1,1,2
3,78,-0.876180,0.12943,-0.370380,0.132190,1,1,1,2
4,42,0.226110,1.74910,-0.543970,-0.390210,2,1,1,1
...,...,...,...,...,...,...,...,...,...
329,59,0.024598,1.40050,0.024751,0.280320,2,1,1,1
330,41,0.100120,-0.46547,0.472370,-0.523870,1,1,1,1
331,54,0.753820,1.64250,-0.332850,0.857860,2,1,1,2
332,74,0.972510,1.42680,-0.366570,-0.107820,2,1,1,2


# Splitting the data to train and test

In [25]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

# Various Classification Algorithms

In [26]:
clf1 = LogisticRegression(max_iter = 500)
clf2 = SVC()
clf3 = tree.DecisionTreeClassifier(criterion='gini')
clf4 = KNeighborsClassifier(n_neighbors=19)
clf5 = GaussianNB()
clf6 = RandomForestClassifier(criterion='entropy', n_estimators=20)
clf7 = AdaBoostClassifier(n_estimators=600,learning_rate=0.001, random_state=1)

In [27]:
models = [clf1,clf2,clf3,clf4,clf5,clf6,clf7]
accuracy ={}
for i in models:
    i.fit(x_train,y_train)
    accuracy[i] = i.score(x_test,y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  i.fit(x_train,y_train)
  y = column_or_1d(y, warn=True)


In [28]:
accuracy


{LogisticRegression(max_iter=500): 0.796875,
 SVC(): 0.796875,
 DecisionTreeClassifier(): 0.609375,
 KNeighborsClassifier(n_neighbors=19): 0.796875,
 GaussianNB(): 0.796875,
 RandomForestClassifier(criterion='entropy', n_estimators=20): 0.78125,
 AdaBoostClassifier(learning_rate=0.001, n_estimators=600, random_state=1): 0.796875}

# Prediction of testing data

In [29]:
clf6_pre=clf6.predict(x_test)

In [30]:
clf6_pre

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

# Checking the accuracy Score

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
accuracy_score(y_test,clf6_pre)

0.78125

# Plotting the Confusion Matrix

In [35]:
from sklearn.metrics import confusion_matrix

In [44]:
confusion_matrix(y_test,clf6_pre)

array([[50,  1],
       [13,  0]])