In [21]:
pip install plotly==5.7.0

Note: you may need to restart the kernel to use updated packages.


In [22]:
import pandas as pd
import numpy as np
import plotly.express as px #provides functions to visualize a variety of types of data
from sklearn.model_selection import train_test_split# aids in data prediction
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC#SVC is a C-support vector classification whose implementation is based on libsvm. 


In [23]:
data= pd.read_csv("BRCA.csv")
data.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [24]:
#testing data set for null values
print(data.isnull().sum())

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64


In [25]:
#since the data set has some null values, we need to drop them
data = data.dropna()

In [26]:
#getting insights about the columns of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 333
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          317 non-null    object 
 1   Age                 317 non-null    float64
 2   Gender              317 non-null    object 
 3   Protein1            317 non-null    float64
 4   Protein2            317 non-null    float64
 5   Protein3            317 non-null    float64
 6   Protein4            317 non-null    float64
 7   Tumour_Stage        317 non-null    object 
 8   Histology           317 non-null    object 
 9   ER status           317 non-null    object 
 10  PR status           317 non-null    object 
 11  HER2 status         317 non-null    object 
 12  Surgery_type        317 non-null    object 
 13  Date_of_Surgery     317 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      317 non-null    object 
dtypes: float

In [27]:
#Breast cancer is common in females. Let's see how many females have it
print(data["Gender"].value_counts())

FEMALE    313
MALE        4
Name: Gender, dtype: int64


In [28]:
#Age groups of patient
data.loc[data['Age'].between(20,40), 'age_group'] ='Below 40'
data.loc[data['Age'].between(39,50), 'age_group'] = '40-49'
data.loc[data['Age'].between(49,60), 'age_group'] = '50-59'
data.loc[data['Age'].between(59,70), 'age_group'] = '60-69'
data.loc[data['Age'].between(69,80), 'age_group'] = '70-79'
data.loc[data['Age'].between(79,90), 'age_group'] = '80 and above'
print(data['age_group'].value_counts().sort_values(ascending=False))#it is common among the 50-60 age group


50-59           88
60-69           87
40-49           61
70-79           43
80 and above    26
Below 40        12
Name: age_group, dtype: int64


In [29]:
#Age of patient
Age=data['age_group'].value_counts()
Age2= Age.index 
number_of_patients= Age.values 
figure = px.pie(data, 
                values=number_of_patients, 
               names=Age2,hole = 0.5, 
             title="Age of patients") #piechart showing age groups of patients
figure.show()



In [30]:
#Receptor Status
#When you are PR positive, your cancer cells grow in response to the hormone progesterone.All the patients are PR positive
print(data['PR status'].value_counts())
#When you are ER positive, your cancer cells grow in response to the hormone estrogen.All the patients are ER positive
print(data['ER status'].value_counts())
#When you are HER2 positive, your cancer cells grow in response to the protein human epidermal growth factor 2.                                                                                                                                                                                                                                                                                                                                                                      
print(data['HER2 status'].value_counts())


Positive    317
Name: PR status, dtype: int64
Positive    317
Name: ER status, dtype: int64
Negative    288
Positive     29
Name: HER2 status, dtype: int64


In [31]:
#Which patients are HER2 positive? They grow aggresively
data.loc[data['HER2 status']== 'Positive', 'Prognosis'] ='Poor Prognosis'
data.loc[data['HER2 status']== 'Negative', 'Prognosis'] ='Good Prognosis'
pd.options.display.max_rows = 400
print(data['Prognosis']) 

0      Good Prognosis
1      Good Prognosis
2      Good Prognosis
3      Good Prognosis
4      Good Prognosis
5      Good Prognosis
6      Good Prognosis
8      Good Prognosis
9      Poor Prognosis
10     Good Prognosis
11     Good Prognosis
12     Good Prognosis
13     Good Prognosis
14     Good Prognosis
15     Good Prognosis
16     Good Prognosis
17     Good Prognosis
18     Good Prognosis
19     Good Prognosis
20     Good Prognosis
21     Good Prognosis
23     Good Prognosis
24     Good Prognosis
25     Poor Prognosis
26     Good Prognosis
27     Good Prognosis
28     Good Prognosis
29     Poor Prognosis
30     Good Prognosis
31     Good Prognosis
32     Good Prognosis
33     Good Prognosis
34     Good Prognosis
35     Good Prognosis
36     Good Prognosis
37     Good Prognosis
38     Good Prognosis
39     Good Prognosis
40     Good Prognosis
41     Good Prognosis
42     Good Prognosis
43     Good Prognosis
44     Good Prognosis
45     Good Prognosis
46     Good Prognosis
47     Goo

In [32]:
#Tumour Stage
stage=data['Tumour_Stage'].value_counts() #vertical rep of the stages and their values
which_stage= stage.index #returns the indexes of the stages(II,III,I)
number_of_patients= stage.values #returns the values of the stages (180,77,60)
figure = px.pie(data, 
                values=number_of_patients, 
             names=which_stage,hole = 0.5, 
             title="Tumour Stages of Patients") #piechart showing the percentages of the patients with each tumor stage
figure.show()#most patients have stage 2 Ca



In [33]:
#Histology
histology=data['Histology'].value_counts() 
which_histology= histology.index #returns the indexes of the histology
number_of_patients= histology.values #returns the values of the histology
figure = px.pie(data, 
                values=number_of_patients, 
             names=which_histology,hole = 0.5, 
             title="Histology of Patients") #piechart showing the percentages of the patients with each tumor histology
figure.show()#Most patients have infiltrating ductal carcinoma



In [34]:
#Surgery Type
surgery=data['Surgery_type'].value_counts() 
surgery_type= surgery.index 
number_of_patients= surgery.values 
figure = px.pie(data, 
                values=number_of_patients, 
             names=surgery_type,hole = 0.5, 
             title="Surgery Performed") #piechart showing the types of surgeries performed
figure.show()#Most patients have other surgeries



In [35]:
#Categorical Variable Transformation: is turning a categorical variable to a numeric variable. Categorical variable transformation is mandatory for most of the machine learning models because they can handle only numeric values
data["Tumour_Stage"] = data["Tumour_Stage"].map({"I": 1, "II": 2, "III": 3})
data["Histology"] = data["Histology"].map({"Infiltrating Ductal Carcinoma": 1, 
                                           "Infiltrating Lobular Carcinoma": 2, "Mucinous Carcinoma": 3})
data["ER status"] = data["ER status"].map({"Positive": 1})
data["PR status"] = data["PR status"].map({"Positive": 1})
data["HER2 status"] = data["HER2 status"].map({"Positive": 1, "Negative": 2})
data["Gender"] = data["Gender"].map({"MALE": 1, "FEMALE": 2})
data["Surgery_type"] = data["Surgery_type"].map({"Other": 1, "Modified Radical Mastectomy": 2, 
                                                 "Lumpectomy": 3, "Simple Mastectomy": 4})
print(data.head())



     Patient_ID   Age  Gender  Protein1  Protein2  Protein3  Protein4  \
0  TCGA-D8-A1XD  36.0       2  0.080353   0.42638   0.54715  0.273680   
1  TCGA-EW-A1OX  43.0       2 -0.420320   0.57807   0.61447 -0.031505   
2  TCGA-A8-A079  69.0       2  0.213980   1.31140  -0.32747 -0.234260   
3  TCGA-D8-A1XR  56.0       2  0.345090  -0.21147  -0.19304  0.124270   
4  TCGA-BH-A0BF  56.0       2  0.221550   1.90680   0.52045 -0.311990   

   Tumour_Stage  Histology  ER status  PR status  HER2 status  Surgery_type  \
0             3          1          1          1            2             2   
1             2          3          1          1            2             3   
2             3          1          1          1            2             1   
3             2          1          1          1            2             2   
4             2          1          1          1            2             1   

  Date_of_Surgery Date_of_Last_Visit Patient_Status age_group       Prognosis  
0     

In [38]:
#splitting data
x = np.array(data[['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4', 
                   'Tumour_Stage', 'Histology', 'ER status', 'PR status', 
                   'HER2 status', 'Surgery_type']])
y = np.array(data[['Patient_Status']])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.60, random_state=42)

In [53]:
#Now here’s how we can train a machine learning model:
model = SVC()
model.fit(xtrain,ytrain.ravel())

SVC()

In [54]:
#predicting whether patient will be dead or alive
features = np.array([[69.0, 2, 0.21398,1.3114, -0.32747, -0.23426, 3, 1, 1, 1, 2, 4]])
print(model.predict(features))

['Alive']


In [61]:
#Testing the accuracy of the model
y_pred=model.predict(xtest)
print(accuracy_score(ytest,y_pred)*100)

81.67539267015707
