# Brief
Our project is on the Haberman Survival Dataset. It contains cases from a study conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast
cancer.

Our primary task is to determine the survival status of patients within 5 years of their surgery.

In [None]:
#To perform certain functions and process our data, we need to import the following libraries:
#Pandas: This is a data analysis library; it can be used to read and edit files, as well as manipulate and interpret data.
#Numpy: This is the Numerical Python library. It is also a data analysis library, but processes numerical data.
#Matplotlib: This is a data visualization library. 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#The code below reads our dataset for this task, i.e., it loads the data into the notebook so we can view it, and adds labels to the columns.

In [None]:
# Load the CSV and label its four columns; header=None states explicitly that
# the first line of the file is data, not column names.
Data = pd.read_csv(
    'haberman.csv',
    header=None,
    names=['age', 'operation_year', 'axil_nodes', 'survival_status'],
)
Data

Unnamed: 0,age,operation_year,axil_nodes,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2


In [None]:
# Preview the first five rows of the dataset.
Data.head(n=5)

Unnamed: 0,age,operation_year,axil_nodes,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [None]:
# Preview the last five rows of the dataset.
Data.tail(n=5)

Unnamed: 0,age,operation_year,axil_nodes,survival_status
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2
305,83,58,2,2


In [None]:
# shape returns a (rows, columns) tuple describing the size of the dataset.
Data.shape

(306, 4)

In [None]:
# Print a concise summary of the dataset: index range, column names,
# non-null counts, data types and memory usage.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   age              306 non-null    int64
 1   operation_year   306 non-null    int64
 2   axil_nodes       306 non-null    int64
 3   survival_status  306 non-null    int64
dtypes: int64(4)
memory usage: 9.7 KB


In [None]:
#The summary of our dataset indicates the following:
#The object is a pandas DataFrame (pandas is the library we used to load the dataset).
#The range index gives the total number of entries (rows) in the dataset. We have 306 entries, numbered from 0 to 305.
#"Data columns" gives the total number of columns in our dataset, i.e., 4.
#The non-null count shows how many values in each column are not null; every column has 306, so there are no null values.
#The data type of every column is int64, i.e., the data are integer values.
#Finally, it shows how much memory the dataset uses.

In [None]:
# describe() reports summary statistics for each column: the number of entries,
# the mean, the standard deviation, the minimum and maximum values, and the
# 25th, 50th (median) and 75th percentiles.
Data.describe()

Unnamed: 0,age,operation_year,axil_nodes,survival_status
count,306.0,306.0,306.0,306.0
mean,52.457516,62.852941,4.026144,1.264706
std,10.803452,3.249405,7.189654,0.441899
min,30.0,58.0,0.0,1.0
25%,44.0,60.0,0.0,1.0
50%,52.0,63.0,1.0,1.0
75%,60.75,65.75,4.0,2.0
max,83.0,69.0,52.0,2.0


In [None]:
# Count the missing values in each column (isna is the canonical name for isnull).
Data.isna().sum()

age                0
operation_year     0
axil_nodes         0
survival_status    0
dtype: int64

# Data Cleaning

In [None]:
# Count the rows that exactly repeat an earlier row; the first occurrence
# of each row is not counted as a duplicate.
Data.duplicated(keep='first').sum()

17

In [None]:
# List the duplicate rows. duplicated() already returns a boolean mask, so it
# is used directly as the row filter instead of being compared with True.
Data[Data.duplicated()]

Unnamed: 0,age,operation_year,axil_nodes,survival_status
23,37,63,0,1
27,38,60,0,1
52,41,65,0,1
65,43,64,0,2
78,44,61,0,1
108,48,58,11,2
132,50,61,0,1
179,54,62,0,1
184,55,58,1,1
196,56,60,0,1


In [None]:
# Keep the first occurrence of every row and discard the 17 repeated ones.
# The cleaned result is stored under a new name so the raw data stays available.
Data1 = Data.drop_duplicates(keep='first')
Data1

Unnamed: 0,age,operation_year,axil_nodes,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2


In [None]:
# Confirm the number of rows and columns left after the duplicate rows were dropped.
Data1.shape

(289, 4)

# Manual Encoding 

In [None]:
# Re-encode the target column as binary 0/1: the value 2 becomes 0 and the
# value 1 is left unchanged.
# (Per the Haberman dataset documentation, 1 = survived 5 years or longer and
# 2 = died within 5 years.)
# assign() builds a new DataFrame instead of writing into Data1 in place, which
# avoids the SettingWithCopyWarning raised when a column of a frame derived
# from another frame is overwritten.
Data1 = Data1.assign(survival_status=Data1['survival_status'].replace(2, 0))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Data1['survival_status']=Data1['survival_status'].replace(2,0)


In [None]:
# Display the dataset again to check the re-encoded survival_status column.
Data1

Unnamed: 0,age,operation_year,axil_nodes,survival_status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,0
