In [83]:
import pandas as pd
import numpy as np

In [84]:
df = pd.read_csv('/kaggle/input/breast-cancer/Breast_Cancer.csv')

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

In [86]:
df.isnull().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

In [87]:
unique_values_dict = {}
df_columns = df.columns
# print(df_columns)
for column in df_columns:
    unique_values_dict[column] = df[column].unique()
print(unique_values_dict)

{'Age': array([68, 50, 58, 47, 51, 40, 69, 46, 65, 48, 62, 61, 56, 43, 60, 57, 55,
       63, 66, 53, 59, 54, 49, 64, 42, 37, 67, 31, 52, 33, 45, 38, 39, 36,
       41, 44, 32, 34, 35, 30]), 'Race': array(['White', 'Black', 'Other'], dtype=object), 'Marital Status': array(['Married', 'Divorced', 'Single ', 'Widowed', 'Separated'],
      dtype=object), 'T Stage ': array(['T1', 'T2', 'T3', 'T4'], dtype=object), 'N Stage': array(['N1', 'N2', 'N3'], dtype=object), '6th Stage': array(['IIA', 'IIIA', 'IIIC', 'IIB', 'IIIB'], dtype=object), 'differentiate': array(['Poorly differentiated', 'Moderately differentiated',
       'Well differentiated', 'Undifferentiated'], dtype=object), 'Grade': array(['3', '2', '1', ' anaplastic; Grade IV'], dtype=object), 'A Stage': array(['Regional', 'Distant'], dtype=object), 'Tumor Size': array([  4,  35,  63,  18,  41,  20,   8,  30, 103,  32,  13,  59,  15,
        19,  46,  24,  25,  29,  40,  70,  22,  50,  17,  21,  10,  27,
        23,   5,  51,   9,  55

In [88]:
df.dtypes


Age                        int64
Race                      object
Marital Status            object
T Stage                   object
N Stage                   object
6th Stage                 object
differentiate             object
Grade                     object
A Stage                   object
Tumor Size                 int64
Estrogen Status           object
Progesterone Status       object
Regional Node Examined     int64
Reginol Node Positive      int64
Survival Months            int64
Status                    object
dtype: object

In [89]:
df['Status'] = df['Status'].str.strip()
df['Status'] = df['Status'].map({'Alive': 0, 'Dead': 1})
df['Status'].unique()

array([0, 1])

In [90]:
df['Grade'] = df['Grade'].str.replace(' anaplastic; Grade IV', '4')
df['Grade'] = pd.to_numeric(df['Grade'],)
df['Grade'].unique()

array([3, 2, 1, 4])

In [91]:
columns_to_encode = ['Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
       'differentiate', 'A Stage', 'Estrogen Status',
       'Progesterone Status']

unique_values_of_columns = {}
for column in columns_to_encode:
    unique_values_of_columns[column] = df[column].unique()
    
columns_values_list = []
encodings = {}
for column_name, unique_values in unique_values_of_columns.items(): 
    columns_values_list.append((column_name, unique_values))
for column_name, values in columns_values_list:
    encodings[column_name] = {value: idx for idx, value in enumerate(values, start=1)}

for encoding in encodings:
    df[encoding] = df[encoding].map(encodings[encoding]) 
df

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,1,1,1,1,1,1,3,1,4,1,1,24,1,60,0
1,50,1,1,2,2,2,2,2,1,35,1,1,14,5,62,0
2,58,1,2,3,3,3,2,2,1,63,1,1,14,7,75,0
3,58,1,1,1,1,1,1,3,1,18,1,1,2,1,84,0
4,47,1,1,2,1,4,1,3,1,41,1,1,3,1,50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,62,3,1,1,1,1,2,2,1,9,1,1,1,1,49,0
4020,56,1,2,2,2,2,2,2,1,46,1,1,14,8,69,0
4021,68,1,1,2,1,4,2,2,1,22,1,2,11,3,69,0
4022,58,2,2,2,1,4,2,2,1,44,1,1,11,1,72,0


## Training data

In [92]:
from sklearn.model_selection import train_test_split
X = df[['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
       'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Reginol Node Positive']]
y = df['Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

## Standardizing data

In [93]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## Running Logistic Regression on scaled data

In [94]:
from sklearn.linear_model import LogisticRegression 
model = LogisticRegression()
model.fit(X_train, y_train)

## Evaluating the model

In [95]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score 
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy Score: {accuracy}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")

Accuracy Score: 0.8546583850931677
Precision: 0.5135135135135135
F1 Score: 0.24516129032258063
Recall: 0.16101694915254236
