In [None]:
# Setup
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
print("Setup Complete")

Setup Complete


In [None]:
# Load the risk-factor CSV (path is relative to the working directory).
# Alternative location used previously: '/content/bcsc_risk_factors_summarized1_092020.csv'
cancer_file_path = 'bcsc_risk_factors_summarized1_092020 (1).csv'
cancer_data = pd.read_csv(filepath_or_buffer=cancer_file_path)
# Display the column index so the available fields are visible.
cancer_data.columns

Index(['Year', 'Age_group', 'Race', 'family_history', 'age_menarche',
       'age_first_birth', 'BIRADS_breast_density', 'current_hrt', 'menopaus',
       'bmi_group', 'biophx', 'breast_cancer', 'count'],
      dtype='object')

In [None]:
# Discard the columns this analysis does not use.
# NOTE(review): the file name says "summarized" and the data has a `count` column -- if each
# row is an aggregated group, dropping `count` weights every group equally; confirm intent.
cancer_data = cancer_data.drop(['Year', 'BIRADS_breast_density', 'count'], axis=1)
cancer_data.columns


Index(['Age_group', 'Race', 'family_history', 'age_menarche',
       'age_first_birth', 'current_hrt', 'menopaus', 'bmi_group', 'biophx',
       'breast_cancer'],
      dtype='object')

In [None]:
# Keep only rows where none of the listed columns holds the "unknown" code.
# (Per the dataset documentation cited by the author, unknown data is replaced with 9.)
# Age_group is left out of the check, exactly as before.
cancer_data = cancer_data[
    cancer_data[[
        'family_history', 'age_menarche', 'age_first_birth', 'menopaus',
        'bmi_group', 'breast_cancer', 'Race', 'current_hrt', 'biophx',
    ]].ne(9).all(axis=1)
]
cancer_data.head(10)

Unnamed: 0,Age_group,Race,family_history,age_menarche,age_first_birth,current_hrt,menopaus,bmi_group,biophx,breast_cancer
1411,7,1,1,0,0,0,2,3,0,0
1412,7,1,1,0,0,0,2,4,1,0
1413,7,1,1,0,0,0,3,2,1,0
1414,7,1,1,0,0,0,3,4,0,0
1415,7,1,1,0,0,0,1,3,0,0
1416,7,1,1,0,0,0,1,4,1,1
1417,7,1,1,0,0,0,2,1,0,0
1418,7,1,1,0,0,0,2,1,1,0
1419,7,1,1,0,0,0,2,2,0,0
1420,7,1,1,0,0,0,2,2,1,1


In [None]:
# Target: the breast_cancer column.
y = cancer_data['breast_cancer']

# Features: every remaining column.
X = cancer_data.drop('breast_cancer', axis=1)

# Report both shapes, one per line.
print(X.shape, y.shape, sep='\n')

(142854, 9)
(142854,)


In [None]:
# Preview the first ten target values.
y.head(10)

Unnamed: 0,breast_cancer
1411,0
1412,0
1413,0
1414,0
1415,0
1416,1
1417,0
1418,0
1419,0
1420,1


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
X.describe()

Unnamed: 0,Age_group,Race,family_history,age_menarche,age_first_birth,current_hrt,menopaus,bmi_group,biophx
count,142854.0,142854.0,142854.0,142854.0,142854.0,142854.0,142854.0,142854.0,142854.0
mean,7.571612,2.328132,0.317163,0.951734,1.960652,0.040433,1.874326,2.287188,0.411028
std,2.61683,1.572885,0.465373,0.756891,1.384847,0.196973,0.590045,1.084883,0.492022
min,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,6.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0
50%,7.0,2.0,0.0,1.0,2.0,0.0,2.0,2.0,0.0
75%,9.0,3.0,1.0,2.0,3.0,0.0,2.0,3.0,1.0
max,13.0,6.0,1.0,2.0,4.0,1.0,3.0,4.0,1.0


In [None]:
def mse(preds, val_y):
    '''
    Return the mean squared error between the model's predictions and the true values.

    Parameters:
        preds - predictions of the model
        val_y - true target values from the dataset

    Returns:
        The result of sklearn's mean_squared_error for these inputs.
    '''
    # sklearn's signature is (y_true, y_pred). Plain MSE is symmetric, so the value is
    # the same as before, but passing the arguments in the documented order keeps the
    # call correct if weighting or multi-output options are ever added.
    return mean_squared_error(val_y, preds)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_x, val_x, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def evaluate_model(model, val_x, val_y):
    '''
    Print the mean squared error of a fitted model on the validation data.

    Parameters:
        model - fitted model exposing a predict() method
        val_x - validation features passed to model.predict
        val_y - true target values corresponding to val_x

    Returns:
        None. The score is only printed, preceded by the model's class name
        and followed by a separator line.
    '''
    preds = model.predict(val_x)
    # Argument order follows sklearn's (y_true, y_pred) signature.
    mse_score = mean_squared_error(val_y, preds)
    print(f"{model.__class__.__name__} Performance:")
    print(f"Mean Squared Error: {mse_score}")
    print("-" * 30)

In [None]:
# Decision tree regressor with a fixed seed, fitted on the training split
# (fit() returns the estimator itself, so the chained call yields the fitted model).
# NOTE(review): the previewed breast_cancer values are all 0/1 -- a classifier may be a
# better fit than a regressor; confirm the modelling intent.
tree_model = DecisionTreeRegressor(random_state=42).fit(train_x, train_y)
evaluate_model(tree_model, val_x, val_y)

DecisionTreeRegressor Performance:
Mean Squared Error: 0.13821184980359558
------------------------------


In [None]:
# Gradient-boosted regressor: 300 estimators, learning rate 0.065, seeded for repeatability.
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.065,
    random_state=42,
)
xgb_model.fit(train_x, train_y)
evaluate_model(xgb_model, val_x, val_y)


XGBRegressor Performance:
Mean Squared Error: 0.12828059494495392
------------------------------


In [None]:
# NOTE(review): repeats the earlier X.describe() cell; X is not reassigned in between and
# the recorded output is identical, so this cell is a candidate for removal.
X.describe()

Unnamed: 0,Age_group,Race,family_history,age_menarche,age_first_birth,current_hrt,menopaus,bmi_group,biophx
count,142854.0,142854.0,142854.0,142854.0,142854.0,142854.0,142854.0,142854.0,142854.0
mean,7.571612,2.328132,0.317163,0.951734,1.960652,0.040433,1.874326,2.287188,0.411028
std,2.61683,1.572885,0.465373,0.756891,1.384847,0.196973,0.590045,1.084883,0.492022
min,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,6.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0
50%,7.0,2.0,0.0,1.0,2.0,0.0,2.0,2.0,0.0
75%,9.0,3.0,1.0,2.0,3.0,0.0,2.0,3.0,1.0
max,13.0,6.0,1.0,2.0,4.0,1.0,3.0,4.0,1.0


In [None]:
# Select the XGBoost model as the final model: in the recorded outputs it had the lower
# validation MSE (about 0.1283 vs about 0.1382 for the decision tree).
final_model = xgb_model