In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [58]:
# Read the cancer dataset from the relative data directory and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/The_Cancer_data_1500_V2.csv')
data.head(n=5)

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
0,58,1,16.085313,0,1,8.146251,4.148219,1,1
1,71,0,30.828784,0,1,9.36163,3.519683,0,0
2,48,1,38.785084,0,2,5.135179,4.728368,0,1
3,34,0,30.040296,0,0,9.502792,2.044636,0,0
4,62,1,35.479721,0,0,5.35689,3.309849,0,1


In [3]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
data.dtypes

Age                   int64
Gender                int64
BMI                 float64
Smoking               int64
GeneticRisk           int64
PhysicalActivity    float64
AlcoholIntake       float64
CancerHistory         int64
Diagnosis             int64
dtype: object

In [4]:
# Report the dataset size as a (rows, columns) tuple.
print((len(data.index), len(data.columns)))

(1500, 9)


In [5]:
# Summarize every column: its dtype and how many non-null values it holds,
# which is a quick way to confirm whether any imputation is needed later.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               1500 non-null   int64  
 1   Gender            1500 non-null   int64  
 2   BMI               1500 non-null   float64
 3   Smoking           1500 non-null   int64  
 4   GeneticRisk       1500 non-null   int64  
 5   PhysicalActivity  1500 non-null   float64
 6   AlcoholIntake     1500 non-null   float64
 7   CancerHistory     1500 non-null   int64  
 8   Diagnosis         1500 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 105.6 KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,50.32,0.490667,27.513321,0.269333,0.508667,4.897929,2.417987,0.144,0.371333
std,17.640968,0.50008,7.230012,0.443761,0.678895,2.866162,1.419318,0.351207,0.483322
min,20.0,0.0,15.000291,0.0,0.0,0.00241,0.001215,0.0,0.0
25%,35.0,0.0,21.483134,0.0,0.0,2.434609,1.210598,0.0,0.0
50%,51.0,0.0,27.598494,0.0,0.0,4.834316,2.382971,0.0,0.0
75%,66.0,1.0,33.850837,1.0,1.0,7.409896,3.585624,0.0,1.0
max,80.0,1.0,39.958688,1.0,2.0,9.994607,4.987115,1.0,1.0


In [7]:
import pandas as pd
from ydata_profiling import ProfileReport

# Read the raw CSV into a separate frame (`df`) that is used only for profiling.
df = pd.read_csv(filepath_or_buffer="../data/The_Cancer_data_1500_V2.csv")

# Build the exploratory profile of the raw data and write it to an HTML file
# in the notebook's working directory.
profile = ProfileReport(
    df,
    explorative=True,
    title="Cancer Prediction Data Report",
)
profile.to_file("cancer_report.html")

  from .autonotebook import tqdm as notebook_tqdm


100%|██████████| 9/9 [00:00<00:00, 1868.93it/s]<00:00, 27.01it/s, Describe variable: Diagnosis]
Summarize dataset: 100%|██████████| 34/34 [00:01<00:00, 19.61it/s, Completed]                                 
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Export report to file: 100%|██████████| 1/1 [00:00<?, ?it/s]


In [8]:
# Partition the column names by dtype: numeric columns versus object (string-like) columns.
num_cols = data.select_dtypes(include=['number']).columns
cat_cols = data.select_dtypes(include=['object']).columns

# Print both indexes, one per line.
print(num_cols, cat_cols, sep='\n')

Index(['Age', 'Gender', 'BMI', 'Smoking', 'GeneticRisk', 'PhysicalActivity',
       'AlcoholIntake', 'CancerHistory', 'Diagnosis'],
      dtype='object')
Index([], dtype='object')


In [13]:
# Bucket Age into labelled bands that are used further down to stratify the splits.
# Intervals are right-closed: (-inf, 18] child, (18, 30] young, (30, 45] middle-aged, (45, inf) aged.
data['AgeCat'] = pd.cut(
    x=data['Age'],
    bins=[-np.inf, 18, 30, 45, np.inf],
    labels=['child', 'young', 'middle-aged', 'aged'],
    right=True,
)
data

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis,AgeCat
0,58,1,16.085313,0,1,8.146251,4.148219,1,1,aged
1,71,0,30.828784,0,1,9.361630,3.519683,0,0,aged
2,48,1,38.785084,0,2,5.135179,4.728368,0,1,aged
3,34,0,30.040296,0,0,9.502792,2.044636,0,0,middle-aged
4,62,1,35.479721,0,0,5.356890,3.309849,0,1,aged
...,...,...,...,...,...,...,...,...,...,...
1495,62,1,25.090025,0,0,9.892167,1.284158,0,1,aged
1496,31,0,33.447125,0,1,1.668297,2.280636,1,1,middle-aged
1497,63,1,32.613861,1,1,0.466848,0.150101,0,1,aged
1498,55,0,25.568216,0,0,7.795317,1.986138,1,1,aged


In [18]:
# Spot-check one record (positional row 56) to confirm the AgeCat label matches its Age.
print(data.iloc[56, :])

Age                        28
Gender                      1
BMI                 16.735596
Smoking                     0
GeneticRisk                 1
PhysicalActivity     1.266371
AlcoholIntake        1.600906
CancerHistory               0
Diagnosis                   0
AgeCat                  young
Name: 56, dtype: object


In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. Stratifying on AgeCat keeps the
# age-band proportions the same in both parts; the fixed seed makes the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['AgeCat'],
    random_state=7,
)

In [20]:
# Verify the stratification: print the percentage of rows per age band,
# first for the training part and then for the test part.
for subset in (train, test):
    print(subset['AgeCat'].value_counts(normalize=True).mul(100))

AgeCat
aged           58.250000
middle-aged    23.916667
young          17.833333
child           0.000000
Name: proportion, dtype: float64
AgeCat
aged           58.333333
middle-aged    24.000000
young          17.666667
child           0.000000
Name: proportion, dtype: float64


In [21]:
# Carve a validation set (20% of the training part) out of `train`,
# again stratified on AgeCat and with the same fixed seed.
train_set, val_set = train_test_split(train, test_size=0.2, random_state=7, stratify=train['AgeCat'])

# AgeCat was only a stratification helper, so remove it before modelling.
# Reassign instead of dropping in place: mutating the frames returned by
# train_test_split in place can raise pandas' SettingWithCopyWarning, and
# `axis=1` is redundant once `columns=` is given.
train_set = train_set.drop(columns=['AgeCat'])
val_set = val_set.drop(columns=['AgeCat'])

In [22]:
# Separate the feature matrix from the Diagnosis target for the training and validation sets.
X_train, y_train = train_set.drop('Diagnosis', axis=1), train_set['Diagnosis']
X_val, y_val = val_set.drop('Diagnosis', axis=1), val_set['Diagnosis']

In [23]:
# Recompute the numeric / object column groups on the feature matrix only,
# so the target column is no longer part of num_cols.
num_cols, cat_cols = (
    X_train.select_dtypes(include=kind).columns for kind in ('number', 'object')
)

In [31]:
from sklearn.impute import SimpleImputer

# Mean imputation for numeric features; most-frequent imputation is prepared for
# categorical features but is not applied in this cell.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Learn the column means on the training features only, then apply the same
# fitted imputer to both splits so no validation statistics leak into training.
num_imputer.fit(X_train[num_cols])
X_train[num_cols] = num_imputer.transform(X_train[num_cols])
X_val[num_cols] = num_imputer.transform(X_val[num_cols])

# Categorical imputation is left disabled; to enable it, mirror the numeric steps:
#X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
#X_val[cat_cols] = cat_imputer.transform(X_val[cat_cols])

In [32]:
# Interquartile range of Age: Q1 is the 25th percentile, Q3 the 75th.
Q1 = data['Age'].quantile(q=0.25)
Q3 = data['Age'].quantile(q=0.75)
IQR = Q3 - Q1

# Fences placed 1.5 * IQR outside the quartiles.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose Age falls strictly outside the fences and print them.
outliers = data.loc[data['Age'].lt(lower_bound) | data['Age'].gt(upper_bound)]
print(outliers)

Empty DataFrame
Columns: [Age, Gender, BMI, Smoking, GeneticRisk, PhysicalActivity, AlcoholIntake, CancerHistory, Diagnosis, AgeCat]
Index: []


In [33]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Fit the scaler on the training features only and reuse it for validation.
# num_cols is every numeric feature column, so all of them are standardized.
scaler = StandardScaler().fit(X_train[num_cols])
encoder = OrdinalEncoder()

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])

# Ordinal-encode the object-typed columns the same way: fit on train, apply to both splits.
encoder.fit(X_train[cat_cols])
X_train[cat_cols] = encoder.transform(X_train[cat_cols])
X_val[cat_cols] = encoder.transform(X_val[cat_cols])

In [54]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with scikit-learn's default settings,
# trained on the preprocessed training features.
log_reg = LogisticRegression()

log_reg.fit(X=X_train, y=y_train)

In [55]:
# Validation accuracy of the logistic-regression baseline.
accuracy_score(y_val, log_reg.predict(X_val))

0.85

In [56]:
# Store the logistic-regression class predictions for the validation features.
pred = log_reg.predict(X_val)

In [48]:
# Build a model using RandomForestClassifier
# The forest is randomized (bootstrap samples and feature subsets), so pin
# random_state to make the reported validation accuracy reproducible on
# re-run. The seed 7 matches the one used for the data splits.
rf = RandomForestClassifier(random_state=7)
rf.fit(X_train, y_train)
print(rf.score(X_val, y_val))

0.9416666666666667


In [41]:
# Build a model using GradientBoostingClassifier
# Pin random_state so the fitted ensemble, and therefore the reported
# validation accuracy, is the same on every re-run. The seed 7 matches the
# one used for the data splits.
gb = GradientBoostingClassifier(random_state=7)
gb.fit(X_train, y_train)
print(gb.score(X_val, y_val))

0.925


In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import numpy as np

# Train the model
# random_state is fixed so the accuracy and the per-class report below are
# reproducible; an unseeded forest gives slightly different numbers each run.
# The seed 7 matches the one used for the data splits.
rf = RandomForestClassifier(random_state=7)
rf.fit(X_train, y_train)

# Make predictions on the validation features
y_pred = rf.predict(X_val)

# Evaluate performance: overall accuracy plus per-class precision / recall / F1
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

# Print results
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9416666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.95       154
           1       0.93      0.91      0.92        86

    accuracy                           0.94       240
   macro avg       0.94      0.93      0.94       240
weighted avg       0.94      0.94      0.94       240



# **Model Comparison and Selection**

**After comparing Logistic Regression, Random Forest Classifier, and Gradient Boosting Classifier on the validation set, the Random Forest Classifier achieved the highest accuracy (about 0.94 in the recorded run, versus 0.925 for Gradient Boosting and 0.85 for Logistic Regression). Therefore, the Random Forest Classifier is selected as the best model.**