# Loading Libraries and Dataset

In [217]:
# Installing and loading libraries
import pandas as pd
import numpy as np

# Train test split
from sklearn.model_selection import train_test_split

# Tree-based feature importance
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination with logistic regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Select k-best
from sklearn.feature_selection import SelectKBest, chi2

In [218]:
# Loading the preprocessed CSV dataset
df = pd.read_csv("Dataset/PreprocessedDataset.csv")

# Preparing Data

In [219]:
# Viewing all columns in the dataframe
df.columns

Index(['id', 'age', 'menarche', 'menopause', 'agefirst', 'breastfeeding',
       'biopsies', 'year', 'imc', 'weight', 'exercise', 'nrelbc_mother',
       'nrelbc_sister', 'nrelbc_daughter', 'nrelbc_cousin', 'nrelbc_aunt',
       'nrelbc_grandma', 'nrelbc_absent', 'rhinitis_allergy',
       'medicines_allergy', 'laryngitis_allergy', 'dermatitis_allergy',
       'other_allergy', 'no_allergy', 'histologicalclass', '1_children',
       '2_children', '3_children', '4_children', '5_children',
       'more_than_5_children', 'hyperplasia', 'is_mixed_race', 'is_white_race',
       'consumed_alcohol', 'consumed_tobacco', 'is_sad', 'is_depressive',
       'cancer', 'birads'],
      dtype='object')

In [220]:
# Selecting non-label features
features = df.drop(columns = ["cancer", "id"]).columns.tolist()

In [221]:
# Train test split
X = df[features]
y = df["cancer"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Feature Selection

## Tree-Based Feature Importance
Using random forest classifier

In [222]:
# Initializing the model
model1 = RandomForestClassifier()

In [223]:
# Model fitting
model1.fit(X_train, y_train)
importances = model1.feature_importances_

In [224]:
# Viewing the score value of each feature
feature_importances = pd.Series(importances, index = X_train.columns)
print(feature_importances.sort_values(ascending = False))

biopsies                0.401438
histologicalclass       0.110461
is_sad                  0.061612
year                    0.057895
consumed_alcohol        0.051698
consumed_tobacco        0.049053
hyperplasia             0.046172
breastfeeding           0.038801
birads                  0.031298
weight                  0.021748
other_allergy           0.016843
nrelbc_mother           0.015829
age                     0.013824
nrelbc_sister           0.011788
is_depressive           0.010499
imc                     0.009259
nrelbc_absent           0.009093
menopause               0.008869
agefirst                0.004302
menarche                0.004111
2_children              0.003334
nrelbc_cousin           0.003310
nrelbc_aunt             0.003146
nrelbc_grandma          0.002847
exercise                0.002405
3_children              0.002373
1_children              0.001250
nrelbc_daughter         0.001021
is_white_race           0.000959
no_allergy              0.000949
is_mixed_r

## Recursive Feature Elimination
Using logistic regression

In [225]:
# Initializing the model
model2 = LogisticRegression(max_iter = 9999)

In [226]:
# Model fitting
rfe = RFE(model2, n_features_to_select = 5)
X_train_rfe = rfe.fit_transform(X_train, y_train)

In [227]:
# Viewing the features
print("Selected features: ", X_train.columns[rfe.get_support()])

Selected features:  Index(['biopsies', 'other_allergy', '2_children', '3_children', 'is_sad'], dtype='object')


## Select K-Best

In [228]:
# Applying select k-best using chi2
select_k_best = SelectKBest(score_func = chi2, k = 5)
X_train_k_best = select_k_best.fit_transform(X_train, y_train)

In [229]:
# Viewing the features
print("Selected features: ", X_train.columns[select_k_best.get_support()])

Selected features:  Index(['age', 'menopause', 'agefirst', 'breastfeeding', 'biopsies'], dtype='object')
