In [321]:
import pandas as pd

# Thyroid Cancer Prediction

In [322]:
df = pd.read_csv('thyroid_cancer_risk_data.csv')

In [323]:
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Country,Ethnicity,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Thyroid_Cancer_Risk,Diagnosis
0,1,66,Male,Russia,Caucasian,No,Yes,No,No,No,No,9.37,1.67,6.16,1.08,Low,Benign
1,2,29,Male,Germany,Hispanic,No,Yes,No,No,No,No,1.83,1.73,10.54,4.05,Low,Benign
2,3,86,Male,Nigeria,Caucasian,No,No,No,No,No,No,6.26,2.59,10.57,4.61,Low,Benign
3,4,75,Female,India,Asian,No,No,No,No,No,No,4.1,2.62,11.04,2.46,Medium,Benign
4,5,35,Female,Germany,African,Yes,Yes,No,No,No,No,9.1,2.11,10.71,2.11,High,Benign


In [324]:
df = df.drop('Patient_ID', axis=1)

In [325]:
df['Diagnosis'].value_counts()

Diagnosis
Benign       163196
Malignant     49495
Name: count, dtype: int64

# Encode target column

In [326]:
df['Diagnosis'] = df['Diagnosis'].map({'Benign': 0, 'Malignant': 1})

In [327]:
df

Unnamed: 0,Age,Gender,Country,Ethnicity,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Thyroid_Cancer_Risk,Diagnosis
0,66,Male,Russia,Caucasian,No,Yes,No,No,No,No,9.37,1.67,6.16,1.08,Low,0
1,29,Male,Germany,Hispanic,No,Yes,No,No,No,No,1.83,1.73,10.54,4.05,Low,0
2,86,Male,Nigeria,Caucasian,No,No,No,No,No,No,6.26,2.59,10.57,4.61,Low,0
3,75,Female,India,Asian,No,No,No,No,No,No,4.10,2.62,11.04,2.46,Medium,0
4,35,Female,Germany,African,Yes,Yes,No,No,No,No,9.10,2.11,10.71,2.11,High,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212686,58,Female,India,Asian,No,No,No,No,Yes,No,2.00,0.64,11.92,1.48,Low,0
212687,89,Male,Japan,Middle Eastern,No,No,No,No,Yes,No,9.77,3.25,7.30,4.46,Medium,0
212688,72,Female,Nigeria,Hispanic,No,No,No,No,No,Yes,7.72,2.44,8.71,2.36,Medium,0
212689,85,Female,Brazil,Middle Eastern,No,No,No,No,No,Yes,5.62,2.53,9.62,1.54,Medium,0


# Data preprocessing

In [328]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [329]:
ordinal_features_yes_no = ['Family_History', 'Radiation_Exposure', 'Iodine_Deficiency', 'Smoking', 'Obesity', 'Diabetes']
ordinal_features_cancer_risk = ["Thyroid_Cancer_Risk"]
one_hot_features = ["Gender", "Country", "Ethnicity"]

In [330]:
cat_features = ordinal_features_yes_no + one_hot_features

In [331]:
yes_no_categories = [["No", "Yes"]] * len(ordinal_features_yes_no)
cancer_risk_categories = [['Low', 'Medium', 'High']]

In [332]:
ordinal_transformer_yes_no = OrdinalEncoder(categories=yes_no_categories)
ordinal_transformer_cancer_risk = OrdinalEncoder(categories=cancer_risk_categories)
one_hot_transformer = OneHotEncoder(handle_unknown='ignore')

In [333]:
df.drop(cat_features, axis=1).columns.values

array(['Age', 'TSH_Level', 'T3_Level', 'T4_Level', 'Nodule_Size',
       'Thyroid_Cancer_Risk', 'Diagnosis'], dtype=object)

In [334]:
num_cols = ['Age', 'TSH_Level', 'T3_Level', 'T4_Level', 'Nodule_Size']

In [335]:
preprocessor = ColumnTransformer(transformers=[
    ('ordinal_yes_no', ordinal_transformer_yes_no, ordinal_features_yes_no),
    ('ordinal_cancer_risk', ordinal_transformer_cancer_risk, ordinal_features_cancer_risk),
    ('onehot', one_hot_transformer, one_hot_features),
    ('num', 'passthrough', num_cols)
])

# Model Building & Validation

In [336]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [337]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())]
)

In [338]:
X = df.drop('Diagnosis', axis=1)
y = df['Diabetes']

In [339]:
cross_val_score(pipeline, X, y)

array([1., 1., 1., 1., 1.])

In [340]:
import joblib

# Saving Model

In [341]:
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']