In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Read both competition splits and drop the `id` column, so it is not part of
# the frames that later cells turn into model features.
train = pd.read_csv('train.csv').drop(columns=['id'])
test = pd.read_csv('test.csv').drop(columns=['id'])
train.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,55.0,170.0,80.0,92.0,1.2,0.8,1.0,1.0,129.0,74.0,...,49.0,114.0,15.4,1.0,0.9,20.0,23.0,13.0,0.0,0.0
1,30.0,170.0,80.0,94.0,1.5,1.5,1.0,1.0,128.0,84.0,...,51.0,111.0,16.2,1.0,0.9,30.0,39.0,92.0,0.0,1.0
2,45.0,170.0,75.0,84.0,1.0,1.0,1.0,1.0,124.0,80.0,...,52.0,112.0,14.6,2.0,0.9,20.0,20.0,50.0,0.0,1.0
3,55.0,150.0,55.0,85.0,0.9,0.5,1.0,1.0,123.0,79.0,...,61.0,119.0,13.4,1.0,0.8,25.0,20.0,18.0,0.0,0.0
4,45.0,160.0,55.0,72.0,0.5,0.6,1.0,1.0,117.0,76.0,...,61.0,120.0,13.9,1.0,0.7,20.0,26.0,10.0,0.0,0.0


In [9]:
def create_features(df):
    """Add nine derived columns to ``df`` in place and return the same frame.

    Columns are appended in this order: ``BMI``, ``HW_Ratio``, ``HA_Ratio``,
    ``hemoglobin_height``, ``hemoglobin / Gtp``, ``cholesterol_ratio``,
    ``blood_pressure_category``, ``waist_height_ratio``, ``Liver_Enzyme_Ratio``.

    Args:
        df: DataFrame providing ``age``, ``height(cm)``, ``weight(kg)``,
            ``waist(cm)``, ``hemoglobin``, ``Gtp``, ``HDL``, ``LDL``,
            ``systolic``, ``AST`` and ``ALT``. The frame is modified in place.

    Returns:
        The very same ``df`` object, now carrying the derived columns.

    Notes:
        * The ratios are plain divisions; nothing guards against a zero
          denominator.
        * ``HW_Ratio`` and ``waist_height_ratio`` are reciprocals of each other.
    """
    height_cm = df['height(cm)']
    waist_cm = df['waist(cm)']
    hemoglobin = df['hemoglobin']
    height_m = height_cm / 100  # BMI is defined on height in metres

    derived = {
        'BMI': df['weight(kg)'] / height_m ** 2,
        'HW_Ratio': height_cm / waist_cm,
        'HA_Ratio': height_cm / df['age'],
        'hemoglobin_height': hemoglobin * height_cm,
        'hemoglobin / Gtp': hemoglobin / df['Gtp'],
        'cholesterol_ratio': df['HDL'] / df['LDL'],
        # Systolic pressure bucketed on the edges 0 / 120 / 140 / inf and
        # labelled 0, 1, 2 from lowest to highest bucket.
        'blood_pressure_category': pd.cut(
            df['systolic'],
            bins=[0, 120, 140, np.inf],
            labels=[0, 1, 2],
        ),
        'waist_height_ratio': waist_cm / height_cm,
        'Liver_Enzyme_Ratio': df['AST'] / df['ALT'],
    }

    # Assign one column at a time so the caller's frame is mutated and the
    # new columns land in the documented order.
    for column_name, values in derived.items():
        df[column_name] = values

    return df

# create_features mutates its argument and returns that same object, so
# rebinding leaves `train` / `test` pointing at the same, now enriched, frames
# while making the dataflow explicit.
train = create_features(train)
test = create_features(test)
test.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,dental caries,BMI,HW_Ratio,HA_Ratio,hemoglobin_height,hemoglobin / Gtp,cholesterol_ratio,blood_pressure_category,waist_height_ratio,Liver_Enzyme_Ratio
0,40.0,175.0,70.0,84.0,1.5,1.5,1.0,1.0,120.0,59.0,...,0.0,22.857143,2.083333,4.375,2800.0,0.64,0.412844,0,0.48,1.666667
1,45.0,155.0,55.0,72.4,0.6,0.5,1.0,1.0,102.0,68.0,...,0.0,22.89282,2.140884,3.444444,1782.5,0.884615,0.513761,0,0.467097,1.4
2,40.0,160.0,55.0,76.0,1.2,1.2,1.0,1.0,115.0,64.0,...,0.0,21.484375,2.105263,4.0,2128.0,1.209091,1.040541,0,0.475,1.4
3,45.0,150.0,50.0,74.4,1.0,1.0,1.0,1.0,96.0,67.0,...,0.0,22.222222,2.016129,3.333333,2010.0,0.744444,1.036585,0,0.496,1.533333
4,35.0,185.0,80.0,90.0,0.8,1.0,1.0,1.0,113.0,73.0,...,0.0,23.374726,2.055556,5.285714,2886.0,0.78,0.561905,0,0.486486,0.615385


In [10]:
# Separate the `smoking` target from the predictor columns.
y = train['smoking']
X = train.drop(columns=['smoking'])

# Hold out 20% of the labelled rows; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base learner: degree-2 polynomial expansion -> standardisation -> logistic
# regression.
logistic_pipeline = Pipeline(steps=[
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter=500, random_state=42, n_jobs=-1)),
])

# Meta learner: a random forest placed on top of the base learner.
final_forest = RandomForestClassifier(
    n_estimators=500,
    max_depth=13,
    criterion='log_loss',
    random_state=42,
    n_jobs=-1,
)

# Stack with 5-fold internal CV. passthrough=True hands the original feature
# columns to the final estimator alongside the base learner's predictions.
stacking_model = StackingClassifier(
    estimators=[('logistic', logistic_pipeline)],
    final_estimator=final_forest,
    cv=5,
    passthrough=True,
    n_jobs=-1,
)

# Train on the training split only.
stacking_model.fit(X_train, y_train)

# Column 1 of predict_proba is taken as the score for the hold-out rows.
holdout_proba = stacking_model.predict_proba(X_test)
pred_proba = holdout_proba[:, 1]

# Hold-out ROC-AUC.
roc_auc = roc_auc_score(y_test, pred_proba)
print(f'ROC AUC Score: {roc_auc}')

# 5-fold cross-validated ROC-AUC over all labelled rows.
cv_scores = cross_val_score(
    stacking_model,
    X,
    y,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
print("Cross-Validation ROC-AUC Scores:", cv_scores)
print("Mean ROC-AUC Score:", cv_scores.mean())

# Refit on every labelled row before predicting the competition test set.
# No AUC is computed after this refit: X_test is a subset of X, so scoring it
# now would measure performance on training rows and look optimistic. The
# hold-out and cross-validation scores printed earlier are the estimates to
# report.
stacking_model.fit(X, y)

sub = pd.read_csv('sample_submission.csv')
# Select the test columns by the training column list so the model receives
# exactly the features, in exactly the order, it was fitted on; a missing
# column fails here with a KeyError naming it.
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv — confirm.
sub["smoking"] = stacking_model.predict_proba(test[X.columns])[:, 1]
sub.to_csv("submission_best.csv", index = False)

ROC AUC Score: 0.8888831939010476
Cross-Validation ROC-AUC Scores: [0.89316748 0.88869133 0.89367808 0.88053299 0.88314659]
Mean ROC-AUC Score: 0.8878432938666926
