In [64]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [65]:
# Show every column when a DataFrame is displayed (no column elision).
pd.set_option('display.max_columns', None)

# Read the training data and discard the 'id' column.
data = pd.read_csv('train.csv').drop(columns=['id'])
data

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,55.0,170.0,80.0,92.0,1.2,0.8,1.0,1.0,129.0,74.0,82.0,175.0,58.0,49.0,114.0,15.4,1.0,0.9,20.0,23.0,13.0,0.0,0.0
1,30.0,170.0,80.0,94.0,1.5,1.5,1.0,1.0,128.0,84.0,95.0,188.0,128.0,51.0,111.0,16.2,1.0,0.9,30.0,39.0,92.0,0.0,1.0
2,45.0,170.0,75.0,84.0,1.0,1.0,1.0,1.0,124.0,80.0,98.0,192.0,138.0,52.0,112.0,14.6,2.0,0.9,20.0,20.0,50.0,0.0,1.0
3,55.0,150.0,55.0,85.0,0.9,0.5,1.0,1.0,123.0,79.0,95.0,193.0,65.0,61.0,119.0,13.4,1.0,0.8,25.0,20.0,18.0,0.0,0.0
4,45.0,160.0,55.0,72.0,0.5,0.6,1.0,1.0,117.0,76.0,103.0,197.0,81.0,61.0,120.0,13.9,1.0,0.7,20.0,26.0,10.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,40.0,170.0,70.0,81.0,1.0,1.0,1.0,1.0,130.0,79.0,91.0,181.0,133.0,47.0,106.0,15.8,1.0,0.9,19.0,20.0,25.0,1.0,1.0
14996,40.0,155.0,50.0,75.0,1.0,1.2,1.0,1.0,100.0,60.0,83.0,169.0,59.0,51.0,106.0,14.1,1.0,0.8,22.0,17.0,20.0,0.0,0.0
14997,45.0,160.0,55.0,81.0,1.2,0.8,1.0,1.0,100.0,60.0,91.0,150.0,68.0,39.0,97.0,16.2,1.0,0.9,25.0,20.0,30.0,0.0,0.0
14998,50.0,160.0,60.0,80.0,0.7,1.0,1.0,1.0,120.0,80.0,90.0,258.0,97.0,88.0,151.0,14.5,1.0,0.9,19.0,13.0,11.0,0.0,0.0


# Feature Engineering

In [66]:

# Feature engineering: derive new columns from the raw measurements.
# The columns are appended to `data` in place, in the order listed below.
height_cm = data['height(cm)']
weight_kg = data['weight(kg)']
waist_cm = data['waist(cm)']

engineered = {
    # Body-mass index: weight in kg over (height in metres) squared.
    'BMI': weight_kg / (height_cm / 100) ** 2,
    # Height-to-waist and weight-to-waist ratios.
    'Height_to_Waist_Ratio': height_cm / waist_cm,
    'Weight_to_Waist_Ratio': weight_kg / waist_cm,
    # Absolute left/right gaps for eyesight and hearing.
    'Eyesight_Difference': (data['eyesight(left)'] - data['eyesight(right)']).abs(),
    'Hearing_Difference': (data['hearing(left)'] - data['hearing(right)']).abs(),
    # NOTE(review): labelled a blood-pressure difference, yet it subtracts
    # waist(cm) from systolic; 'relaxation' was presumably intended. The
    # test-set cell builds this feature the same way, so any correction has
    # to be made in both places and the model re-trained.
    'Blood_Pressure_Difference': (data['systolic'] - waist_cm).abs(),
    # Sum of HDL and LDL only (distinct from the dataset's 'Cholesterol' column).
    'Total_Cholesterol': data['HDL'] + data['LDL'],
    # Hemoglobin relative to serum creatinine.
    'Hemoglobin_Serum_Ratio': data['hemoglobin'] / data['serum creatinine'],
    # Enzyme ratios: ALT over AST, and Gtp over ALT.
    'ALT_AST_Ratio': data['ALT'] / data['AST'],
    'GTP_ALT_Ratio': data['Gtp'] / data['ALT'],
    # log(1 + weight).
    'log_weight': np.log1p(weight_kg),
}
# Interaction feature: height multiplied by the BMI computed above.
engineered['interaction_1'] = height_cm * engineered['BMI']

for column_name, column_values in engineered.items():
    data[column_name] = column_values


# Ma'lumotlarni ajratish

In [67]:
SEED = 1

# Separate the target column ('smoking') from the feature columns.
y = data['smoking']
X = data.drop(columns='smoking')

# Hold out 20% of the rows for evaluation, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)


# Modelni train  va predict qilish
 

In [68]:
# Stratified 5-fold splitter, shared by feature selection and by stacking.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Base learner 1: bagging over depth-8 decision trees.
dt = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=8,
    random_state=SEED,
)
bc = BaggingClassifier(
    estimator=dt,
    n_estimators=379,
    bootstrap=True,
    n_jobs=-1,
    random_state=SEED,
)

# Base learner 2: random forest. The same object is also handed to RFECV
# below as the estimator used for feature selection.
rf = RandomForestClassifier(n_estimators=450, n_jobs=-1, random_state=SEED)

# Stacking: logistic regression as the final estimator over the two base learners.
base_learners = [('bc', bc), ('rf', rf)]
meta_learner = LogisticRegression(C=0.1, max_iter=1000, random_state=SEED)
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=skf,
    n_jobs=-1,
)

# Pipeline: recursive feature elimination with CV (scored by ROC AUC,
# step of 2), followed by the stacking model.
feature_selector = RFECV(
    estimator=rf,
    step=2,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1,
)
pipe = Pipeline([
    ('rfecv', feature_selector),
    ('stacking', stacking_model),
])

# Fit on the training split.
pipe.fit(X_train, y_train)

# Predict labels and the probability column at index 1 for the held-out split.
y_pred = pipe.predict(X_test)
y_pred_proba = pipe.predict_proba(X_test)[:, 1]

# Evaluate on the held-out split.
print(f"Pipeline Model Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Pipeline Model ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")


Pipeline Model Accuracy: 0.8083
Pipeline Model ROC AUC: 0.8916


# Test to'plami uchun feature engineering qilish

In [71]:

# Load the test data, dropping any row whose 'id' is missing.
df_test = pd.read_csv('test.csv').dropna(subset=['id'])

# The test set needs the same engineered columns as the training frame,
# created in the same order.
test_height_cm = df_test['height(cm)']
test_weight_kg = df_test['weight(kg)']
test_waist_cm = df_test['waist(cm)']

test_engineered = {
    # Body-mass index: weight in kg over (height in metres) squared.
    'BMI': test_weight_kg / (test_height_cm / 100) ** 2,
    # Height-to-waist and weight-to-waist ratios.
    'Height_to_Waist_Ratio': test_height_cm / test_waist_cm,
    'Weight_to_Waist_Ratio': test_weight_kg / test_waist_cm,
    # Absolute left/right gaps for eyesight and hearing.
    'Eyesight_Difference': (df_test['eyesight(left)'] - df_test['eyesight(right)']).abs(),
    'Hearing_Difference': (df_test['hearing(left)'] - df_test['hearing(right)']).abs(),
    # NOTE(review): labelled a blood-pressure difference, yet it subtracts
    # waist(cm) from systolic; 'relaxation' was presumably intended. The
    # training cell builds this feature the same way, so any correction has
    # to be made in both places and the model re-trained.
    'Blood_Pressure_Difference': (df_test['systolic'] - test_waist_cm).abs(),
    # Sum of HDL and LDL only.
    'Total_Cholesterol': df_test['HDL'] + df_test['LDL'],
    # Hemoglobin relative to serum creatinine.
    'Hemoglobin_Serum_Ratio': df_test['hemoglobin'] / df_test['serum creatinine'],
    # Enzyme ratios: ALT over AST, and Gtp over ALT.
    'ALT_AST_Ratio': df_test['ALT'] / df_test['AST'],
    'GTP_ALT_Ratio': df_test['Gtp'] / df_test['ALT'],
    # log(1 + weight).
    'log_weight': np.log1p(test_weight_kg),
}
# Interaction feature: height multiplied by the BMI computed above.
test_engineered['interaction_1'] = test_height_cm * test_engineered['BMI']

for test_column_name, test_column_values in test_engineered.items():
    df_test[test_column_name] = test_column_values


# Natijani yuklash

In [72]:

# Predict probabilities (column index 1) for the test set with the fitted
# pipeline. The '_rf' suffix in the variable name is historical: the values
# come from `pipe`, i.e. feature selection followed by the stacking model.
test_features = df_test.drop(columns=['id'])
y_test_prob_rf = pipe.predict_proba(test_features)[:, 1]

# Write the predictions into the sample-submission template and save it.
# The assignment is positional: row i of the template receives prediction i.
subm = pd.read_csv("sample_submission.csv")
subm['smoking'] = y_test_prob_rf
subm.to_csv("Haqnazar_submission.csv", index=False)
