In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import warnings
# Globally ignore every warning for the rest of the session. No category or
# module is given, so this also hides deprecation and data-related warnings
# raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [72]:
# Load the competition data. Both CSVs live in the same input directory, so
# the directory is spelled once and reused; this also removes the stray
# leading '//' that the test path carried, keeping both paths consistent.
DATA_DIR = '/kaggle/input/competitions/playground-series-s6e2'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')


In [73]:
# Report the (rows, columns) shape of each loaded frame.
for label, frame in (('size train-', train), ('size test-', test)):
    print(label, frame.shape)

size train- (630000, 15)
size test- (270000, 14)


In [74]:
# Total number of missing cells in each frame (the printed labels are Russian
# for "missing values in train/test").
for label, frame in (('пропуски в train-', train), ('пропуски в test-', test)):
    missing_total = frame.isna().sum().sum()
    print(label, missing_total)

пропуски в train- 0
пропуски в test- 0


In [75]:
# List the column names of both frames so their schemas can be compared.
train_columns = list(train.columns)
test_columns = list(test.columns)
print('train:', train_columns)
print('test:', test_columns)

train: ['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease']
test: ['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium']


In [76]:
# Class balance of the raw target labels.
raw_label_counts = train['Heart Disease'].value_counts()
print(raw_label_counts)

Heart Disease
Absence     347546
Presence    282454
Name: count, dtype: int64


In [77]:
# Encode the target as integers: 'Presence' -> 1, 'Absence' -> 0.
#
# Series.map replaces every value that is missing from the mapping with NaN,
# so running this cell a second time (when the column already holds 0/1) would
# silently wipe the target. The mapping is therefore applied only while the
# column is still non-numeric, and any label the mapping does not cover raises
# instead of turning into NaN.
if not pd.api.types.is_numeric_dtype(train['Heart Disease']):
    encoded_target = train['Heart Disease'].map({'Presence': 1, 'Absence': 0})
    unmapped = encoded_target.isnull()
    if unmapped.any():
        unknown_labels = train.loc[unmapped, 'Heart Disease'].unique().tolist()
        raise ValueError(f'unexpected Heart Disease labels: {unknown_labels}')
    train['Heart Disease'] = encoded_target.astype(int)

In [78]:
# Class balance again, to compare against the counts printed before encoding.
encoded_label_counts = train['Heart Disease'].value_counts()
print(encoded_label_counts)

Heart Disease
0    347546
1    282454
Name: count, dtype: int64


In [79]:
# Keep the label column on its own before it is dropped from the feature frame.
target = train.loc[:, 'Heart Disease']

In [80]:
# Reduce both frames to feature columns only: the row id is not a feature, and
# the label must not remain among the training inputs. The test ids are saved
# first because the submission cell needs them.
test_ids = test['id']
train = train.drop(columns=['id', 'Heart Disease'])
test = test.drop(columns=['id'])

In [81]:
# Shapes after dropping id/target, printed so the column counts can be compared.
for frame in (train, test):
    print(frame.shape)

(630000, 13)
(270000, 13)


In [82]:
def add_engineered_features(frame):
    """Return a new frame with the derived feature columns appended.

    The same formulas are applied to train and test through this one helper,
    so the two feature sets cannot drift apart through copy-paste edits.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'Age', 'ST depression', 'Cholesterol',
        'Max HR', 'BP' and 'Slope of ST'.

    Returns
    -------
    pandas.DataFrame
        ``frame`` plus, in this order: age_old, chol_age, max_hr_age, bp_chol,
        age_bp, old_slope, resting_bp_risk, chol_risk. The input frame is not
        modified.
    """
    return frame.assign(
        # Interaction and ratio features.
        age_old=frame['Age'] * frame['ST depression'],
        chol_age=frame['Cholesterol'] / (frame['Age'] + 1),
        max_hr_age=frame['Max HR'] - frame['Age'],
        # The divisors only rescale the products to smaller magnitudes.
        bp_chol=frame['BP'] * frame['Cholesterol'] / 1000000,
        age_bp=frame['Age'] * frame['BP'] / 100,
        old_slope=frame['ST depression'] * frame['Slope of ST'],
        # 0/1 flags: 1 when the value is strictly above the threshold.
        resting_bp_risk=(frame['BP'] > 140).astype(int),
        chol_risk=(frame['Cholesterol'] > 240).astype(int),
    )


train = add_engineered_features(train)
test = add_engineered_features(test)

In [83]:
# Hold out 20% of the rows for validation. The split is stratified on the
# target and uses a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=target)
X_train, X_val, y_train, y_val = train_test_split(train, target, **split_options)

In [84]:
# Random forest configuration, collected in one dict so the hyperparameters
# are easy to inspect and adjust. n_jobs=-1 requests all available cores.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 20,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestClassifier(**rf_params)

In [85]:
# Fit the forest on the training part of the split.
model.fit(X=X_train, y=y_train)

In [86]:
# Score the held-out rows with ROC AUC. Column 1 of predict_proba is used as
# the score; presumably that is the probability of label 1, given the 0/1
# target encoding (confirm against model.classes_).
val_proba = model.predict_proba(X_val)
val_pred = val_proba[:, 1]
val_score = roc_auc_score(y_val, val_pred)
print(round(val_score, 4))

0.9532


In [87]:
# Top-10 features by importance.
# feature_importances_ has one entry per column the model was fitted on, so
# the names are taken from X_train (the frame passed to fit) rather than from
# `train`. They then stay aligned even if `train` gains or loses columns after
# fitting.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_,
})
importances = importance_table.sort_values('importance', ascending=False).head(10)
print(importances)

                    feature  importance
12                 Thallium    0.280604
2           Chest pain type    0.177346
11  Number of vessels fluro    0.101346
8           Exercise angina    0.084855
15               max_hr_age    0.069358
18                old_slope    0.064126
13                  age_old    0.055568
7                    Max HR    0.046873
10              Slope of ST    0.039030
9             ST depression    0.031013


In [88]:
# Predict on the test features and keep column 1 of the probability matrix,
# the same column used for the validation score.
test_proba = model.predict_proba(test)
test_pred = test_proba[:, 1]

In [89]:
# Build the submission table: the saved test ids next to the predicted scores.
submission = pd.DataFrame({'id': test_ids}).assign(**{'Heart Disease': test_pred})

In [90]:
# Write the submission without the index column, then echo its shape and
# first rows as a sanity check (the printed Russian word means "saved").
submission.to_csv(path_or_buf='submission.csv', index=False)
submission_shape = submission.shape
print('\nсохранили!size submission:', submission_shape)
submission_preview = submission.head()
print(submission_preview)
print('\nend')


сохранили!size submission: (270000, 2)
       id  Heart Disease
0  630000       0.738526
1  630001       0.013304
2  630002       0.959935
3  630003       0.009144
4  630004       0.408561

end
