In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the labelled training table; the 'Id' column is an identifier, not a
# predictor, so it is split off into `ids` and removed from `data`.
data = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
ids = data.pop('Id')

# Columns fed to the model (grouped by leading letter purely for readability).
features = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ',
    'BC', 'BN', 'BQ', 'BR', 'BZ',
    'CB', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
    'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EP', 'EU',
    'FE', 'FI', 'FR',
    'GB', 'GE', 'GF', 'GH', 'GI',
]

# Name of the label column.
target = 'Class'

In [3]:
# Treat +/-inf as missing, then fill every gap in a feature column with that
# column's median. The medians are computed over all rows of `data`.
features_clean = data[features].replace([np.inf, -np.inf], np.nan)
data[features] = features_clean.fillna(features_clean.median())

# Design matrix and label vector used by the rest of the notebook.
X = data[features]
y = data[target]

In [4]:
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Learn the per-feature min/max from the training rows only, then apply the
# same mapping to both partitions so the hold-out rows never influence it.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Random forest with capped tree depth and a minimum leaf size; seeded so
# repeated runs build the same trees.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42,
)

In [7]:
# 10-fold cross-validation on the scaled training rows. The scorer reports
# the *negated* log loss, so the sign is flipped back before reporting.
scores = cross_val_score(
    model,
    X_train_scaled,
    y_train,
    cv=10,
    scoring='neg_log_loss',
)
log_loss_scores = np.negative(scores)
average_log_loss = log_loss_scores.mean()

print(f'Log Loss scores for each fold: {log_loss_scores}')
print(f'Average Log Loss: {average_log_loss}')

Log Loss scores for each fold: [0.24814792 0.27498943 0.23762069 0.34042647 0.39100394 0.2576977
 0.31916388 0.26687743 0.30042487 0.3348471 ]
Average Log Loss: 0.29711994269813485


In [8]:
# Train the forest on the scaled training partition. Left as the cell's last
# expression so the fitted estimator is displayed.
model.fit(X=X_train_scaled, y=y_train)

In [9]:
# Load the unlabeled test table and keep its identifiers for the submission.
test_data = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
test_ids = test_data['Id']
test_data = test_data.drop('Id', axis=1)

# Same cleaning rule as for the training frame: +/-inf counts as missing.
test_data = test_data.replace([np.inf, -np.inf], np.nan)

# Impute with the medians of the *training* frame rather than statistics of
# the test file itself. This keeps test preprocessing identical to what the
# scaler and model saw at fit time, and still yields a finite value when a
# test column has no observed entries (its own median would be NaN).
# Filling a column with its median leaves that median unchanged, so these are
# the same values that were used to impute the training frame.
train_medians = data[features].median()
test_data[features] = test_data[features].fillna(train_medians)

# Reuse the scaler fitted on the training rows, then predict class
# probabilities (one column per class, in `model.classes_` order).
test_data_scaled = scaler.transform(test_data[features])
test_preds = model.predict_proba(test_data_scaled)

In [10]:
# Assemble the submission: one row per test Id followed by the predicted
# probability of each class, then write it without the index column.
proba_frame = pd.DataFrame(
    test_preds,
    columns=['class_0', 'class_1'],
    index=test_ids.index,
)
submission = pd.concat([test_ids.to_frame(name='Id'), proba_frame], axis=1)
submission.to_csv('submission.csv', index=False)