In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss
import numpy as np
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the competition training table. The 'Id' column is kept separately in
# `ids` and removed from the working frame so it is never used as a predictor.
data = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
ids = data['Id']
data = data.drop(columns='Id')

# Predictor columns used throughout the notebook (43 names), and the label column.
features = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ',
    'BC', 'BN', 'BQ', 'BR', 'BZ',
    'CB', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
    'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EP', 'EU',
    'FE', 'FI', 'FR',
    'GB', 'GE', 'GF', 'GH', 'GI',
]
target = 'Class'

In [3]:
# Treat +/-inf as missing, then fill every gap with that column's own median.
# Done for all predictor columns at once instead of looping column by column.
finite_features = data[features].replace([np.inf, -np.inf], np.nan)
data[features] = finite_features.fillna(finite_features.median())

# Design matrix and label vector used by the cells below.
X = data[features]
y = data[target]

In [4]:
# Hold out 30% of the rows; the fixed seed keeps the partition reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Hyper-parameter grid for the random forest. `random_state` is a single-value
# entry so that it is carried through into `best_params` for later cells.
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 10, 20, 30],
    random_state=[42],
)

# 3-fold grid search on the training split, scored by negative log loss.
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, scoring='neg_log_loss', cv=3)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Best parameters: {'max_depth': None, 'n_estimators': 200, 'random_state': 42}


In [6]:
# Fit a reference forest on the training split purely to rank the predictors.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
importances = rf.feature_importances_

# Label each importance with the column the forest was actually fitted on.
# Using X_train.columns (rather than the global `features` list) keeps names
# and scores aligned even if X_train has since been narrowed to a subset of
# columns, where the full list would no longer match in length.
feature_importances = pd.DataFrame({
    'Feature': list(X_train.columns),
    'Importance': importances,
})
feature_importances = feature_importances.sort_values('Importance', ascending=False)
print(feature_importances)

   Feature  Importance
32      EH    0.076214
20      DA    0.063733
10      BQ    0.060850
37      FR    0.060201
24      DI    0.054527
17      CR    0.048905
35      FE    0.034885
0       AB    0.034456
8       BC    0.031887
36      FI    0.029591
30      EE    0.029191
26      DN    0.027987
1       AF    0.027546
19      CU    0.025733
23      DH    0.023904
21      DE    0.023131
41      GH    0.019282
11      BR    0.018828
9       BN    0.018530
40      GF    0.018388
15      CH    0.018107
3       AM    0.017737
25      DL    0.015982
38      GB    0.014852
18      CS    0.014747
14      CF    0.014427
29      EB    0.014411
5       AX    0.013953
42      GI    0.013948
31      EG    0.013795
13      CB    0.012156
28      DY    0.012029
27      DV    0.011859
7       AZ    0.010564
34      EU    0.010277
6       AY    0.009848
22      DF    0.009468
4       AR    0.009308
33      EP    0.007833
2       AH    0.007823
16      CL    0.006698
12      BZ    0.006306
39      GE 

In [7]:
# Keep only the names of predictors whose importance exceeds 0.01.
above_threshold = feature_importances['Importance'] > 0.01
important_features = feature_importances.loc[above_threshold, 'Feature']

In [8]:
# Restrict both partitions to the selected predictor columns.
X_train, X_test = (part[important_features] for part in (X_train, X_test))

In [9]:
# Learn the min-max scaling from the training split only, then apply the same
# fitted scaler to both partitions.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the final (not yet fitted) forest from the grid-search winners.
tuned_kwargs = {
    key: best_params[key]
    for key in ('n_estimators', 'max_depth', 'random_state')
}
model = RandomForestClassifier(**tuned_kwargs)

In [10]:
# 5-fold cross-validation on the scaled training split. The scorer returns
# negated log loss, so the sign is flipped back before reporting.
scores = cross_val_score(model, X_train_scaled, y_train, scoring='neg_log_loss', cv=5)
log_loss_scores = np.negative(scores)
average_log_loss = log_loss_scores.mean()

print(f'Log Loss scores for each fold: {log_loss_scores}')
print(f'Average Log Loss: {average_log_loss}')

Log Loss scores for each fold: [0.24505786 0.27849377 0.35209726 0.27951359 0.3002465 ]
Average Log Loss: 0.2910817928967755


In [11]:
# Fit the final forest on the scaled, feature-selected training split.
model.fit(X_train_scaled, y_train)

# Load the competition test set and set its identifier column aside.
test_data = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
test_ids = test_data['Id']
test_data = test_data.drop('Id', axis=1)

# Treat +/-inf as missing, as was done for the training frame.
test_data = test_data.replace([np.inf, -np.inf], np.nan)

# Fill gaps in the model's predictor columns with medians taken from the
# training frame, so test rows are imputed with the same statistics as the
# data the model was fitted on. `data[features]` was already median-filled in
# an earlier cell, which leaves each column's median unchanged. Unlike the
# test set's own medians, these stay defined even when a test column is
# entirely missing.
train_medians = data[features].median()
test_data = test_data.fillna(train_medians)

# Any remaining numeric columns (not listed in `features`) still fall back to
# the test set's own medians, as before.
test_data = test_data.fillna(test_data.median(numeric_only=True))

In [12]:
# Apply the training-fitted scaler to the selected test columns and predict
# class probabilities with the final model.
selected_test = test_data[important_features]
test_data_scaled = scaler.transform(selected_test)
test_preds = model.predict_proba(test_data_scaled)

# Assemble the submission: one row per test Id, one probability column per class.
submission = pd.DataFrame({'Id': test_ids})
submission['class_0'] = test_preds[:, 0]
submission['class_1'] = test_preds[:, 1]
submission.to_csv('submission.csv', index=False)