# 1. Problem Information
- **Name:** [**Diabetes Diagnosis Based on Blood Tests**](https://platform.olimpiada-ai.ro/en/problems/22)
- **Date:** 12/02/2026
- **Type:** Binary Classification

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.model_selection import cross_val_score,RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

# 3. Data Preparation

In [2]:
def ProcessData(df, imputer=None, fit=True):
    """Impute zero-coded measurements in ``df`` in place.

    Zeros in the listed measurement columns are treated as missing values:
    they are replaced with NaN and then filled by the imputer. Columns
    outside that list are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns "glucose", "blood_pressure",
        "skin_thickness", "insulin" and "bmi". It is modified in place.
    imputer : object, optional
        Imputer exposing ``fit_transform`` / ``transform``. When omitted, a
        new ``KNNImputer(n_neighbors=5)`` is created (the original behaviour).
    fit : bool, default True
        If True the imputer is fitted on ``df`` before transforming. Pass
        False together with an already fitted ``imputer`` to apply
        previously learned statistics, e.g. train-fitted imputation on the
        test set.

    Returns
    -------
    object
        The imputer that was used, so it can be reused on another frame.
    """
    cols_with_missing = [
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
    ]
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5)

    # 0 acts as the missing-value placeholder in these columns.
    masked = df[cols_with_missing].replace(0, np.nan)
    if fit:
        filled = imputer.fit_transform(masked)
    else:
        filled = imputer.transform(masked)

    df[cols_with_missing] = filled
    return imputer

In [3]:
# Read both splits, then impute each one in place.
# NOTE(review): ProcessData fits its own KNNImputer on whichever frame it is
# given, so train and test are imputed independently of each other.
train, test = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")
for frame in (train, test):
    ProcessData(frame)

print(train.shape)
train.head(5)

(537, 10)


Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,target,SampleID
0,7,184.0,84.0,33.0,192.6,35.5,0.355,41,1,210
1,6,85.0,78.0,23.4,54.0,31.2,0.382,42,0,177
2,2,106.0,64.0,35.0,119.0,30.5,1.4,34,0,148
3,2,100.0,54.0,28.0,105.0,37.8,0.498,24,0,455
4,5,104.0,74.0,25.0,92.6,28.8,0.153,48,0,637


In [4]:
# Summary statistics of the imputed training frame, rounded for readability.
train.describe().round(decimals=3)

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree_function,age,target,SampleID
count,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0
mean,3.883,121.67,72.247,28.641,145.509,32.621,0.481,33.551,0.348,384.115
std,3.35,29.973,12.233,8.899,91.071,6.746,0.335,11.795,0.477,222.32
min,0.0,57.0,24.0,7.0,16.0,18.2,0.084,21.0,0.0,1.0
25%,1.0,99.0,64.0,23.0,87.0,27.8,0.246,24.0,0.0,190.0
50%,3.0,117.0,72.0,28.6,126.0,32.4,0.385,30.0,0.0,378.0
75%,6.0,141.0,80.0,34.0,182.0,36.5,0.645,41.0,1.0,578.0
max,17.0,198.0,122.0,60.0,744.0,67.1,2.329,81.0,1.0,767.0


# 4. Models

In [5]:
# Select by column name instead of position so a change in CSV column order
# cannot silently move the target or the ID into the feature matrix.
NON_FEATURE_COLS = ["target", "SampleID"]
X = train.drop(columns=NON_FEATURE_COLS)
Y = train["target"]

In [6]:
# Scaler + forest in one pipeline, so the scaler is re-fitted on each CV training fold.
pipeline = make_pipeline(StandardScaler(),RandomForestClassifier(random_state=0))

# Keys use make_pipeline's "<lowercased step class>__<parameter>" naming.
params = {
    'randomforestclassifier__n_estimators':[50,100,200],
    'randomforestclassifier__max_depth':[None,2,5,8]
}
# RandomizedSearchCV evaluates only n_iter (default 10) of the 12 combinations;
# random_state pins which ones are drawn so the reported score is reproducible.
grid_search = RandomizedSearchCV(
    pipeline,
    params,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    random_state=0,
)
grid_search.fit(X,Y)

print("Best score:",grid_search.best_score_)
print("Best parmas:",grid_search.best_params_)

Best score: 0.6487157070322127
Best parmas: {'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__max_depth': 5}


In [7]:
best_model = grid_search.best_estimator_
# Feed the model exactly the training feature columns, by name and in the same
# order, instead of relying on SampleID being the last column of test.
# Assumes test contains every column of X under the same name.
prediction = best_model.predict(test[X.columns])

# 5. Submission

In [8]:
# One row per test sample: its SampleID next to the predicted label.
submission_columns = {"SampleID": test['SampleID'], "label": prediction}
submission = pd.DataFrame(submission_columns)
submission.head()

Unnamed: 0,SampleID,label
0,731,0
1,199,0
2,25,1
3,418,1
4,388,1


In [9]:
# Write the submission without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)