# 1. Problem Information
- **Name:** [**Alzheimer’s Diagnosis Prediction System**](https://platform.olimpiada-ai.ro/en/problems/63)
- **Date:** 12/02/2026
- **Type:** Binary Classification

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# 3. Data preparation

In [2]:
# Read the labelled training split and the unlabelled test split from disk.
train_path, test_path = "data/train.csv", "data/test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Quick sanity check: (rows, columns) of the training frame, then its first 5 rows.
print(train.shape)
train.head()

(1719, 34)


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,5531,86,0,2,1,28.155961,0,18.629164,6.535593,1.316452,...,5.566873,0,0,0.232938,0,0,0,0,1,0
1,6092,62,1,0,3,28.18921,1,7.763349,1.913491,6.023577,...,9.307896,0,0,0.442326,0,0,0,0,0,0
2,5571,75,0,0,0,25.391526,0,8.449637,9.549369,1.774418,...,4.001694,0,0,4.918146,0,0,0,0,0,1
3,5608,89,0,0,0,23.581751,0,1.939227,1.322465,8.758693,...,9.700073,1,0,5.15719,0,1,0,0,1,0
4,5344,76,1,0,2,38.76859,1,9.013919,8.296794,7.892595,...,8.626984,0,0,9.413984,0,0,0,0,1,0


In [3]:
# Per-column summary statistics of the training data, rounded to 3 decimals for display.
train_summary = train.describe()
train_summary.round(decimals=3)

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
count,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,...,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0,1719.0
mean,5820.436,74.902,0.505,0.703,1.286,27.683,0.292,10.111,4.907,5.033,...,5.117,0.205,0.146,4.979,0.205,0.154,0.151,0.158,0.306,0.354
std,621.194,9.02,0.5,0.999,0.901,7.231,0.455,5.764,2.852,2.912,...,2.922,0.404,0.353,2.961,0.404,0.361,0.358,0.365,0.461,0.478
min,4751.0,60.0,0.0,0.0,0.0,15.009,0.0,0.002,0.004,0.009,...,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0
25%,5285.5,67.0,0.0,0.0,1.0,21.624,0.0,5.199,2.576,2.492,...,2.553,0.0,0.0,2.283,0.0,0.0,0.0,0.0,0.0,0.0
50%,5827.0,75.0,1.0,0.0,1.0,27.912,0.0,9.995,4.733,5.16,...,5.174,0.0,0.0,5.026,0.0,0.0,0.0,0.0,0.0,0.0
75%,6351.0,83.0,1.0,1.0,2.0,33.855,1.0,15.256,7.394,7.579,...,7.647,0.0,0.0,7.604,0.0,0.0,0.0,0.0,1.0,1.0
max,6899.0,90.0,1.0,3.0,3.0,39.989,1.0,19.989,9.987,9.998,...,9.993,1.0,1.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0


# 4. Models

In [4]:
# Select features by column name instead of by position: everything except the
# patient identifier and the target. For the layout shown above (PatientID first,
# Diagnosis last) this is the same matrix as train.iloc[:, 1:-1], but it cannot
# silently include the ID or the label if the CSV column order ever changes.
X = train.drop(columns=['PatientID', 'Diagnosis'])
Y = train['Diagnosis']

# Standardise the features, then fit a gradient-boosted classifier.
# random_state=0 keeps the model reproducible across runs.
pipeline = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=0))

# 3-fold cross-validation. No `scoring` is passed, so the estimator's default
# scorer is used (mean accuracy for scikit-learn classifiers).
scores = cross_val_score(pipeline, X, Y, cv=3)
print(scores.mean())

0.944153577661431


In [5]:
# Refit the pipeline on the full training set before scoring the test set.
pipeline.fit(X, Y)

# Pick the test features by the training column names (and in the same order)
# rather than by position, so a differing test layout raises a KeyError instead
# of feeding misaligned columns to the model.
test_features = test[X.columns]

# predict_proba returns one column per class in sorted label order; column 1 is
# the probability of Diagnosis == 1.
predictions = pipeline.predict_proba(test_features)[:, 1]

# 5. Submission

In [6]:
# Subtask 1: for every test patient, the number of train patients with the same Age.
# NOTE(review): task definition inferred from the code - confirm against the problem statement.
# An Age that never occurs in train has a count of 0, so fill the unmatched lookups
# instead of leaving NaN in the submission.
age_counts = train['Age'].value_counts()
task1 = test['Age'].map(age_counts).fillna(0).astype(int)

# Subtask 2:
# (number of smokers in train with Age = v) / (total number of patients in train with Age = v) * 100
# Smoking is a 0/1 flag in the data shown, so its per-Age mean is the smoker fraction.
# Ages absent from train get 0.
smoker_pct_by_age = train.groupby('Age')['Smoking'].mean() * 100
task2 = test['Age'].map(smoker_pct_by_age).fillna(0)

In [7]:
def _subtask_frame(subtask_id, answers):
    """Build the submission rows for a single subtask.

    Parameters
    ----------
    subtask_id : int
        Value written to the ``subtaskID`` column for every row.
    answers : pandas.Series or array-like
        One answer per test patient, in the same order as ``test``.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``subtaskID``, ``datapointID`` (the test
        ``PatientID``) and ``answer``.
    """
    patient_ids = test['PatientID']
    return pd.DataFrame({
        "subtaskID": [subtask_id] * len(patient_ids),
        "datapointID": patient_ids,
        "answer": answers,
    })


# One frame per subtask, all keyed by the test PatientID.
df_task1 = _subtask_frame(1, task1)
df_task2 = _subtask_frame(2, task2)
df_task3 = _subtask_frame(3, predictions)

# Stack the subtasks; ignore_index gives the result a clean 0..n-1 index instead
# of repeating each frame's own index three times.
submission = pd.concat([df_task1, df_task2, df_task3], ignore_index=True)
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,5202,44.0
1,1,6831,66.0
2,1,6407,50.0
3,1,5821,64.0
4,1,5581,56.0


In [8]:
# Write the stacked subtask answers to disk; the DataFrame index is not part of
# the submission format, so it is left out of the file.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)