# 1. Problem Information
- **Name:** [**Analysis and Classification of Parkinson's Risks and Symptoms**](https://platform.olimpiada-ai.ro/en/problems/79)
- **Date:** 13/02/2026
- **Type:** Binary Classification

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.model_selection import cross_val_score,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from catboost import CatBoostClassifier

# 3. Data preparation

In [2]:
def ProcessData(df):
    """Return a feature-engineered copy of ``df``.

    The ``EyeColor`` and ``Gender`` columns are removed and two count-style
    columns are appended, in this order:

    * ``CardiometabolicRiskScore`` - number of true conditions among
      ``Hypertension == 1``, ``Diabetes == 1`` and ``BMI > 30``.
    * ``LifestyleRiskIndex`` - number of true conditions among
      ``Smoking == 1``, ``AlcoholConsumption > 2`` and ``PhysicalActivity < 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument is left untouched, so calling the function
        again on the same input (e.g. when a cell is re-run) gives the same result.

    Raises
    ------
    KeyError
        If ``EyeColor`` or ``Gender`` is missing from ``df``.
    """
    # drop() without inplace=True returns a new frame, so the caller's data is not modified.
    features = df.drop(columns=['EyeColor', 'Gender'])

    # Each comparison yields a boolean Series; casting to int lets them be summed as 0/1 flags.
    features['CardiometabolicRiskScore'] = (
        (features.Hypertension == 1).astype(int)
        + (features.Diabetes == 1).astype(int)
        + (features.BMI > 30).astype(int)
    )
    features['LifestyleRiskIndex'] = (
        (features.Smoking == 1).astype(int)
        + (features.AlcoholConsumption > 2).astype(int)
        + (features.PhysicalActivity < 1).astype(int)
    )
    return features
    
# Read both competition splits from the local data directory.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Apply the same feature engineering to each split.
train = ProcessData(train)
test = ProcessData(test)

# Move the Diagnosis label to the last position so that the feature columns
# can later be taken with a positional slice.
ordered_columns = [name for name in train.columns if name != 'Diagnosis']
ordered_columns.append('Diagnosis')
train = train[ordered_columns]

print(train.shape)
train.head(5)

(1684, 39)


Unnamed: 0,PatientID,Age,AgeGroup,AlcoholConsumption,BMI,Bradykinesia,CholesterolHDL,CholesterolLDL,CholesterolTotal,CholesterolTriglycerides,...,Smoking,SpeechProblems,Stroke,SystolicBP,TraumaticBrainInjury,Tremor,UPDRS,CardiometabolicRiskScore,LifestyleRiskIndex,Diagnosis
0,d4e0fb07-37c6-4c7d-a9d5-bfadc41b8164,56,1,14.40175,38.165782,False,98.305359,185.601755,214.446455,177.613258,...,0,True,0,173,0,False,114.941744,1,1,0
1,97cb29cd-aa0c-4802-80ee-4fb82c0d5059,84,3,15.545237,33.877785,False,29.089431,130.446298,168.545178,237.987107,...,0,False,0,111,0,True,191.992824,1,1,1
2,d9584f0e-f6fb-4821-8737-679a25f4c5cd,53,1,5.942235,30.111818,False,40.764986,186.558645,291.316103,342.071323,...,0,False,0,161,0,False,121.425375,1,1,0
3,d3cb6b85-7286-4b09-9f6f-349ad8e34bd1,88,3,7.315375,19.931085,False,38.752199,191.811289,174.858648,375.127417,...,0,False,0,122,1,False,30.952378,0,1,0
4,d6b091f4-099a-4f85-8417-2cb7110e4f93,77,3,6.037814,32.591481,True,32.477083,118.043431,231.507811,385.517466,...,0,False,0,113,0,True,63.069273,1,1,1


# 4. Models

In [3]:
# Features are every column between the first one (PatientID in the displayed
# head) and the last one (Diagnosis, moved there during data preparation).
X = train.iloc[:, 1:-1]
Y = train['Diagnosis']

# Object-typed columns are ordinal-encoded; every remaining column is standardised.
object_columns = make_column_selector(dtype_include=object)
transformer = make_column_transformer(
    (OrdinalEncoder(), object_columns),
    remainder=StandardScaler(),
)

# Preprocessing and the classifier are fitted together, so each CV fold
# fits the encoders/scalers on its own training part only.
classifier = CatBoostClassifier(random_state=0, verbose=500)
pipeline = make_pipeline(transformer, classifier)

# 3-fold cross-validation scored with ROC AUC.
scores = cross_val_score(
    pipeline,
    X,
    Y,
    cv=3,
    scoring='roc_auc',
)

print(scores.mean())

Learning rate set to 0.010821
0:	learn: 0.6861625	total: 153ms	remaining: 2m 32s
500:	learn: 0.1519099	total: 791ms	remaining: 788ms
999:	learn: 0.0759545	total: 1.98s	remaining: 0us
Learning rate set to 0.010825
0:	learn: 0.6862466	total: 1.93ms	remaining: 1.93s
500:	learn: 0.1331418	total: 1.86s	remaining: 1.85s
999:	learn: 0.0632248	total: 3.62s	remaining: 0us
Learning rate set to 0.010825
0:	learn: 0.6858987	total: 4.88ms	remaining: 4.88s
500:	learn: 0.1364003	total: 1.84s	remaining: 1.83s
999:	learn: 0.0625735	total: 3.47s	remaining: 0us
0.9682119571508995


In [4]:
# Refit the whole pipeline (preprocessing + CatBoost) on all training rows.
best_model = pipeline.fit(X, Y)

# Select the test features by the training column names instead of by position:
# this guarantees the model receives exactly the columns it was fitted on, in the
# same order, and fails loudly with a KeyError if the test frame lacks one of them.
predictions = best_model.predict(test[X.columns])

Learning rate set to 0.01287
0:	learn: 0.6851633	total: 3.69ms	remaining: 3.69s
500:	learn: 0.1314808	total: 1.98s	remaining: 1.97s
999:	learn: 0.0678167	total: 3.93s	remaining: 0us


# 5. Submission

In [5]:
# ProcessData already stored these two scores on `test` using the same formulas
# (Hypertension/Diabetes/BMI > 30 and Smoking/AlcoholConsumption > 2/PhysicalActivity < 1),
# so reuse the columns instead of repeating the thresholds here, where they could drift apart.
task1 = test['CardiometabolicRiskScore']
task2 = test['LifestyleRiskIndex']

In [6]:
# Build one frame per subtask; all three share the same three-column layout and
# differ only in the subtask label and the answers they carry.
# NOTE(review): the "PatientID " key has a trailing space - confirm it matches the
# header expected by whatever consumes submission.csv before changing it.
df_task1, df_task2, df_task3 = (
    pd.DataFrame({
        "subtaskID": [subtask] * len(test),
        "PatientID ": test.PatientID,
        "Answer": answers,
    })
    for subtask, answers in (
        ('Task1', task1),
        ('Task2', task2),
        ('Task3', predictions),
    )
)

# Stack the subtasks vertically into a single long table.
submission = pd.concat([df_task1, df_task2, df_task3])
submission.head()

Unnamed: 0,subtaskID,PatientID,Answer
0,Task1,a00fa494-651e-4674-8fb8-aa006b14cbf7,0
1,Task1,f31cda69-4e3f-4265-928c-3fe9b83f0896,1
2,Task1,e4d181d2-803a-42f5-8065-239dd591fb9d,0
3,Task1,c585efb2-37e3-4620-aeda-a2299191d3a9,2
4,Task1,74fbfa2b-901e-4028-b325-9aaaa07cbf4d,0


In [7]:
# Write the stacked answers to disk; index=False keeps the row index out of the file.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)