### Step 1: Import libraries
This cell imports the main Python libraries:
- pandas and numpy for handling data
- scikit-learn for machine learning
- matplotlib for visualizing the results

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### Step 2: Define dataset path
Make sure the folder name matches the one shown on the right sidebar after adding the dataset.

In [None]:
# Folder that holds the competition CSV files; the loading step builds each
# file path by using this string as a prefix.
data_path = '/kaggle/input/amp-parkinsons-disease-progression-prediction/'

### Step 3: Load clinical, peptide, and protein CSV files
These files contain patient visit info and molecular measurements.

In [None]:
# Read the three training tables from the dataset folder. The order of the
# file names matches the order of the variables on the left-hand side.
clinical_data, peptides_data, proteins_data = (
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        'train_clinical_data.csv',
        'train_peptides.csv',
        'train_proteins.csv',
    )
)

### Step 4: Aggregate peptide and protein features
- Count how many peptides/proteins were detected
- Calculate their average measurement per visit

In [None]:
def _count_and_mean_per_visit(measurements, value_column, prefix):
    """Summarise one measurement column per visit.

    Groups `measurements` by `visit_id` and returns a DataFrame with the
    columns `visit_id`, `<prefix>_count` (number of non-missing values of
    `value_column`) and `<prefix>_mean` (their average).
    """
    output_columns = {f'{prefix}_count': 'count', f'{prefix}_mean': 'mean'}
    per_visit = measurements.groupby('visit_id')[value_column]
    return per_visit.agg(**output_columns).reset_index()


# Peptides: how many PeptideAbundance readings each visit has, and their mean
peptides_agg = _count_and_mean_per_visit(peptides_data, 'PeptideAbundance', 'pep')

# Proteins: how many NPX readings each visit has, and their mean
proteins_agg = _count_and_mean_per_visit(proteins_data, 'NPX', 'prot')

### Step 5: Merge aggregated features with clinical data
Now we have a full dataset with clinical and biological features.

In [None]:
# Left-join both per-visit summaries onto the clinical table so that every
# clinical row is kept; visits with no matching summary row get NaN in the
# added columns.
merged_data = (
    clinical_data
    .merge(peptides_agg, how='left', on='visit_id')
    .merge(proteins_agg, how='left', on='visit_id')
)

### Step 6: Clean the dataset
- Convert medication state to numbers
- Remove unused columns
- Drop rows that have no `updrs_1` target, then fill the remaining missing values with 0

In [None]:
# Encode the medication state as a number; any value other than 'On'/'Off'
# (including a missing one) becomes NaN at this point.
medication_column = 'upd23b_clinical_state_on_medication'
medication_codes = {'On': 1, 'Off': 0}
merged_data[medication_column] = merged_data[medication_column].map(medication_codes)

# Identifiers and the other UPDRS scores are not used as features or target
unused_columns = ['visit_id', 'patient_id', 'updrs_2', 'updrs_3', 'updrs_4']

# Rows without the target cannot be used, so they are removed first; every
# remaining gap is then filled with 0.
# NOTE(review): after fillna(0) a missing medication state is indistinguishable
# from 'Off', and a missing peptide/protein mean looks like a measured 0.
merged_data = (
    merged_data
    .drop(columns=unused_columns)
    .dropna(subset=['updrs_1'])
    .fillna(0)
)

### Step 7: Split into train and test sets
We use 80% of the data to train the model and 20% to test it.

In [None]:
# The column to predict (y); every other remaining column is a feature (X)
target_column = 'updrs_1'
y = merged_data[target_column]
X = merged_data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is by row, so if a patient has several visits they
# can land on both sides of the split - confirm whether that is acceptable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Step 8: Train the KNN model
We use K=5, meaning the model predicts each score by averaging the scores of the 5 nearest training visits.

In [None]:
# KNN picks neighbors by distance between feature rows. The features mix a 0/1
# medication flag with counts and raw abundance means, so without scaling the
# column with the largest numeric range would dominate the distance and the
# others would barely matter. Standardizing each feature first puts them on a
# comparable scale.
#
# The scaler is fitted inside the pipeline on the training data only, and the
# same transformation is re-applied automatically whenever the model predicts.
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=5),  # still K = 5 neighbors
)

# Train the scaler and the KNN regressor on the training data
knn_model.fit(X_train, y_train)

### Step 9: Evaluate the model
We use Mean Absolute Error (MAE) to measure accuracy. Lower is better.

In [None]:
# Run the fitted model on the held-out features
y_pred = knn_model.predict(X_test)

# MAE: the average absolute gap between the true and the predicted scores
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae:.2f}")

### Step 10: Plot predictions vs actual scores
Each dot is a patient visit from the test set.
- X-axis = real score
- Y-axis = predicted score
- Red line = perfect prediction

In [None]:
# Scatter of true (x) against predicted (y) UPDRS_1 for the held-out visits
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', edgecolors='k', alpha=0.6)

# Dashed red diagonal: points on it are predicted exactly. Its end points are
# taken from the full target column y, not only from the test split.
score_range = [y.min(), y.max()]
plt.plot(score_range, score_range, 'r--')

plt.xlabel("Actual UPDRS_1")
plt.ylabel("Predicted UPDRS_1")
plt.title("KNN Predictions vs. Actual UPDRS_1 Scores")