In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold

# Sample data (replace this with your actual data loading method)
data = pd.read_csv('parkinsons_updrs.data')



df = pd.DataFrame(data)

# Define features and labels
X = df.drop(columns=['subject#', 'motor_UPDRS', 'total_UPDRS'])
y_motor = df['motor_UPDRS']
y_total = df['total_UPDRS']


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_total, train_size=0.7, random_state=42)

X_train_motor, X_test_motor, y_train_motor, y_test_motor = train_test_split(X, y_motor, train_size=0.7, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (4112, 19)
Test set size: (1763, 19)


In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.kernel_ridge import KernelRidge




In [4]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the number of components to try
M_values = [5, 10, 15, X_scaled.shape[1]]

# Initialize the Kernel Ridge model with Gaussian kernel
kr = KernelRidge(kernel='rbf')

# Function to evaluate the model
def evaluate_model(X, y, M_values):
    results = {}
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for M in M_values:
        if M != X.shape[1]:
            pca = PCA(n_components=M)
            X_transformed = pca.fit_transform(X)
        else:
            X_transformed = X  # Use original features for M = p
        
        train_scores = []
        test_scores = []
        
        for train_index, test_index in kf.split(X_transformed):
            X_train, X_test = X_transformed[train_index], X_transformed[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            kr.fit(X_train, y_train)
            y_train_pred = kr.predict(X_train)
            y_test_pred = kr.predict(X_test)
            
            train_scores.append(r2_score(y_train, y_train_pred))
            test_scores.append(r2_score(y_test, y_test_pred))
        
        results[M] = {
            'train_score': np.mean(train_scores),
            'test_score': np.mean(test_scores)
        }
    
    return results

# Evaluate the model for both motor_UPDRS and total_UPDRS
motor_results = evaluate_model(X_scaled, y_motor.values, M_values)
total_results = evaluate_model(X_scaled, y_total.values, M_values)



In [6]:

print("Motor UPDRS:",motor_results)
print("Total UPDRS:",total_results)

Motor UPDRS: {5: {'train_score': 0.42200632813191863, 'test_score': 0.34095385341578466}, 10: {'train_score': 0.6041058352895777, 'test_score': 0.5256389158734461}, 15: {'train_score': 0.5777291650975496, 'test_score': 0.5132130896672098}, 19: {'train_score': 0.546194314349432, 'test_score': 0.49262843286854513}}
Total UPDRS: {5: {'train_score': 0.4126732178822743, 'test_score': 0.3307567084394762}, 10: {'train_score': 0.5818162165472062, 'test_score': 0.49994642800100086}, 15: {'train_score': 0.5575531316496263, 'test_score': 0.49050823181220354}, 19: {'train_score': 0.5258279624821681, 'test_score': 0.4701436393926569}}


In [7]:
from sklearn.neural_network import MLPRegressor




# Define features and labels
X = df.drop(columns=['subject#', 'motor_UPDRS', 'total_UPDRS'])
y = df[['motor_UPDRS', 'total_UPDRS']]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.7, random_state=42)

# Function to train and evaluate the model
def train_and_evaluate(hidden_layer_sizes, activation, solver, alpha, learning_rate):
    # Define the MLPRegressor model
    mlp = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=2000,
        random_state=42,
        early_stopping=False
    )
    
    # Train the model
    mlp.fit(X_train, y_train)
    
    # Predict on training and test sets
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    # Calculate R2 scores
    train_r2 = r2_score(y_train, y_train_pred, multioutput='uniform_average')
    test_r2 = r2_score(y_test, y_test_pred, multioutput='uniform_average')
    
    return train_r2, test_r2

# Example parameters (tune these by trial and error)
hidden_layer_sizes = (50,)  # Single hidden layer with 50 neurons
activation = 'relu'
solver = 'adam'
alpha = 0.001  # L2 regularization parameter
learning_rate = 'constant'

train_r2, test_r2 = train_and_evaluate(hidden_layer_sizes, activation, solver, alpha, learning_rate)

print("Training R2:", train_r2)
print("Test R2:", test_r2)

# Loop through different parameter combinations to find the best architecture
results = []
for hidden_layer_sizes in [(50,), (100,), (50, 50)]:
    for activation in ['relu', 'tanh']:
        for solver in ['adam', 'sgd']:
            for alpha in [0.0001, 0.001, 0.01]:
                for learning_rate in ['constant', 'adaptive']:
                    train_r2, test_r2 = train_and_evaluate(hidden_layer_sizes, activation, solver, alpha, learning_rate)
                    results.append({
                        'hidden_layer_sizes': hidden_layer_sizes,
                        'activation': activation,
                        'solver': solver,
                        'alpha': alpha,
                        'learning_rate': learning_rate,
                        'train_r2': train_r2,
                        'test_r2': test_r2
                    })

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results)


# Display the best parameters based on test R2 score
best_params = results_df.loc[results_df['test_r2'].idxmax()]
print("Best Parameters:", best_params)

Training R2: 0.721963689305537
Test R2: 0.6469816831608963




Best Parameters: hidden_layer_sizes    (50, 50)
activation                tanh
solver                     sgd
alpha                     0.01
learning_rate         constant
train_r2              0.976593
test_r2               0.850079
Name: 70, dtype: object


In [8]:

# Function to train and evaluate the model with early stopping
def train_and_evaluate_with_early_stopping(hidden_layer_sizes, activation, solver, alpha, learning_rate, validation_fraction):
    # Define the MLPRegressor model
    mlp = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=2000,
        random_state=42,
        early_stopping=True,
        validation_fraction=validation_fraction
    )
    
    # Train the model
    mlp.fit(X_train, y_train)
    
    # Predict on training and test sets
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    # Calculate R2 scores
    train_r2 = r2_score(y_train, y_train_pred, multioutput='uniform_average')
    test_r2 = r2_score(y_test, y_test_pred, multioutput='uniform_average')
    
    return train_r2, test_r2

# Example parameters based on the best parameters from the previous part
hidden_layer_sizes = (50,50)  # Single hidden layer with 50 neurons
activation = 'tanh'
solver = 'sgd'
alpha = 0.01  # L2 regularization parameter
learning_rate = 'constant'
validation_fraction = 0.1  # Default validation fraction

train_r2, test_r2 = train_and_evaluate_with_early_stopping(hidden_layer_sizes, activation, solver, alpha, learning_rate, validation_fraction)

print("Training R2 with early stopping:", train_r2)
print("Test R2 with early stopping:", test_r2)

# Experiment with different validation fractions to see if a better model can be obtained
results = []
for validation_fraction in [0.1, 0.2, 0.3]:
    train_r2, test_r2 = train_and_evaluate_with_early_stopping(hidden_layer_sizes, activation, solver, alpha, learning_rate, validation_fraction)
    results.append({
        'validation_fraction': validation_fraction,
        'train_r2': train_r2,
        'test_r2': test_r2
    })

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results)


# Display the best parameters based on test R2 score
best_params = results_df.loc[results_df['test_r2'].idxmax()]
print("Best Parameters with Early Stopping:", best_params)

Training R2 with early stopping: 0.802086156122523
Test R2 with early stopping: 0.7329439538341391
Best Parameters with Early Stopping: validation_fraction    0.300000
train_r2               0.911098
test_r2                0.823081
Name: 2, dtype: float64
