In [3]:
import pandas as pd
import numpy as np
import zipfile
import io # Used to read data from the zip file object

# --- 1. Corrected File Path and Archive Name ---
# The original file name is 'parkinsons.zip'.
# The nested CSV file path *within* the archive is:
# 'telemonitoring/parkinsons_updrs.data'

zip_file_name = 'parkinsons.zip'
nested_file_path = 'telemonitoring/parkinsons_updrs.data'

# --- 2. Load the Dataset Safely from the ZIP Archive ---
try:
    # 1. Open the zip file
    with zipfile.ZipFile(zip_file_name, 'r') as z:
        # 2. Open the nested file within the archive
        # The 'io.BytesIO' object allows pandas to read the file contents directly
        # from memory without extracting the file to disk first.
        with z.open(nested_file_path) as f:
            # 3. Read the file content directly into a DataFrame
            # The file is correctly read as a CSV since its contents are comma-separated.
            df = pd.read_csv(f)

    print(f"‚úÖ Data loaded successfully from '{nested_file_path}' inside '{zip_file_name}'. Shape: {df.shape}")

except FileNotFoundError:
    print(f"‚ùå ERROR: Zip file '{zip_file_name}' not found. Check if the file was uploaded to the correct location.")
except KeyError:
    print(f"‚ùå ERROR: CSV file '{nested_file_path}' not found inside '{zip_file_name}'. Check the exact file path inside the zip.")
    df = None

if df is not None:
    # ------------------------------------------------------------
    # --- 3. Initial Data Cleaning and Preparation (as requested) ---
    # ------------------------------------------------------------

    # Check for missing values (Nulls)
    print("\n--- Missing Value Count per Column ---")
    missing_values = df.isnull().sum()
    print(missing_values[missing_values > 0])
    # The original dataset info suggests no missing values, but checking is always good practice.

    # Check for and Remove Duplicate Rows
    duplicate_rows = df.duplicated().sum()
    if duplicate_rows > 0:
        print(f"\nüóëÔ∏è Found {duplicate_rows} duplicate rows. Removing...")
        df.drop_duplicates(inplace=True)
        print(f"New data shape after removing duplicates: {df.shape}")
    else:
        print("\n‚úÖ No duplicate rows found.")

    # Drop the 'subject#' column as it's an identifier and not typically used for training
    columns_to_drop = ['subject#']
    if all(col in df.columns for col in columns_to_drop):
        df_cleaned = df.drop(columns=columns_to_drop)
        print(f"\nüßπ Dropped columns: {columns_to_drop}")
    else:
        df_cleaned = df
        print("\nNote: Columns to drop were not found; no columns were dropped.")

    # --- 4. Final Data Summary ---
    print("\n--- Final Cleaned Data Info ---")
    print(f"Final Data Shape: {df_cleaned.shape}")
    print(df_cleaned.head())

    print("\nüéâ Data cleaning and loading is complete! The dataset is now in the 'df_cleaned' DataFrame.")

‚úÖ Data loaded successfully from 'telemonitoring/parkinsons_updrs.data' inside 'parkinsons.zip'. Shape: (5875, 22)

--- Missing Value Count per Column ---
Series([], dtype: int64)

‚úÖ No duplicate rows found.

üßπ Dropped columns: ['subject#']

--- Final Cleaned Data Info ---
Final Data Shape: (5875, 21)
   age  sex  test_time  motor_UPDRS  total_UPDRS  Jitter(%)  Jitter(Abs)  \
0   72    0     5.6431       28.199       34.398    0.00662     0.000034   
1   72    0    12.6660       28.447       34.894    0.00300     0.000017   
2   72    0    19.6810       28.695       35.389    0.00481     0.000025   
3   72    0    25.6470       28.905       35.810    0.00528     0.000027   
4   72    0    33.6420       29.187       36.375    0.00335     0.000020   

   Jitter:RAP  Jitter:PPQ5  Jitter:DDP  ...  Shimmer(dB)  Shimmer:APQ3  \
0     0.00401      0.00317     0.01204  ...        0.230       0.01438   
1     0.00132      0.00150     0.00395  ...        0.179       0.00994   
2     0.0020

In [4]:
# Assuming the 'df_cleaned' DataFrame from the previous step is available

# --- 1. Feature (X) and Target (y) Separation ---
# The goal is typically to predict one of the UPDRS scores.
# Let's choose 'total_UPDRS' as the target variable (y).

target_column = 'total_UPDRS'
features = df_cleaned.drop(columns=[target_column, 'motor_UPDRS'])
# Drop the target and the other highly correlated UPDRS score
# to make the prediction task more meaningful/challenging for the ML model.

X = features
y = df_cleaned[target_column]

print(f"\n‚úÖ Target Variable (y): '{target_column}'")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# --------------------------------------------------------------------------

# --- 2. Quick Exploratory Data Analysis (EDA) ---
# Check the distribution and correlation of variables.

print("\n--- Summary Statistics of Features ---")
print(X.describe().T)

# Optional: Check correlation with the target (uncomment if needed)
# print("\n--- Top 5 Features Correlated with total_UPDRS ---")
# print(df_cleaned.corr()[target_column].sort_values(ascending=False).head(6))

# --------------------------------------------------------------------------

# --- 3. Feature Scaling (Standardization) ---
# Most ML algorithms perform better when numerical input features are scaled.
# We will use StandardScaler to center the data around 0 with a unit standard deviation.

from sklearn.preprocessing import StandardScaler

print("\n‚öôÔ∏è Applying Feature Scaling (Standardization)...")

# Initialize the Scaler
scaler = StandardScaler()

# Fit the scaler to the features and transform the data
X_scaled_array = scaler.fit_transform(X)

# Convert the scaled array back into a DataFrame for easier viewing/handling
X_scaled = pd.DataFrame(X_scaled_array, columns=X.columns)

print("Scaled features head:")
print(X_scaled.head())

# --------------------------------------------------------------------------

# --- 4. Train-Test Split ---
# This step is crucial to prevent **overfitting**. We reserve a portion of the data
# for evaluation after the model has been trained.

from sklearn.model_selection import train_test_split

# Split the data into 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, # Use the scaled features!
    y,
    test_size=0.2,
    random_state=42 # Set a random state for reproducibility
)

print("\n--- Final Data Split Status ---")
print(f"X_train shape: {X_train.shape} (80% for training)")
print(f"X_test shape:  {X_test.shape} (20% for testing)")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape:  {y_test.shape}")

print("\nüéâ Data preparation is complete! You are now ready for model training.")


‚úÖ Target Variable (y): 'total_UPDRS'
Features (X) shape: (5875, 19)
Target (y) shape: (5875,)

--- Summary Statistics of Features ---
                count       mean        std        min        25%        50%  \
age            5875.0  64.804936   8.821524  36.000000  58.000000  65.000000   
sex            5875.0   0.317787   0.465656   0.000000   0.000000   0.000000   
test_time      5875.0  92.863722  53.445602  -4.262500  46.847500  91.523000   
Jitter(%)      5875.0   0.006154   0.005624   0.000830   0.003580   0.004900   
Jitter(Abs)    5875.0   0.000044   0.000036   0.000002   0.000022   0.000035   
Jitter:RAP     5875.0   0.002987   0.003124   0.000330   0.001580   0.002250   
Jitter:PPQ5    5875.0   0.003277   0.003732   0.000430   0.001820   0.002490   
Jitter:DDP     5875.0   0.008962   0.009371   0.000980   0.004730   0.006750   
Shimmer        5875.0   0.034035   0.025835   0.003060   0.019120   0.027510   
Shimmer(dB)    5875.0   0.310960   0.230254   0.026000   0.1750

In [5]:
# --- 5. Model Training and Evaluation ---

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

print("\nüöÄ Training Random Forest Regressor...")
# Train the Model
model.fit(X_train, y_train)
print("Training complete.")

# Make Predictions on the Test Set
y_pred = model.predict(X_test)

# Evaluate the Model Performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) # Root Mean Squared Error (RMSE) is more interpretable
r2 = r2_score(y_test, y_pred)

print("\n--- Model Evaluation ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} (Prediction error in UPDRS units)")
print(f"R-squared Score (R¬≤): {r2:.4f} (Closer to 1.0 is better)")

print("\nFinishing touch: Displaying Feature Importance")
# Show which features the model found most important
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
print(feature_importance.sort_values(ascending=False).head(5))


üöÄ Training Random Forest Regressor...
Training complete.

--- Model Evaluation ---
Mean Squared Error (MSE): 2.57
Root Mean Squared Error (RMSE): 1.60 (Prediction error in UPDRS units)
R-squared Score (R¬≤): 0.9768 (Closer to 1.0 is better)

Finishing touch: Displaying Feature Importance
age          0.639437
sex          0.088428
DFA          0.087588
test_time    0.067830
HNR          0.032121
dtype: float64


In [7]:
import pandas as pd
import numpy as np
import zipfile
import io
import joblib # Required for saving the model/scaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from google.colab import files # Required for downloading files

# --- A. Data Loading and Cleaning ---
print("--- A. Data Loading and Cleaning ---")
zip_file_name = 'parkinsons.zip'
nested_file_path = 'telemonitoring/parkinsons_updrs.data'

try:
    with zipfile.ZipFile(zip_file_name, 'r') as z:
        with z.open(nested_file_path) as f:
            df = pd.read_csv(f)
except Exception as e:
    print(f"Error loading data: {e}")
    df = None

if df is not None:
    # Remove duplicates
    df.drop_duplicates(inplace=True)
    # Drop identifier column
    df_cleaned = df.drop(columns=['subject#'])

    # --- B. Data Preparation ---
    print("--- B. Data Preparation ---")
    target_column = 'total_UPDRS'
    features = df_cleaned.drop(columns=[target_column, 'motor_UPDRS'])

    X = features
    y = df_cleaned[target_column]

    # Feature Scaling
    scaler = StandardScaler()
    X_scaled_array = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled_array, columns=X.columns)

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled,
        y,
        test_size=0.2,
        random_state=42
    )

    # --- C. Hyperparameter Tuning and Training ---
    print("--- C. Hyperparameter Tuning and Training ---")

    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [10, 20],
        'min_samples_split': [5, 10]
    }

    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        verbose=0,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_score = np.sqrt(-grid_search.best_score_)

    print(f"Best Parameters Found: {best_params}")
    print(f"Best CV RMSE: {best_score:.2f}")

    # Retrain the Final Production Model on ALL scaled data
    X_all = X_scaled
    y_all = y

    # *** This is where 'final_model' is defined! ***
    final_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
    final_model.fit(X_all, y_all)
    print("‚úÖ Final Model Training Complete on ALL Data.")

    # --- D. Save and Download Files ---
    print("\n--- D. Save and Download Files ---")

    # 1. Save the Trained Model
    model_filename = 'final_parkinsons_regressor.pkl'
    joblib.dump(final_model, model_filename)

    # 2. Save the Scaler
    scaler_filename = 'parkinsons_scaler.pkl'
    joblib.dump(scaler, scaler_filename)

    print(f"‚úÖ Model saved as: {model_filename}")
    print(f"‚úÖ Scaler saved as: {scaler_filename}")

    # 3. Prompt Download
    print("\nüöÄ Preparing files for download...")
    files.download(model_filename)
    files.download(scaler_filename)
    print("üéâ Two files should now be downloading to your local machine.")

else:
    print("\nCannot proceed with saving due to data loading error in section A.")

--- A. Data Loading and Cleaning ---
--- B. Data Preparation ---
--- C. Hyperparameter Tuning and Training ---
Best Parameters Found: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Best CV RMSE: 2.15
‚úÖ Final Model Training Complete on ALL Data.

--- D. Save and Download Files ---
‚úÖ Model saved as: final_parkinsons_regressor.pkl
‚úÖ Scaler saved as: parkinsons_scaler.pkl

üöÄ Preparing files for download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

üéâ Two files should now be downloading to your local machine.
