In [10]:
# --- 1. Install Required Libraries ---
import subprocess
import sys
import os 
import zipfile 

try:
    # Check for all required libraries
    import numpy
    import pandas
    import matplotlib
    import seaborn
    import sklearn
    import tqdm
    import librosa
    import xgboost
    import joblib
    print("All required libraries are already installed.")
except ImportError:
    print("Installing required libraries...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "librosa", "xgboost", "scikit-learn", "tqdm", "seaborn", "pandas", "numpy", "matplotlib", "joblib"])

print("All libraries are ready.")

# --- 2. Import All Libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm 
import librosa
import xgboost 
import joblib 

print("All libraries imported successfully.")


# --- 3. Unzip Datasets (Recursive) ---

def find_file(filename, search_paths):
    for path in search_paths:
        full_path = os.path.join(path, filename)
        if os.path.exists(full_path):
            return full_path
    return None

search_locations = ['.', '..', '../backend', 'backend']

# 3.1 Dataset 1 (Original)
zip_name_1 = 'combined_dastaset_1.zip'
zip_path_1 = find_file(zip_name_1, search_locations)
data_dir_1 = 'parkinsons_multimodal_data'

if zip_path_1 and not os.path.exists(data_dir_1):
    print(f"Unzipping '{zip_name_1}'...")
    with zipfile.ZipFile(zip_path_1, 'r') as zip_ref:
        zip_ref.extractall(data_dir_1)
elif os.path.exists(data_dir_1):
    print(f"Dataset 1 '{data_dir_1}' already exists.")

# 3.2 Dataset 2 (audio_2.zip)
zip_name_2 = 'audio_2.zip'
zip_path_2 = find_file(zip_name_2, search_locations)
data_dir_2 = 'parkinsons_audio_data_2'

if zip_path_2:
    if not os.path.exists(data_dir_2):
        print(f"Unzipping '{zip_name_2}'...")
        try:
            with zipfile.ZipFile(zip_path_2, 'r') as zip_ref:
                zip_ref.extractall(data_dir_2)
            print(f"Dataset 2 unzipped successfully.")
            
            # --- RECURSIVE UNZIP START ---
            print("Scanning for nested zip files inside Dataset 2...")
            for root, dirs, files in os.walk(data_dir_2):
                for file in files:
                    if file.lower().endswith('.zip'):
                        nested_zip_path = os.path.join(root, file)
                        print(f"  Found nested zip: {file}. Unzipping...")
                        try:
                            with zipfile.ZipFile(nested_zip_path, 'r') as z:
                                z.extractall(root)
                            print(f"  ✅ Unzipped {file}")
                        except Exception as e:
                            print(f"  ❌ Failed to unzip {file}: {e}")
            # --- RECURSIVE UNZIP END ---
            
        except Exception as e:
            print(f"Error unzipping {zip_name_2}: {e}")
    else:
        print(f"Dataset 2 directory '{data_dir_2}' already exists.")
else:
    print(f"WARNING: '{zip_name_2}' not found in {search_locations}. Please ensure it is in your project folder.")

print("--- Data setup complete ---")


# --- 4. Load Audio File Paths and Labels ---
all_audio_paths = []
all_labels = []

# Load Dataset 1 (Known Structure)
print("\nLoading Dataset 1...")
ds1_base = os.path.join(data_dir_1, 'Parkinson Multi Model DATASET')
ds1_map = {
    "Healthy/AUDIO 1 HEALTHY": 0, "Healthy/AUDIO 2 HEALTHY": 0,
    "Unhealthy/AUDIO 1 UNHEALTHY": 1, "Unhealthy/AUDIO 2 UNHEALTHY": 1
}
initial_count = len(all_audio_paths)
for folder, label in ds1_map.items():
    path = os.path.join(ds1_base, folder)
    if os.path.exists(path):
        for f in os.listdir(path):
            if f.lower().endswith('.wav'):
                all_audio_paths.append(os.path.join(path, f))
                all_labels.append(label)
print(f"Added {len(all_audio_paths) - initial_count} files from Dataset 1.")

# Load Dataset 2 (Smart Scan)
print("\nLoading Dataset 2 (Scanning for files)...")
initial_count_ds2 = len(all_audio_paths)

if os.path.exists(data_dir_2):
    for root, dirs, files in os.walk(data_dir_2):
        wav_files = [f for f in files if f.lower().endswith('.wav')]
        if not wav_files: continue
            
        folder_name = os.path.basename(root).lower()
        label = None
        
        # Smart Labeling Keywords
        if any(x in folder_name for x in ['non', 'healthy', 'control', 'hc', 'normal']):
            label = 0
        elif any(x in folder_name for x in ['pd', 'parkinson', 'unhealthy', 'positive']):
            label = 1
            
        if label is not None:
            print(f"  Found {len(wav_files)} files in '{folder_name}' -> Labeled as {label}")
            for f in wav_files:
                all_audio_paths.append(os.path.join(root, f))
                all_labels.append(label)
else:
    print("Dataset 2 folder not found.")

count_ds2 = len(all_audio_paths) - initial_count_ds2
print(f"Added {count_ds2} files from Dataset 2.")

if len(all_audio_paths) == 0:
    print("\n❌ ERROR: No audio files found! Check your paths/zip files.")
else:
    print(f"\n✅ Total Valid Audio Files: {len(all_audio_paths)}")


# --- 5. Feature Extraction ---
def extract_features(file_path):
    try:
        y, sr = librosa.load(file_path, duration=30, sr=None)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
        mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
        contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
        tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr).T, axis=0)
        spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr).T, axis=0)
        spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr).T, axis=0)
        rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr).T, axis=0)
        zcr = np.mean(librosa.feature.zero_crossing_rate(y).T, axis=0)
        return np.concatenate((mfccs, chroma, mel, contrast, tonnetz, spec_cent, spec_bw, rolloff, zcr))
    except Exception as e:
        return None

print("\nStarting Feature Extraction...")
feature_list = []
label_list = []

for path, label in tqdm(zip(all_audio_paths, all_labels), total=len(all_audio_paths)):
    feats = extract_features(path)
    if feats is not None:
        feature_list.append(feats)
        label_list.append(label)

print(f"Extracted features for {len(feature_list)} files.")


# --- 6. Train & Save ---
if len(feature_list) > 0:
    X = np.array(feature_list)
    y = np.array(label_list)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    print("\nTraining Random Forest...")
    model = RandomForestClassifier(n_estimators=200, random_state=42)
    model.fit(X_train_scaled, y_train)
    
    acc = accuracy_score(y_test, model.predict(X_test_scaled))
    print(f"\n✅ New Model Accuracy: {acc*100:.2f}%")
    
    # SAVE TO CURRENT DIRECTORY with NEW NAME
    joblib.dump(model, 'audio_model.joblib')
    joblib.dump(scaler, 'audio_scaler.joblib')
    
    print("\n✅ SUCCESS! Models saved to current folder:")
    print("   - audio_model.joblib")
    print("   - audio_scaler.joblib")
    print("\nNEXT STEP: Run 'setup_models.py' to move these to your backend.")
else:
    print("❌ Extraction failed. No models saved.")

All required libraries are already installed.
All libraries are ready.
All libraries imported successfully.
Dataset 1 'parkinsons_multimodal_data' already exists.
Dataset 2 directory 'parkinsons_audio_data_2' already exists.
--- Data setup complete ---

Loading Dataset 1...
Added 73 files from Dataset 1.

Loading Dataset 2 (Scanning for files)...
  Found 41 files in 'hc_ah' -> Labeled as 0
  Found 40 files in 'pd_ah' -> Labeled as 1
Added 81 files from Dataset 2.

✅ Total Valid Audio Files: 154

Starting Feature Extraction...


100%|██████████| 154/154 [06:37<00:00,  2.58s/it]


Extracted features for 73 files.

Training Random Forest...

✅ New Model Accuracy: 80.00%

✅ SUCCESS! Models saved to current folder:
   - audio_model.joblib
   - audio_scaler.joblib

NEXT STEP: Run 'setup_models.py' to move these to your backend.
