In [1]:
import pandas as pd


In [2]:
import mlflow

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import mlflow.sklearn
from sklearn.metrics import classification_report,accuracy_score
import joblib

In [4]:
# Load the raw dataset from the local CSV export into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs on the author's machine as written.
df = pd.read_csv(
    r"C:\Users\USER\OneDrive\Documents\ML projects\parkinson data\parkinsons_disease_data.csv"
)

In [5]:
# Number of rows that are exact duplicates of an earlier row (0 => all rows unique).
# Summing the boolean mask directly gives the real count; reducing it with
# `.any()` first would cap the result at 1.
df.duplicated().sum()

np.int64(0)

In [6]:
# Remove every row that contains at least one missing value.
# This mutates `df` in place, so the frame loaded above is no longer available afterwards.
df.dropna(axis="index", how="any", inplace=True)

In [7]:
# Work on an independent (deep) copy so later steps never touch `df` itself.
data = df.copy(deep=True)

In [8]:
# Target vector: the `status` column.
y = data["status"]

# Feature matrix: every column except the target and the `name` column.
X = data.drop(columns=["name", "status"])

In [9]:
# Point MLflow at the relative path "./mlruns" for tracking data.
# The path is relative, so it resolves against the kernel's current working
# directory; start the notebook from the same directory to reuse earlier runs.
mlflow.set_tracking_uri("./mlruns")

def _log_feature_importance_plot(model, feature_names, path="feature_importance.png"):
    """Save a bar chart of ``model.feature_importances_`` to ``path`` and log it to MLflow.

    The figure is closed in a ``finally`` block so that a failure while saving
    or logging does not leave an open matplotlib figure behind in the kernel.
    """
    positions = list(range(len(feature_names)))
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.bar(positions, model.feature_importances_)
        ax.set_xticks(positions)
        ax.set_xticklabels(feature_names, rotation=45)
        ax.set_title("Feature Importances")
        ax.set_xlabel("Features")
        ax.set_ylabel("Importance")
        fig.tight_layout()
        fig.savefig(path)
        mlflow.log_artifact(path)
    finally:
        plt.close(fig)


def _log_classification_report(model, X_test, y_test, path="classification_report.txt"):
    """Write the classification report for ``model`` on the test split to ``path`` and log it to MLflow."""
    report = classification_report(y_test, model.predict(X_test))
    with open(path, "w", encoding="utf-8") as report_file:
        report_file.write(report)
    mlflow.log_artifact(path)


def train_model(X, y):
    """Tune, evaluate and register a random-forest classifier inside one MLflow run.

    Steps, all recorded under the "parkison_complete" experiment:

    1. Split ``X``/``y`` 80/20 with ``random_state=42``.
    2. Grid-search a ``RandomForestClassifier`` over 256 parameter
       combinations with 5-fold cross-validation, scored by accuracy.
    3. Log the split sizes, the winning hyperparameters, and the train / test /
       cross-validation accuracies.
    4. Log the best estimator (with an inferred signature and a 5-row input
       example) and register it as "parkinson classifier".
    5. Log a feature-importance bar chart and a text classification report.
       Both files are also written to the current working directory.

    Parameters
    ----------
    X : feature matrix. If it exposes ``.columns`` those names label the
        importance chart; otherwise generic ``Feature_<i>`` labels are used.
    y : target labels aligned with the rows of ``X``.

    Returns
    -------
    None. Results are logged to MLflow; the test accuracy and run id are printed.
    """
    # Function-scope import, as in the original, hoisted to the top so the
    # dependency is visible before any work starts.
    from mlflow.models.signature import infer_signature

    # NOTE(review): "parkison" looks like a misspelling of "parkinson". The name
    # is left unchanged because a different name selects a different experiment.
    mlflow.set_experiment("parkison_complete")

    with mlflow.start_run() as run:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Log data information
        mlflow.log_params(
            {
                "train_samples": X_train.shape[0],
                "test_samples": X_test.shape[0],
                "features": X_train.shape[1],
            }
        )

        # Hyperparameter tuning: exhaustive search over 4 * 4 * 4 * 2 * 2 = 256 candidates.
        param_grid = {
            "n_estimators": [50, 100, 150, 200],
            "max_depth": [5, 10, 15, 20],
            "min_samples_split": [2, 5, 10, 15],
            "max_features": ["sqrt", "log2"],
            "criterion": ["entropy", "gini"],
        }
        rf = RandomForestClassifier(random_state=42)
        grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, scoring="accuracy")
        grid_search.fit(X_train, y_train)

        # Log the best params in a single batched call.
        mlflow.log_params(grid_search.best_params_)

        best_model = grid_search.best_estimator_

        train_accuracy = best_model.score(X_train, y_train)
        test_accuracy = best_model.score(X_test, y_test)

        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("cv_score", grid_search.best_score_)

        signature = infer_signature(X_train, best_model.predict(X_train))

        mlflow.sklearn.log_model(
            best_model,
            "random_forest_model",
            signature=signature,
            input_example=X_train[:5],
            registered_model_name="parkinson classifier",
        )

        # Use real column names when X provides them; fall back to positional labels.
        if hasattr(X, "columns"):
            feature_names = list(X.columns)
        else:
            feature_names = [f"Feature_{i}" for i in range(X.shape[1])]

        # Additional artifacts: feature-importance chart and classification report.
        _log_feature_importance_plot(best_model, feature_names)
        _log_classification_report(best_model, X_test, y_test)

        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Run ID: {run.info.run_id}")

# Run the full training pipeline on the prepared features and target,
# but only when this code executes as the main module.
if __name__ == "__main__":
    train_model(X, y)

Registered model 'parkinson classifier' already exists. Creating a new version of this model...
Created version '2' of model 'parkinson classifier'.


Test Accuracy: 0.9211
Run ID: 454aaea90ebb4f49b0fe499411948168
