In [None]:
!pip install -r "https://raw.githubusercontent.com/DSKI23/alzheimervalentin/refs/heads/main/requirements.txt"

In [None]:
from huggingface_hub import HfApi
from huggingface_hub import notebook_login, login
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import wandb

# Checking environment variables so that GitHub Actions can execute the notebook as well.

In [None]:

# Random seed: read from the `state` environment variable when it is set, otherwise 42.
state = int(os.environ.get('state', 42))
np.random.seed(state)

# Hugging Face authentication: interactive prompt unless a token is supplied via the environment.
if 'huggingface_token' not in os.environ:
    notebook_login()
else:
    login(token=os.environ['huggingface_token'])

# Weights & Biases authentication: same pattern as above.
if 'wandb_api_key' not in os.environ:
    wandb.login()
else:
    wandb.login(key=os.environ['wandb_api_key'])

# 1. Load the dataset from Hugging Face

In [None]:
# Fetch the dataset and keep the handle in `dataset`; `load_data()` below reads from it.
dataset = load_dataset(path="DS23-KI-Projekt/alzheimerdataset_split")

# Materialize the "train" split as a DataFrame and preview its first rows.
df = dataset["train"].to_pandas()
df.head(n=5)


# 2. Zielvariable und Features

In [None]:
def load_data(data=None, target="Alzheimer’s Diagnosis"):
    """Split the training data into a feature frame and a target column.

    Args:
        data: Object whose ``"train"`` entry provides ``to_pandas()``. When
            omitted, the notebook-level ``dataset`` variable is used.
        target: Name of the label column. Every other column is treated as a
            feature.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds all columns except ``target`` and
        ``y`` is the ``target`` column.

    Raises:
        RuntimeError: If the data cannot be read or the target column is
            missing. The original exception is attached as ``__cause__``.
    """
    try:
        source = dataset if data is None else data
        df = source["train"].to_pandas()
        features = [col for col in df.columns if col != target]
        return df[features], df[target]
    except Exception as e:
        # Raise instead of calling the interactive `exit` helper so the failure
        # propagates as a normal exception with its cause preserved.
        raise RuntimeError(f"Error loading dataset: {e}") from e

def train_rf(X_train, X_test, y_train, y_test, n_estimators, random_state=42):
    """Fit a random forest and evaluate it on the held-out split.

    Args:
        X_train: Training features.
        X_test: Evaluation features.
        y_train: Training labels.
        y_test: Evaluation labels.
        n_estimators: Number of trees in the forest.
        random_state: Seed passed to ``RandomForestClassifier``. Defaults to 42.

    Returns:
        A tuple ``(model, accuracy, class_report, cfm)``: the fitted model,
        its accuracy on the test split, the classification report as a nested
        dict, and the confusion matrix as a nested list.
    """
    rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Dict / list forms are used so the results can be passed to wandb.log.
    class_report = classification_report(y_test, y_pred, output_dict=True)
    cfm = confusion_matrix(y_test, y_pred).tolist()

    return rf_model, accuracy, class_report, cfm


def main():
    """Train forests of increasing size, log each to WandB, and save the best one.

    Uses the notebook-level ``state`` seed for both the train/test split and
    the forests. The best model (highest test accuracy, earliest on ties) is
    pickled to ``random_forest_alzheimer/best_rf_model.pkl``.

    Returns:
        The fitted model with the highest test accuracy.
    """
    wandb.init(entity="zhannalialko-dhbw-mosbach", project="alzheimer-rf", name="RF-Training")

    try:
        X, y = load_data()

        # 80% train / 20% test, stratified on the label.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=state
        )

        best_model = None
        # Start below any possible accuracy so the first model is always kept,
        # even if its accuracy is 0.
        best_accuracy = float("-inf")

        # Ten rounds ("epochs" in the logs): 10, 20, ..., 100 trees.
        for epoch in range(1, 11):
            n_estimators = epoch * 10

            rf_model, accuracy, class_report, cfm = train_rf(
                X_train, X_test, y_train, y_test, n_estimators, random_state=state
            )

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = rf_model

            wandb.log({
                "epoch": epoch,
                "accuracy": accuracy,
                "n_estimators": n_estimators,
                "classification_report": class_report,
                "confusion_matrix": cfm
            })

            print(f"Epoch {epoch} | n_estimators={n_estimators} | Accuracy={accuracy:.4f}")

        model_dir = "random_forest_alzheimer"
        os.makedirs(model_dir, exist_ok=True)

        with open(f"{model_dir}/best_rf_model.pkl", "wb") as file:
            pickle.dump(best_model, file)

        print(f"Best model saved with accuracy: {best_accuracy:.4f}")
    except BaseException:
        # Close the run as failed so it is not left open, then re-raise.
        wandb.finish(exit_code=1)
        raise

    wandb.finish()

    return best_model
# Run the training pipeline; the returned model is kept in `best_model` for the export cell below.
best_model = main()

In [None]:
# Ensure the export directory exists. `exist_ok=True` replaces the separate
# exists-check and matches how `main()` creates the same directory.
os.makedirs('random_forest_alzheimer', exist_ok=True)

# Pickle the best model into the working directory under the file name the upload cell uses.
with open('random_forest_alzheimer.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
# File name of the pickled model; used both as the local path and as the path inside each repo.
model_filename = "random_forest_alzheimer.pkl"

# Primary repository on Hugging Face (adjust as needed).
hf_repo_name = "DS23-KI-Projekt/KI-Modell"

# Create the primary repository if it does not exist yet.
api = HfApi()
api.create_repo(repo_id=hf_repo_name, exist_ok=True)

# Upload the same file to every target repository (one call per repo instead of copy-pasted calls).
# NOTE(review): the second repository is not created here and is uploaded with the default
# repo type; confirm that it already exists and that the default type is correct for it.
for target_repo in (hf_repo_name, "DS23-KI-Projekt/alzheimers-screening-assistant"):
    api.upload_file(
        path_or_fileobj=model_filename,  # local model file
        path_in_repo=model_filename,  # file name inside the repo
        repo_id=target_repo,
        commit_message="Upload des Random Forest Modells für Alzheimer-Diagnose"
    )

print(f" Modell hochgeladen auf Hugging Face: https://huggingface.co/{hf_repo_name}")
