In [2]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.20.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.0 (from mlflow)
  Downloading mlflow_skinny-2.20.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting waitress<4 (from mlflow)
  Downloading waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.0->mlflow)
  Downloading databricks_sdk-0.41.0-py3-none-any.whl.metadata (38 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.20.0->mlflow)
  Downloading opentelemetry_api-1.29.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==2.20.0->mlflow)
  Downloading open

In [3]:
# Confirm that MLflow imports in this kernel and print the version in use.
import mlflow
print(mlflow.__version__)

2.20.0


In [6]:
import pickle
import warnings
from datetime import datetime

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow.models.signature import infer_signature
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Install a blanket "ignore" filter so library warnings do not clutter the
# tuning output further down.
warnings.filterwarnings("ignore")

# Read the CSV and reduce the frame to a plain NumPy array.
data = pd.read_csv("Forest_fire.csv").to_numpy()

# Column layout used here: the first column is skipped, the last column is the
# target, and everything in between is a feature. All values are cast to int.
# NOTE(review): read_csv already consumes the header line by default, so the
# `1:` row slice also discards the first data record -- confirm this is intended.
y = data[1:, -1].astype(int)      # Target
X = data[1:, 1:-1].astype(int)    # Features

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Candidate estimators and the hyper-parameter search space handed to
# RandomizedSearchCV for each of them.
#
# Each entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - parameter name -> list of choices or scipy.stats distribution
#
# Estimators that draw random numbers while fitting are seeded with the same
# random_state (42) used for the train/test split and for the search itself, so
# that re-running the notebook reproduces the same comparison.
#
# Distribution notes (scipy.stats conventions):
#   randint(low, high)  samples integers in [low, high - 1]
#   uniform(loc, scale) samples floats in [loc, loc + scale]
models = {
    'LogisticRegression': {
        # random_state only matters for solvers that shuffle the data
        # (liblinear here); it is ignored by lbfgs.
        'model': LogisticRegression(random_state=42),
        'params': {
            'solver': ['lbfgs', 'liblinear'],
            'max_iter': randint(100, 500),  # Max iterations: 100..499
            'C': uniform(0.1, 10.0)         # Inverse regularization strength: 0.1..10.1
        }
    },
    'RandomForest': {
        # Seeded: bootstrap sampling and feature sub-sampling are random.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': randint(10, 200),       # Number of trees: 10..199
            'max_depth': [None, 10, 20, 30],        # Maximum tree depth (None = unlimited)
            'criterion': ['gini', 'entropy'],       # Splitting criterion
            'min_samples_split': randint(2, 10),    # Minimum samples to split a node: 2..9
            'min_samples_leaf': randint(1, 10)      # Minimum samples in a leaf: 1..9
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': uniform(0.1, 10.0),       # Regularization parameter: 0.1..10.1
            'kernel': ['linear', 'rbf'],   # Kernel type
            'gamma': ['scale', 'auto']     # Kernel coefficient (used by the rbf kernel)
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': randint(3, 20),      # Number of neighbors: 3..19
            'weights': ['uniform', 'distance']  # Weight function
        }
    }
}

# One record per candidate model, filled in by the loop below.
results = []

# For every candidate: run a randomized hyper-parameter search on the training
# split, then score the refit winner on the held-out test split.
for model_name, config in models.items():
    print(f"Training and tuning {model_name}...")

    # 50 sampled settings, 5-fold CV, ranked by accuracy; fit() returns the
    # search object itself, so construction and fitting can be chained.
    random_search = RandomizedSearchCV(
        config['model'],
        config['params'],
        n_iter=50,
        cv=5,
        scoring='accuracy',
        random_state=42,
        verbose=0,
    ).fit(X_train, y_train)

    # Winning estimator (already refit on the full training split) and the
    # parameter setting that produced it.
    best_model, best_params = (
        random_search.best_estimator_,
        random_search.best_params_,
    )

    # Held-out evaluation: plain accuracy plus support-weighted F1.
    y_pred = best_model.predict(X_test)
    accuracy, f1 = (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='weighted'),
    )

    print(f"Best Parameters for {model_name}: {best_params}")
    print(f"Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")

    # Keep everything needed to compare the candidates afterwards.
    results.append(
        {
            'Model': model_name,
            'Best Model': best_model,
            'Best Parameters': best_params,
            'Accuracy': accuracy,
            'F1 Score': f1,
        }
    )

# Tabulate one row per tuned model and print the table as plain text.
results_df = pd.DataFrame(results)
print("\nComparison of Models:")
print(results_df)

# The winner is the record with the highest test-set accuracy; max() keeps the
# earliest record when several models tie.
best_result = max(results, key=lambda entry: entry['Accuracy'])

# Unpack the winning record into the names used for reporting and logging.
(
    best_model_name,
    best_model,
    best_params,
    best_accuracy,
    best_f1,
) = (
    best_result[field]
    for field in ('Model', 'Best Model', 'Best Parameters', 'Accuracy', 'F1 Score')
)

print(f"\nBest Model: {best_model_name}")
print(f"Accuracy: {best_accuracy:.4f}, F1 Score: {best_f1:.4f}")

# Record the winning model in MLflow under a dedicated experiment.
mlflow.set_experiment("Forest Fire Best Model")

# The run name embeds the model name and a second-resolution timestamp.
with mlflow.start_run(
    run_name=f"Best_Model_{best_model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
):
    # Tag, hyper-parameters and held-out metrics of the selected model.
    mlflow.set_tag("Model", best_model_name)
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", best_accuracy)
    mlflow.log_metric("f1_score", best_f1)

    # A small sample of training rows is stored as the input example, and the
    # signature is inferred from the training features and the model's own
    # predictions on them.
    input_example = X_train[:5]
    signature = infer_signature(X_train, best_model.predict(X_train))

    # Log the estimator as a run artifact and register it in the model
    # registry under a fixed name.
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path=f"{best_model_name}_model",
        signature=signature,
        input_example=input_example,
        registered_model_name="Forest_Fire_Best_Model",
    )

print("\nBest model logged to MLflow successfully!")

# Persist the selected model to model.pkl and read it straight back.
# `best_model` is the winning estimator chosen from `results` earlier in this
# cell; serializing it (instead of a name that only existed in a previous
# kernel session) keeps the cell runnable after Restart & Run All.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# NOTE: pickle.load can execute arbitrary code while deserializing. This is
# acceptable here because the file was written a few lines above; never load a
# model.pkl obtained from an untrusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)


Training and tuning LogisticRegression...
Best Parameters for LogisticRegression: {'C': 3.845401188473625, 'max_iter': 448, 'solver': 'lbfgs'}
Accuracy: 1.0000, F1-score: 1.0000
Training and tuning RandomForest...
Best Parameters for RandomForest: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 18}
Accuracy: 0.9167, F1-score: 0.9172
Training and tuning SVM...
Best Parameters for SVM: {'C': 3.845401188473625, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 1.0000, F1-score: 1.0000
Training and tuning KNN...
Best Parameters for KNN: {'n_neighbors': 4, 'weights': 'distance'}
Accuracy: 0.9167, F1-score: 0.9148

Comparison of Models:
                Model                                         Best Model  \
0  LogisticRegression  LogisticRegression(C=3.845401188473625, max_it...   
1        RandomForest  (DecisionTreeClassifier(criterion='entropy', m...   
2                 SVM          SVC(C=3.845401188473625, kernel='linear')   


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best model logged to MLflow successfully!


Registered model 'Forest_Fire_Best_Model' already exists. Creating a new version of this model...
Created version '2' of model 'Forest_Fire_Best_Model'.
