In [21]:
import joblib
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [22]:
def ingest_data():
    """Load the Iris dataset into a single DataFrame.

    Returns:
        pandas.DataFrame: one column per entry in the dataset's
        ``feature_names``, plus a ``target`` column holding the dataset's
        ``target`` array.
    """
    bunch = load_iris()
    measurements = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    # Append the labels as the final column so features and labels travel together.
    return measurements.assign(target=bunch.target)

In [23]:
def engineer_features(data):
    """Separate a labelled DataFrame into model inputs and labels.

    Args:
        data: DataFrame that contains a ``target`` column.

    Returns:
        tuple: ``(features, target)`` where ``features`` is a new DataFrame
        with every column except ``target`` and ``target`` is that column
        as a Series. The input frame is not modified.

    Raises:
        KeyError: if ``data`` has no ``target`` column.
    """
    label_column = 'target'
    labels = data[label_column]
    inputs = data.drop(columns=label_column)
    return inputs, labels

In [24]:
# Hyperparameter tuning using RandomizedSearchCV
def hyperparameter_tuning(X_train, y_train, n_iter=50, cv=3, random_state=42):
    """Search random-forest hyperparameters with a randomized cross-validated search.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        n_iter: Number of parameter settings sampled by the search.
        cv: Number of cross-validation folds.
        random_state: Seed used for both the parameter sampling and the
            forest itself, so repeated runs give the same result.

    Returns:
        The search's ``best_estimator_``: a ``RandomForestClassifier``
        configured with the best parameters found and fitted on
        ``X_train``/``y_train``.

    Side effects:
        Prints the best hyperparameters; the search itself logs progress
        because it runs with ``verbose=2``.
    """
    param_distributions = {
        'n_estimators': [50, 100, 200, 300, 400],  # Number of trees in the forest
        'max_depth': [5, 10, 15, 20, None],        # Maximum depth of the tree
        'min_samples_split': [2, 5, 10],           # Minimum number of samples required to split a node
        'min_samples_leaf': [1, 2, 4],             # Minimum number of samples required to be at a leaf node
        'max_features': ['sqrt', 'log2', None],    # 'auto' removed, valid options are 'sqrt', 'log2', or None
        'bootstrap': [True, False],                # Whether bootstrap samples are used when building trees
    }

    # Seed the forest as well as the search: an unseeded estimator makes the
    # CV scores (and therefore the selected parameters) vary between runs.
    rf = RandomForestClassifier(random_state=random_state)

    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        verbose=2,
        random_state=random_state,
        n_jobs=-1,  # use all available cores for the candidate fits
    )
    rf_random.fit(X_train, y_train)

    print(f"Best Hyperparameters: {rf_random.best_params_}")

    return rf_random.best_estimator_


In [25]:
# Model training
def train_model(features, target, test_size=0.2, random_state=42):
    """Split the data, tune a random forest on the training part, and report test accuracy.

    Args:
        features: Feature matrix for the whole dataset.
        target: Labels aligned with ``features``.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed for the train/test split.

    Returns:
        The fitted estimator returned by ``hyperparameter_tuning``.

    Side effects:
        Prints the accuracy of the returned model on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # hyperparameter_tuning returns the search's best_estimator_, which the
    # search has already refit on all of X_train (refit defaults to True).
    # Fitting it again would only repeat that work and, for an unseeded
    # forest, swap the selected model for a different random one.
    model = hyperparameter_tuning(X_train, y_train)

    # Evaluate on the rows the search never saw.
    predictions = model.predict(X_test)
    print(f"Model Accuracy after tuning: {accuracy_score(y_test, predictions)}")

    return model


In [26]:
def deploy_model(model, model_path):
    """Persist a model to disk with joblib and report where it went.

    Args:
        model: Object to serialize.
        model_path: Destination passed to ``joblib.dump``.

    Side effects:
        Writes the serialized model to ``model_path`` and prints a
        confirmation message.
    """
    joblib.dump(value=model, filename=model_path)
    print(f"Model saved to {model_path}")

In [27]:
def monitor_model(model, features, target):
    """Score a fitted model against labelled data and report the accuracy.

    Args:
        model: Fitted estimator exposing ``predict``.
        features: Feature matrix to predict on.
        target: True labels aligned with ``features``.

    Returns:
        The value returned by ``accuracy_score(target, predictions)``.

    Side effects:
        Prints the accuracy.
    """
    accuracy = accuracy_score(target, model.predict(features))
    print(f"Model Monitoring Accuracy: {accuracy}")
    return accuracy
  
# Example usage: run the pipeline end to end (load, split columns, tune/train, persist).
data = ingest_data()
features, target = engineer_features(data)
model = train_model(features, target)
deploy_model(model, 'model.pkl')

# Monitor model performance (using the same dataset for simplicity).
# NOTE: `features`/`target` are the full dataset, including the rows that
# train_model used for fitting, so this is not a held-out score.
# The call is left as the cell's last expression so the accuracy is also displayed.
monitor_model(model, features, target)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 15, 'bootstrap': True}
Model Accuracy after tuning: 1.0
Model saved to model.pkl
Model Monitoring Accuracy: 0.9733333333333334


0.9733333333333334