In [6]:
import numpy as np
import pandas as pd
import pickle
from flask import Flask, request, jsonify
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import os

In [5]:
pip install Flask

Collecting Flask
  Using cached flask-3.1.0-py3-none-any.whl (102 kB)
Collecting Werkzeug>=3.1
  Using cached werkzeug-3.1.3-py3-none-any.whl (224 kB)
Collecting click>=8.1.3
  Using cached click-8.1.8-py3-none-any.whl (98 kB)
Collecting itsdangerous>=2.2
  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Collecting Jinja2>=3.1.2
  Using cached jinja2-3.1.6-py3-none-any.whl (134 kB)
Collecting blinker>=1.9
  Using cached blinker-1.9.0-py3-none-any.whl (8.5 kB)
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl (15 kB)
Installing collected packages: MarkupSafe, Werkzeug, Jinja2, itsdangerous, click, blinker, Flask
Successfully installed Flask-3.1.0 Jinja2-3.1.6 MarkupSafe-3.0.2 Werkzeug-3.1.3 blinker-1.9.0 click-8.1.8 itsdangerous-2.2.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Test\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [7]:
# Step 1: Load dataset
# The Iris dataset bundled with scikit-learn keeps this example simple; a real
# project would typically read its data from a file instead.
data = load_iris()
X, y = data.data, data.target  # X: features, y: target variable

In [8]:
# Echo the feature matrix X (taken from data.data) as this cell's output.
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [9]:
# Echo the target vector y (taken from data.target) as this cell's output.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# Step 2: Split dataset into training and testing sets
# 20% of the samples are held out for evaluating the models; the remaining 80%
# are used for training. random_state is fixed so the split is the same on
# every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Step 3: Define models and parameter grids
# Each entry maps a model name to a (pipeline, param_grid) tuple. The grid keys
# use the 'clf__' prefix because the estimator is the pipeline step named 'clf'.
#
# The two tree ensembles are randomized algorithms. Without a fixed seed the
# selected hyperparameters (and the model that ends up being saved) can change
# from one run to the next, so both are seeded here.
RANDOM_STATE = 42

models = {
    "RandomForest": (Pipeline([
        ('clf', RandomForestClassifier(random_state=RANDOM_STATE))
    ]), {
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [None, 10, 20]
    }),
    "SVM": (Pipeline([
        ('scaler', StandardScaler()),  # StandardScaler is used to normalize features for SVM
        ('clf', SVC())
    ]), {
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf']
    }),
    "GradientBoosting": (Pipeline([
        ('clf', GradientBoostingClassifier(random_state=RANDOM_STATE))
    ]), {
        'clf__n_estimators': [50, 100, 200],
        'clf__learning_rate': [0.01, 0.1, 0.2]
    })
}

# Filled in by the grid-search step: model name -> best fitted pipeline.
best_models = {}

In [15]:
# Step 4: Perform Grid Search to find the best parameters
# Every parameter combination is scored by accuracy with 5-fold cross-validation
# on the training data, so this cell can take a while to finish.
for model_name, candidate in models.items():
    pipeline, param_grid = candidate
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available CPU cores
    )
    grid_search.fit(X_train, y_train)

    # Keep the refitted best pipeline for the evaluation step.
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")


Best parameters for RandomForest: {'clf__max_depth': 20, 'clf__n_estimators': 50}
Best parameters for SVM: {'clf__C': 0.1, 'clf__kernel': 'linear'}
Best parameters for GradientBoosting: {'clf__learning_rate': 0.01, 'clf__n_estimators': 50}


In [16]:
# Step 5: Evaluate all models to find the best performing one
# NOTE: the candidates are compared on the held-out test set, and the winner's
# accuracy on that same set is what gets reported, so the reported figure is an
# optimistic estimate of performance on unseen data.
if not best_models:
    # Fail with a clear message instead of a TypeError on best_model[0] below.
    raise ValueError("best_models is empty; run the grid search step first.")

best_model = None
# Start below any possible accuracy so that the first model evaluated always
# becomes the initial best, even if its accuracy is 0.0.
best_score = float("-inf")

for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {model_name}: {acc:.4f}")
    # Strict '>' means that on a tie the model evaluated first is kept.
    if acc > best_score:
        best_score = acc
        best_model = (model_name, model)

print(f"\nBest Model: {best_model[0]} with accuracy {best_score:.4f}")

Accuracy of RandomForest: 1.0000
Accuracy of SVM: 1.0000
Accuracy of GradientBoosting: 1.0000

Best Model: RandomForest with accuracy 1.0000


In [17]:
# Step 6: Save the best model to a pickle file
# best_model is a (name, fitted pipeline) tuple; only the pipeline is written.
# The resulting file is what the Flask API uses for model inference.
best_estimator = best_model[1]
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_estimator, file)