**Note: Run servers on local instead of Colab**





In [1]:
!pip install --quiet flask flask-ngrok scikit-learn joblib



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import joblib

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])
pipeline.fit(X_train, y_train)
joblib.dump(pipeline, 'model.joblib')


['model.joblib']

In [3]:
%%writefile app.py
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok
import joblib

app = Flask(__name__)
run_with_ngrok(app)
model = joblib.load('model.joblib')

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    features = data['features']
    pred = model.predict([features])[0]
    return jsonify({'prediction': int(pred)})

@app.route('/health', methods=['GET'])
def health():
    return 'OK', 200

if __name__ == '__main__':
    app.run()


Writing app.py


In [2]:
!python app.ipynb


python: can't open file 'c:\\Purelogics_Bootcamp\\Pure_logics_Bootcamp\\Graded_lab\\app.ipynb': [Errno 2] No such file or directory


In [16]:
import requests
from sklearn.model_selection import train_test_split
payload = {'features': X_test[0].tolist()}
print(requests.post('http://127.0.0.1:5000/predict', json=payload).json())


{'prediction': 1}



## Exercise: Model Serialization & Deployment

### 1. Data Preparation  
- Load the Breast Cancer Wisconsin dataset with `sklearn.datasets.load_breast_cancer`  ([Save and Load Machine Learning Models in Python with scikit-learn](https://www.geeksforgeeks.org/save-and-load-machine-learning-models-in-python-with-scikit-learn/?utm_source=chatgpt.com)).  
- Split into train/test (80/20) and standardize features with `StandardScaler`  ([Save and Load Machine Learning Models in Python with scikit-learn](https://www.geeksforgeeks.org/save-and-load-machine-learning-models-in-python-with-scikit-learn/?utm_source=chatgpt.com)).  

### 2. Pipeline Construction  
- Build a scikit-learn `Pipeline` combining `StandardScaler` and `RandomForestClassifier(n_estimators=100, random_state=42)`  ([Model Serialization using pickle and joblib - Kaggle](https://www.kaggle.com/code/tasnimniger/model-serialization-using-pickle-and-joblib?utm_source=chatgpt.com)).  
- Fit the pipeline on the training data.  

### 3. Serialization  
- Save the fitted pipeline as `model_v1.pkl` using `pickle.dump(..., protocol=5)`  ([Save Machine Learning Model Using Pickle and Joblib](https://www.analyticsvidhya.com/blog/2021/08/quick-hacks-to-save-machine-learning-model-using-pickle-and-joblib/?utm_source=chatgpt.com)).  
- Save the same pipeline as `model_v1.joblib` with `joblib.dump(...)`  ([STEP 2: Model serialization and pickling | AI Planet (formerly DPhi)](https://aiplanet.com/learn/machine-learning-bootcamp/module-6-model-deployment/840/step-2-model-serialization-and-pickling?utm_source=chatgpt.com)).  

### 4. Version Control & Metadata  
- Retrain the pipeline twice more with different `random_state` values (e.g., 24, 2025) and save as `model_v2.[pkl|joblib]`, `model_v3.[pkl|joblib]`.  
- Create `model_metadata.json` capturing for each version: `{version, filename, saved_at}` in ISO format  ([Save and Load Machine Learning Models in Python with scikit-learn](https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/?utm_source=chatgpt.com)).  

### 5. Deserialization & Validation  
- Load each Pickle and Joblib file, run `.predict()` on the standardized test set, and compute accuracy with `sklearn.metrics.accuracy_score`.  
- Verify that all versions produce expected accuracy and identical predictions across formats  ([Save Machine Learning Model Using Pickle and Joblib](https://www.analyticsvidhya.com/blog/2021/08/quick-hacks-to-save-machine-learning-model-using-pickle-and-joblib/?utm_source=chatgpt.com)).  

### 6. Benchmarking  
- Measure and compare file sizes (`os.path.getsize`) and load times (`time.time()` deltas) for Pickle vs Joblib  ([Use Of Pickle & Joblib To Dump And Load Machine Learning Model](https://www.youtube.com/watch?v=N4HO1HDuK4o&utm_source=chatgpt.com)).  
- Plot or tabulate the results in your notebook.  

### 7. (Bonus) Minimal REST API  
- Implement a Flask app (`app.py`) exposing two endpoints:  
  - `GET /models` → returns `model_metadata.json`.  
  - `POST /predict` → accepts JSON `{"features": […]}`, loads the **latest** model version, and returns `{"prediction": <class>}`  ([How to Save Trained Model in Python - Neptune.ai](https://neptune.ai/blog/saving-trained-model-in-python?utm_source=chatgpt.com)).  
- Demonstrate usage via Python’s `requests` in Colab.  

### Deliverables  
1. A Colab notebook with all code cells.  
2. Saved model files (`.pkl`, `.joblib`) and `model_metadata.json`.  
3. A brief markdown report comparing Pickle vs Joblib (size, speed).  
4. (Optional) Flask app code and sample requests.

In [17]:
#Data Preparation
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [18]:
#Pipeline Construction
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))])
pipeline.fit(X_train, y_train)

In [None]:
# Serialization (Pickle & Joblib)
import pickle
import joblib

with open('model_v1.pkl', 'wb') as f:
    pickle.dump(pipeline, f, protocol=5)
joblib.dump(pipeline, 'model_v1.joblib')

['model_v1.joblib']

In [None]:
# Version Control & Metadata
from datetime import datetime
import json

model_metadata = []

for i, rs in enumerate([24, 2025], start=2):
    version = f'v{i}'
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=rs))
    ])
    pipeline.fit(X_train, y_train)

  
    pkl_name = f'model_{version}.pkl'
    joblib_name = f'model_{version}.joblib'
    with open(pkl_name, 'wb') as f:
        pickle.dump(pipeline, f, protocol=5)
    joblib.dump(pipeline, joblib_name)

    now = datetime.now().isoformat()
    model_metadata.append({'version': version, 'filename': pkl_name, 'saved_at': now})
    model_metadata.append({'version': version, 'filename': joblib_name, 'saved_at': now})

with open('model_metadata.json', 'w') as f:
    json.dump(model_metadata, f, indent=2)

In [None]:
# Deserialization & Validation
from sklearn.metrics import accuracy_score

results = {}

for version in ['v1', 'v2', 'v3']:
    with open(f'model_{version}.pkl', 'rb') as f:
        model_pkl = pickle.load(f)
    model_joblib = joblib.load(f'model_{version}.joblib')

    pred_pkl = model_pkl.predict(X_test)
    pred_joblib = model_joblib.predict(X_test)

    acc_pkl = accuracy_score(y_test, pred_pkl)
    acc_joblib = accuracy_score(y_test, pred_joblib)

    results[version] = {
        'accuracy_pickle': acc_pkl,
        'accuracy_joblib': acc_joblib,
        'identical_preds': (pred_pkl == pred_joblib).all()
    }

results

{'v1': {'accuracy_pickle': 0.9649122807017544,
  'accuracy_joblib': 0.9649122807017544,
  'identical_preds': np.True_},
 'v2': {'accuracy_pickle': 0.9649122807017544,
  'accuracy_joblib': 0.9649122807017544,
  'identical_preds': np.True_},
 'v3': {'accuracy_pickle': 0.956140350877193,
  'accuracy_joblib': 0.956140350877193,
  'identical_preds': np.True_}}

In [15]:
# Benchmarking
import os
import time

benchmark = []

for version in ['v1', 'v2', 'v3']:
    for ext in ['pkl', 'joblib']:
        filename = f'model_{version}.{ext}'

        size = os.path.getsize(filename)

        start = time.time()
        if ext == 'pkl':
            with open(filename, 'rb') as f:
                _ = pickle.load(f)
        else:
            _ = joblib.load(filename)
        end = time.time()

        benchmark.append({
            'version': version,
            'format': ext,
            'size_kb': size / 1024,
            'load_time_s': round(end - start, 4)
        })

import pandas as pd
pd.DataFrame(benchmark)

Unnamed: 0,version,format,size_kb,load_time_s
0,v1,pkl,308.25293,0.0191
1,v1,joblib,318.783203,0.1139
2,v2,pkl,314.81543,0.0429
3,v2,joblib,325.345703,0.0906
4,v3,pkl,316.378906,0.0093
5,v3,joblib,326.908203,0.0657


In [20]:
# Save as app.py
from flask import Flask, request, jsonify
import joblib
import json
import os

app = Flask(__name__)

@app.route('/models', methods=['GET'])
def models():
    with open('model_metadata.json') as f:
        data = json.load(f)
    return jsonify(data)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    features = data['features']

    # Load latest model
    latest = sorted(
        (f for f in os.listdir() if f.startswith('model_v3') and f.endswith('.joblib')),
        reverse=True
    )[0]
    model = joblib.load(latest)
    pred = model.predict([features])[0]
    return jsonify({'prediction': int(pred)})

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [13]:
import requests

sample = X_test[0].tolist()
response = requests.post("http://127.0.0.1:5000/predict", json={"features": sample})
print(response.json())

{'prediction': 1}
