In [2]:
# mpg_drift_analysis.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import pickle

from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# 1. Load Auto MPG dataset
# Column names are supplied explicitly via `names=`; fields are split on runs
# of whitespace and '?' is parsed as a missing value.
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower',
    'weight', 'acceleration', 'model_year', 'origin', 'car_name'
]

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# The separator is a regex, so it is written as a raw string: in a plain
# literal, "\s" is an invalid escape sequence that newer Python versions warn about.
# Rows with any missing value are dropped first, then the free-text car name
# column is removed. Chaining (instead of inplace=True) keeps the cell free of
# in-place mutation.
df = (
    pd.read_csv(url, sep=r'\s+', names=column_names, na_values='?')
    .dropna()
    .drop(columns='car_name')
)

# Map origin to categories
df['origin'] = df['origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})

# Features are every remaining column except the target `mpg`.
X = df.drop(columns='mpg')
y = df['mpg']

# 2. Split train/test
# 20% of the rows are held out for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Define preprocessing pipelines
# Column groups routed to each transformer below.
numeric_features = [
    'cylinders', 'displacement', 'horsepower',
    'weight', 'acceleration', 'model_year',
]
categorical_features = ['origin']

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode, dropping the first category.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first')),
])

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 4. Define model pipeline
# Random forest with 100 trees and a fixed seed.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Preprocessing and estimator are chained so both are fitted together.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# 5. Train pipeline
pipeline.fit(X_train, y_train)

# 6. Save model pipeline (disabled — uncomment to persist the fitted pipeline)
# with open('best_model_pipeline.bin', 'wb') as f_out:
#     pickle.dump(pipeline, f_out)
#
# print("Model saved to best_model_pipeline.bin")

# 7. Load model pipeline (disabled — only unpickle files you trust)
# with open('best_model_pipeline.bin', 'rb') as f_in:
#     loaded_model = pickle.load(f_in)

# 8. Data drift check with evidently

# Reference = training features, current = held-out test features. Both are
# copies of the untransformed feature frames, so the report compares the raw
# columns rather than the scaled/encoded ones.
train_reference = X_train.copy()
current_data = X_test.copy()

# Create evidently report for data drift
report = Report(metrics=[DataDriftPreset()])

# Run report with train_reference and current_data
report.run(reference_data=train_reference, current_data=current_data)

# Show report inline (in Jupyter).
# Jupyter only auto-renders the value of a cell's LAST expression. This call
# sits mid-cell, so the object returned by report.show() was being discarded
# and nothing was rendered. Passing it to display() (provided by the IPython
# kernel) renders it wherever it appears in the cell.
display(report.show())

# Optionally save report as html file
report.save_html('data_drift_report.html')
print("Drift report saved as data_drift_report.html")


Drift report saved as data_drift_report.html


In [6]:
pip install evidently==0.6.7


Collecting evidently==0.6.7
  Downloading evidently-0.6.7-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 19.8 MB/s eta 0:00:01


Installing collected packages: evidently
  Attempting uninstall: evidently
    Found existing installation: evidently 0.7.12
    Uninstalling evidently-0.7.12:
      Successfully uninstalled evidently-0.7.12
Successfully installed evidently-0.6.7
Note: you may need to restart the kernel to use updated packages.


In [None]:
# NOTE(review): this cell has never been executed (no execution count), and no
# import in the visible notebook refers to a package called `num`. Confirm the
# intended package name (possibly an unfinished `numpy`?) or delete the cell.
pip install num