In [1]:
#import packages
import pandas as pd
import numpy as np
import requests
import io
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import TargetDriftPreset

In [2]:

# Load the dataset
df = pd.read_csv('auto_mpg.csv')

# Handle missing values in 'horsepower'
df = df.replace('?', np.nan)
df['horsepower'] = pd.to_numeric(df['horsepower'])

# Impute missing values with the median
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

# Drop the 'car name' column as it is not useful for prediction
df = df.drop('car name', axis=1)
      

In [3]:
df.shape

(398, 8)

In [4]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1


In [5]:
# Split the data into 'reference' and 'current' datasets.
# The 'reference' data represents the past, 'well-behaved' data.
# The 'current' data represents new, incoming data we need to monitor.
reference_data = df.iloc[:200].copy()
current_data = df.iloc[200:].copy()

In [6]:

#Introduce Data drift
print("Simulating data drift in the 'current' dataset...")

current_data['mpg'] = current_data['mpg'] * 0.8
current_data['cylinders'] = current_data['cylinders'] * 1.3
current_data['displacement'] = current_data['displacement'] * 0.6
current_data['horsepower'] = current_data['horsepower'] * 4.6
current_data['acceleration'] = current_data['acceleration'] * 1.45
current_data['weight'] = current_data['weight']*1.05
current_data['origin'] = current_data['origin'].replace({1: 2, 2: 3, 3: 1})

# One-hot encode the 'origin' column
reference_data = pd.get_dummies(reference_data, columns=['origin'], prefix='origin')
current_data = pd.get_dummies(current_data, columns=['origin'], prefix='origin')
print("Reference data shape:", reference_data.shape)
print("Current data shape:", current_data.shape)


Simulating data drift in the 'current' dataset...
Reference data shape: (200, 10)
Current data shape: (198, 10)


In [7]:
#Find out the datadrift
data_drift_report = Report(metrics=[
    DataDriftPreset()
])
data_drift_report.run(current_data=current_data, reference_data=reference_data, column_mapping=None)
 

In [8]:
# Save the report as an HTML file
report_filename = f'data_drift_report_ {datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.html'
data_drift_report.save_html(report_filename)
print(f"Data Drift Report saved to {report_filename}")

Data Drift Report saved to data_drift_report_ 20250812_191711.html


In [9]:
#Fatures and Data
print("\nTraining a simple linear regression model...")

# Define features (X) and target (y)
X_ref = reference_data.drop('mpg', axis=1)
y_ref = reference_data['mpg']

#Data Cleaning
X_ref.dropna(inplace=True)
y_ref = y_ref.loc[X_ref.index]
 
#Model Building
model = LinearRegression()
model.fit(X_ref, y_ref)
model.score(X_ref, y_ref)


Training a simple linear regression model...


0.8518069326409302

In [10]:
# Save the model
model_filename = 'auto_mpg_linear_regr_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

Model saved to auto_mpg_linear_regr_model.joblib


In [11]:

X_curr = current_data.drop('mpg', axis=1)
y_curr = current_data['mpg']
X_curr.dropna(inplace=True)

#Making predictions on reference and current data
reference_data['prediction'] = model.predict(X_ref)
current_data['prediction'] = model.predict(X_curr)

In [12]:
model.score(X_curr, y_curr)

0.4396019987386798

In [13]:
# Use Evidently to compare the distributions of the model predictions.
from evidently.metric_preset import TargetDriftPreset
 
print("\nGenerating Evidently Model Performance report...")
model_report = Report(metrics=[
    TargetDriftPreset()
])
model_report.run(current_data=current_data, reference_data=reference_data, column_mapping=None)


Generating Evidently Model Performance report...


In [14]:
# Save the report as an HTML file
model_report_filename = f'model_performance_report_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.html'
model_report.save_html(model_report_filename)

In [15]:
def check_for_drift(report_json, threshold=0.2):
    """
    Checks the Evidently JSON report for data drift based on a simple threshold.
    Returns True if drift is detected, False otherwise.
    """
    data_drift_metrics = report_json.get('metrics', [])
    for metric in data_drift_metrics:
        if metric.get('metric') == 'ColumnDriftMetric':
            dataset_drift_score = metric.get('result', {}).get('drift_score')
            print(dataset_drift_score)
            if dataset_drift_score and dataset_drift_score > threshold:
                return True
    return False

In [16]:
def local_alert(drift_detected):
    """
    A simple local alerting function that prints a message to the console.
    """
    if drift_detected:
        print("\n!!! ALERT: Data Drift Detected !!!")
        print("Please check the generated HTML reports for a detailed analysis of the drift.")
    else:
        print("\nNo significant data drift detected. System is running smoothly.")
 
# Run the drift check and trigger the local alert
report_json_data = model_report.as_dict()
drift_detected = check_for_drift(report_json_data)
local_alert(drift_detected)

0.0002563942415315764

No significant data drift detected. System is running smoothly.
