In [193]:
%%writefile generate_data.py

from typing import Optional

import numpy as np
import pandas as pd


def create_data(
    dataset_size: int = 100,
    noise_percent: int = 0,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """Build a synthetic, roughly linear ``y = x`` dataset.

    ``x`` is ``dataset_size`` evenly spaced points on [0, 10]; ``y`` is ``x``
    plus uniform jitter in [-1, 1). When ``noise_percent`` is positive, one
    contiguous slice covering that percentage of the rows (at a random
    position) has its ``y`` values multiplied by 3 to simulate corrupted
    targets.

    Args:
        dataset_size: Number of rows to generate.
        noise_percent: Percentage of rows (0-100) to corrupt; values <= 0
            leave the data untouched.
        seed: Optional seed for reproducible output. When ``None`` the global
            NumPy random state is used, so ``np.random.seed`` still applies.

    Returns:
        A DataFrame with float columns ``x`` and ``y`` (values are not rounded).
    """
    # A dedicated RandomState keeps seeded calls independent of global state;
    # the np.random module itself exposes the same two functions used below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    x = np.linspace(0, 10, dataset_size)
    y = x + rng.random_sample(dataset_size) * 2 - 1
    if noise_percent > 0:
        slice_size = int(dataset_size * (noise_percent / 100))
        # +1 because randint's upper bound is exclusive: the slice may end
        # exactly at the last row.
        noise_start_index = rng.randint(0, dataset_size - slice_size + 1)
        y[noise_start_index:noise_start_index + slice_size] *= 3
    return pd.DataFrame({'x': x, 'y': y})

if __name__ == "__main__":
    # One entry per output file: (CSV path, keyword arguments for create_data).
    dataset_specs = (
        ("data-train-1.csv", {"dataset_size": 2000}),
        ("data-test-2.csv", {"dataset_size": 200, "noise_percent": 2}),
        ("data-test-3.csv", {"dataset_size": 200, "noise_percent": 10}),
    )
    # Generate every frame first (in this order), then write them out.
    data_1, data_2, data_3 = (
        create_data(**spec_kwargs) for _, spec_kwargs in dataset_specs
    )
    for (csv_path, _), frame in zip(dataset_specs, (data_1, data_2, data_3)):
        frame.to_csv(csv_path, index=False)

Writing generate_data.py


In [194]:
%%writefile create_model.py

import pickle
import pandas as pd
from sklearn.linear_model import Ridge

# Fit a ridge regression on the training CSV and persist it to model.pth.
data = pd.read_csv('data-train-1.csv')
model = Ridge()
# The target is passed as a single-column (2-D) frame.
model.fit(data[['x']], data[['y']])

# Use a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous handle.
with open('model.pth', 'wb') as model_file:
    pickle.dump(model, model_file)

Writing create_model.py


In [195]:
%%writefile test_model.py

import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt


def score_model(data_path, model_path="model.pth"):
    """Score a pickled regression model on a CSV dataset and report outliers.

    The CSV must contain ``x`` and ``y`` columns. Rows whose absolute residual
    exceeds twice the RMSE on this same dataset are reported as abnormal
    targets on stdout.

    Args:
        data_path: Path to the CSV file to evaluate.
        model_path: Path to the pickled model exposing ``predict``.

    Returns:
        The R^2 score of the model's predictions against ``y``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load model
    # files produced by a trusted source.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
    data = pd.read_csv(data_path)

    targets = data["y"].to_numpy()
    # Flatten so (n,) and (n, 1) prediction shapes are handled identically;
    # subtracting a 1-D prediction from a one-column frame would mis-broadcast.
    predicted_values = np.ravel(model.predict(data[["x"]]))
    residuals = targets - predicted_values
    mse = mean_squared_error(targets, predicted_values)
    anomaly_indices = np.flatnonzero(np.abs(residuals) > 2 * np.sqrt(mse))

    if anomaly_indices.size > 0:
        print(
            f"Found {len(anomaly_indices)} abnormal targets in {data_path}: from index {anomaly_indices.min()} to {anomaly_indices.max()}"
        )
    else:
        # Do not claim an index range when nothing was flagged.
        print(f"Found 0 abnormal targets in {data_path}")
    return r2_score(targets, predicted_values)


# Minimum R^2 the model has to exceed on a dataset for its test to pass.
_MIN_R2 = 0.8


def test_model_on_train():
    """The model must clear the R^2 threshold on the training CSV."""
    r2 = score_model("data-train-1.csv")
    assert r2 > _MIN_R2

def test_model_on_test_2():
    """The model must clear the R^2 threshold on data-test-2.csv."""
    r2 = score_model("data-test-2.csv")
    assert r2 > _MIN_R2

def test_model_on_test_3():
    """The model must clear the R^2 threshold on data-test-3.csv."""
    r2 = score_model("data-test-3.csv")
    assert r2 > _MIN_R2


Writing test_model.py


In [196]:
# Regenerate the CSV datasets, refit and re-pickle the model, then run pytest.
# `-i` makes %run execute each script in the notebook's interactive namespace
# rather than in a fresh, empty one.
%run -i "generate_data.py"
%run -i "create_model.py"
!"pytest"

platform win32 -- Python 3.8.13, pytest-7.2.0, pluggy-1.0.0
rootdir: d:\Учеба\MLOps\mlops_hw_5
plugins: anyio-3.6.2
collected 3 items

test_model.py .FF                                                        [100%]

____________________________ test_model_on_test_2 _____________________________

    def test_model_on_test_2():
>       assert score_model("data-test-2.csv") > 0.8
E       AssertionError: assert 0.6671144060751548 > 0.8
E        +  where 0.6671144060751548 = score_model('data-test-2.csv')

test_model.py:26: AssertionError
---------------------------- Captured stdout call -----------------------------
Found 4 abnormal targets in data-test-2.csv: from index 163 to 166
____________________________ test_model_on_test_3 _____________________________

    def test_model_on_test_3():
>       assert score_model("data-test-3.csv") > 0.8
E       AssertionError: assert 0.3712560833540711 > 0.8
E        +  where 0.3712560833540711 = score_model('data-test-3.csv')

test_model.py:29: As