In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load

# Создание датасетов

In [None]:
!mkdir df_1 df_2 df_3 df_4

In [None]:
import pandas as pd
import numpy as np

def generate_data(n_obs=500, noise=False, seed=None):
    """Generate a synthetic student dataset for the exam-score regression.

    The exam score is ``10 * hours_studied + 5 * extracurricular`` plus
    Gaussian noise with standard deviation 5. Gender and age are drawn
    independently and do not enter the score formula.

    Parameters
    ----------
    n_obs : int, default 500
        Number of rows to generate.
    noise : bool, default False
        When True, an additional Gaussian term with standard deviation 10
        is added to the exam score.
    seed : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global NumPy random state is used, exactly as before, so callers
        relying on ``np.random.seed`` are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns "пол" (categorical 0/1), "возраст" (17..24),
        "часы_учебы" (1..9), "внеучебные_занятия" (0/1) and
        "баллы_за_экзамен" (float target).
    """
    # The np.random module and RandomState expose the same randint/randn API,
    # so the draws below work with either source.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Feature generation (upper bounds of randint are exclusive)
    gender = pd.Categorical(rng.randint(0, 2, size=n_obs))
    age = rng.randint(17, 25, size=n_obs)
    hours_studied = rng.randint(1, 10, size=n_obs)
    extracurricular_activities = rng.randint(0, 2, size=n_obs)
    exam_score = (
        10 * hours_studied
        + 5 * extracurricular_activities
        + 5 * rng.randn(n_obs)
    )

    # Assemble the data frame
    data = pd.DataFrame({
        "пол": gender,
        "возраст": age,
        "часы_учебы": hours_studied,
        "внеучебные_занятия": extracurricular_activities,
        "баллы_за_экзамен": exam_score
    })

    # Optional extra noise on the target
    if noise:
        data["баллы_за_экзамен"] = data["баллы_за_экзамен"] + 10 * rng.randn(n_obs)

    return data

# Usage example: 1000 rows with extra noise on the target, then a preview of the first rows
student_data = generate_data(noise=True, n_obs=1000)
print(student_data.head(n=5))

  пол  возраст  часы_учебы  внеучебные_занятия  баллы_за_экзамен
0   1       22           5                   1         58.041327
1   0       18           7                   0         72.448822
2   0       22           5                   1         54.513936
3   0       24           8                   1         91.005230
4   1       18           7                   0         69.710794


In [None]:
# Three datasets without extra noise, plus one with extra noise on the target.
df_1, df_2, df_3 = (generate_data() for _ in range(3))
df_4 = generate_data(noise=True)

# Write each frame into its own folder under the same file name.
# NOTE(review): the extension in this file name appears to be spelled with a
# Cyrillic letter instead of a Latin "c"; test_score.py reads the identical
# name, so rename both places together if this is ever corrected.
for folder, frame in zip(("df_1", "df_2", "df_3", "df_4"), (df_1, df_2, df_3, df_4)):
    frame.to_csv(f"{folder}/df.сsv", index=False)

# Preprocessing

In [None]:
def select_categorical(data, target="баллы_за_экзамен"):
    """Return the names of the categorical feature columns of ``data``.

    The ``target`` column is dropped first. A column counts as categorical
    when its dtype is ``object`` or pandas ``category``; the latter matters
    because a column built from ``pd.Categorical`` is not matched by
    ``object`` alone.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the features and the target column.
    target : str, default "баллы_за_экзамен"
        Name of the target column to exclude.

    Returns
    -------
    list of str
        Categorical column names in their original order.
    """
    return (
        data.drop(columns=target)
        .select_dtypes(include=["object", "category"])
        .columns.to_list()
    )


def select_num(data, target="баллы_за_экзамен"):
    """Return the names of the non-categorical feature columns of ``data``.

    This is the complement of ``select_categorical``: the ``target`` column
    is dropped and every column whose dtype is neither ``object`` nor
    ``category`` is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the features and the target column.
    target : str, default "баллы_за_экзамен"
        Name of the target column to exclude.

    Returns
    -------
    list of str
        Remaining column names in their original order.
    """
    return (
        data.drop(columns=target)
        .select_dtypes(exclude=["object", "category"])
        .columns.to_list()
    )

In [None]:
# Split the feature columns of the first dataset into categorical and numeric name lists
cat_features, num_features = select_categorical(df_1), select_num(df_1)

In [None]:
# Pipeline that preprocesses the data and produces the prediction:
# one-hot encoding for the categorical columns, standard scaling for the
# remaining ones, followed by an ordinary least-squares regression.
pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            (
                "encoder",
                # `sparse_output` is the current name of the former `sparse`
                # keyword (renamed in scikit-learn 1.2, old name removed in 1.4).
                OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse_output=False),
                cat_features,
            ),
            ("scaler", StandardScaler(), num_features),
        ]
    ),
    LinearRegression(),
)

In [None]:
target_name = "баллы_за_экзамен"
X = df_1.drop(columns=[target_name])
y = df_1[target_name]

# Hold out 15% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Report the shape of every part of the split
for caption, part in (
    ("Размер тренировочного набора данных (X):", X_train),
    ("Размер тестового набора данных (X):", X_test),
    ("Размер тренировочного набора данных (y):", y_train),
    ("Размер тестового набора данных (y):", y_test),
):
    print(caption, part.shape)

Размер тренировочного набора данных (X): (425, 4)
Размер тестового набора данных (X): (75, 4)
Размер тренировочного набора данных (y): (425,)
Размер тестового набора данных (y): (75,)


In [None]:
# Fit the preprocessing + regression pipeline on the training split
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to disk
dump(value=pipeline, filename="pipeline.joblib")

['pipeline.joblib']

In [None]:
# Predict on the held-out test split
y_pred_df1 = pipeline.predict(X_test)

# Compute the mean squared error and the coefficient of determination.
# The printed label previously called the MSE a "mean absolute error";
# the value comes from mean_squared_error, so the label now says so.
MSE = mean_squared_error(y_test, y_pred_df1)
r2 = r2_score(y_test, y_pred_df1)
print("Среднеквадратичная ошибка (MSE):", MSE)
print("Коэффициент детерминации (R^2):", r2)

Средняя абсолютная ошибка (MSE): 23.851051094085808
Коэффициент детерминации (R^2): 0.9689465920694503


# Создание файла для тестирования

In [None]:
%%writefile test_score.py
import numpy as np
import pandas as pd
import pytest
from joblib import load
from sklearn.metrics import mean_squared_error, r2_score

@pytest.fixture()
def load_pipeline():
    """Load the fitted pipeline stored next to this test file.

    The path is resolved relative to this file instead of a hard-coded
    absolute directory, so the tests also run outside that environment.
    Assumes "pipeline.joblib" was saved in the same directory as
    test_score.py.
    """
    from pathlib import Path

    # joblib.load unpickles the file, so only load artifacts you trust.
    pipeline = load(Path(__file__).resolve().parent / "pipeline.joblib")
    return pipeline

@pytest.fixture()
def load_data_and_get_pred(load_pipeline):
    """Read the dataset from the current working directory and predict on it.

    Returns a ``(y_true, y_pred)`` pair: the target column of the file and
    the pipeline's predictions for the remaining columns.
    """
    target = "баллы_за_экзамен"
    frame = pd.read_csv("df.сsv")
    features = frame.drop(columns=target)
    return frame[target], load_pipeline.predict(features)

def test_mse(load_data_and_get_pred):
    """The mean squared error on the dataset must stay below the noise-aware bound."""
    y, y_pred = load_data_and_get_pred
    # The notebook's generator adds Gaussian noise with standard deviation 5
    # (variance 25) to the target, so even a perfect model has an MSE of
    # about 25 and the former bound of 10 could not be met. With the extra
    # noise option (standard deviation 10) the expected MSE is about 125,
    # so 35 separates the two cases.
    assert mean_squared_error(y, y_pred) < 35

def test_r2(load_data_and_get_pred):
    """The coefficient of determination on the dataset must exceed 0.9."""
    actual, predicted = load_data_and_get_pred
    score = r2_score(actual, predicted)
    assert score > 0.9

Writing test_score.py


# Тестирование с различными данными

In [None]:
%cd df_1
!pytest -v /content/test_score.py
%cd /content

/content/df_1
platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: anyio-3.7.1
collected 2 items                                                                                  [0m

../test_score.py::test_mse [32mPASSED[0m[32m                                                            [ 50%][0m
../test_score.py::test_r2 [32mPASSED[0m[32m                                                             [100%][0m

/content


In [None]:
%cd df_2
!pytest -v /content/test_score.py
%cd /content

/content/df_2
platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: anyio-3.7.1
collected 2 items                                                                                  [0m

../test_score.py::test_mse [32mPASSED[0m[32m                                                            [ 50%][0m
../test_score.py::test_r2 [32mPASSED[0m[32m                                                             [100%][0m

/content


In [None]:
%cd df_3
!pytest -v /content/test_score.py
%cd /content

/content/df_3
platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: anyio-3.7.1
collected 2 items                                                                                  [0m

../test_score.py::test_mse [32mPASSED[0m[32m                                                            [ 50%][0m
../test_score.py::test_r2 [32mPASSED[0m[32m                                                             [100%][0m

/content


In [None]:
%cd df_4
!pytest -v /content/test_score.py
%cd /content

/content/df_4
platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: anyio-3.7.1
collected 2 items                                                                                  [0m

../test_score.py::test_mse [32mPASSED[0m[32m                                                            [ 50%][0m
../test_score.py::test_r2 [31mFAILED[0m[31m                                                             [100%][0m

[31m[1m_____________________________________________ test_r2 ______________________________________________[0m

load_data_and_get_pred = (0      82.522585
1      36.015559
2      52.952397
3      73.864353
4      52.225942
         ...    
495    17.62298...21216, 44.99372131, 59.19094011, 85.34869186,
       20.22131707, 64.97656886, 25.10819262, 84.25530094, 14.60729111]))

    [94mdef[39;49;00m [92mtest_r2[39;49;00m(load_data_and_get_pred):[90m[39;49;00m
        y, y_pred = load_data

**Датасеты 1, 2 и 3 успешно прошли оба теста, а для «шумового» датасета (№4) тест `test_mse` прошёл, но `test_r2` провалился.**

Ниже идут ячейки с кодом, которые были на паре 6.05

In [None]:
# xs = np.linspace(0,10,100)
# ys = 1.5 * xs + np.random.random(100)*2 - 1

# plt.scatter(xs, ys)
# plt.show()

In [None]:
# reg = LinearRegression()
# reg.fit(xs.reshape(-1, 1), ys)
# approx = reg.predict(xs.reshape(-1, 1))
# print("MSE: %.2f" % mean_squared_error(ys, approx))
# print("Coeff of determination: %.2f" % r2_score(ys, approx))
# print("Coefficients: ", reg.coef_)

# plt.scatter(xs, ys)
# plt.plot(xs.reshape(-1, 1), approx, color="red", linewidth=3)
# plt.show()
# np.savetxt("ys.csv", ys, delimiter=",")
# np.savetxt("approx.csv", approx, delimiter=",")

In [None]:
# %%writefile test.py
# import pytest

# def test_err():
#   assert 1 == 2

# def test_ok():
#   assert 1 == 1

In [None]:
# !pytest -v test.py

In [None]:
# !mkdir test_module

In [None]:
# %%writefile test_module/test_unit.py
# import unittest
# import numpy as np
# from sklearn.metrics import mean_squared_error, r2_score

# class TestMethods(unittest.TestCase):

#   def test_upper(self):
#     self.assertEqual('foo'.upper(), 'FOO2')

#   def test_mse(self):
#     ys = np.loadtxt("ys.csv")
#     approx = np.loadtxt("approx.csv")
#     self.assertTrue(mean_squared_error(ys, approx) < 1)

# if __name__ == '__main__':
#   unittest.main()

In [None]:
# !python -m unittest test_module/test_unit.py

In [None]:
# %%writefile test_score.py
# import pytest
# import numpy as np
# from sklearn.metrics import mean_squared_error, r2_score

# @pytest.fixture()
# def load_ys():
#   ys = np.loadtxt("ys.csv", delimiter=',')
#   return (ys)

# @pytest.fixture()
# def load_approx():
#   approx = np.loadtxt("approx.csv", delimiter=',')
#   return (approx)

# def test_mse(load_ys, load_approx):
#   assert mean_squared_error(load_ys, load_approx) < 0.5

# def test_r2(load_ys, load_approx):
#   assert r2_score(load_ys, load_approx) > 0.9

In [None]:
# !pytest -v test_score.py

In [None]:
# xs = np.linspace(0, 10, 100)
# ys = 1.5 * xs + np.random.random(100)*2 - 1
# ys[25:45] += 3

# approx = reg.predict(xs.reshape(-1, 1))
# print("MSE: %.2f" % mean_squared_error(ys, approx))
# print("Coeff of determination: %.2f" % r2_score(ys, approx))
# print("Coefficients: ", reg.coef_)

# plt.scatter(xs, ys)
# plt.plot(xs.reshape(-1, 1), approx, color="red", linewidth=3)
# plt.show()
# np.savetxt("ys.csv", ys, delimiter=",")
# np.savetxt("approx.csv", approx, delimiter=",")