In [55]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score


def outliers_high(column: pd.Series, replace=np.nan, *, k: float = 3.0,
                  low: float = 0.0) -> pd.Series:
    """Replace values outside ``[low, Q3 + k * IQR]`` with *replace*.

    The quartiles are computed with ``Series.quantile``, which skips missing
    values. Computing them with ``np.percentile`` would yield a NaN upper
    fence as soon as the column contains a single NaN, and every value would
    then be replaced.

    Args:
        column: Numeric series to clean.
        replace: Value written in place of every out-of-range entry
            (NaN by default).
        k: Multiplier applied to the interquartile range to build the upper
            fence. The default of 3 keeps the original behaviour.
        low: Inclusive lower bound; anything below it is replaced as well.
            The default of 0 keeps the original behaviour.

    Returns:
        A new series with the same index as *column*. Entries that are not
        within the inclusive range (including entries that were already NaN)
        hold *replace*.
    """
    q1, q3 = column.quantile([0.25, 0.75])
    high = q3 + k * (q3 - q1)
    # ``between`` is inclusive on both ends and evaluates to False for NaN.
    return column.where(column.between(low, high), replace)


# Load the semicolon-separated dataset (decimal commas); the first column
# becomes the index.
df = pd.read_csv("data_4.csv", sep=';', decimal=',', index_col=0)

# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

# Normalise the misspelled breed label BEFORE any per-breed statistic is
# computed; otherwise the misspelled rows form their own group during the
# median imputation below.
df['порода'] = df['порода'].replace("РефлешнСоверинггггг", "РефлешнСоверинг")

# Blank out extreme values, then fill the gaps with the median of the same breed.
col_outliers = ['удой', 'Жирность,%']
for col in col_outliers:
    df[col] = outliers_high(df[col])
    breed_median = df.groupby('порода', sort=False)[col].transform('median')
    df[col] = df[col].fillna(breed_median)

# Binary flag: 1 when 'спо' is strictly above 0.9, otherwise 0.
df['спо_кат'] = (df['спо'] > 0.9).astype(int)

# NOTE(review): with drop='first' the encoder emits one column per remaining
# category. Assigning the result to a single column only works while exactly
# two breeds are present — confirm against the data.
ohe = OneHotEncoder(sparse_output=False, drop='first')
df['порода'] = ohe.fit_transform(df[['порода']])


# Hold out 15% of the rows for evaluation; the seed is fixed for reproducibility.
df_train, df_test = train_test_split(df, test_size=0.15, random_state=42)

X_cols = ['эке', 'протеин', 'порода', 'спо_кат']  # 'Жирность,%' is left out of the features
y_col = 'удой'

linear_reg = LinearRegression()
linear_reg.fit(df_train[X_cols], df_train[y_col])

y_pred = linear_reg.predict(df_test[X_cols])
y_true = df_test[y_col]

# Report R^2 and MAPE (as a percentage), both rounded to `round_n` decimals.
round_n = 2
print(round(r2_score(y_true, y_pred), round_n),
      round(mean_absolute_percentage_error(y_true, y_pred) * 100, round_n))
# Previously recorded output: 0.64 3.85 — re-check after rerunning, since the
# breed label is now normalised before imputation.


['эке', 'протеин', 'порода', 'спо_кат'] удой
0.64 3.85


In [48]:
df

id,удой,эке,протеин,спо,порода,"Жирность,%",спо_кат
1,5863.0,14.2,1743.0,0.89,0.0,3.58,0
2,5529.0,12.8,2138.0,0.94,0.0,3.54,1
3,5810.0,14.0,1854.0,0.93,1.0,3.70,1
4,5895.0,12.4,2012.0,0.89,1.0,3.40,0
6,5254.0,12.7,1806.0,0.89,1.0,3.13,0
...,...,...,...,...,...,...,...
627,5970.0,14.4,1837.0,0.93,1.0,3.64,1
627,5970.0,14.4,1837.0,0.93,1.0,3.64,1
627,5970.0,14.4,1837.0,0.93,1.0,3.64,1
627,5970.0,14.4,1837.0,0.93,1.0,3.64,1
