# Лабораторная работа 7

## 1. Анализ и предобработка.

Загрузка

In [31]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

movies = pd.read_csv("mkrf_movies.csv")
shows = pd.read_csv("mkrf_shows.csv")


Приведение типов,  Объединение по puNumber и обрабтка дат

In [32]:
movies["puNumber"] = movies["puNumber"].astype(str)
shows["puNumber"] = shows["puNumber"].astype(str)

# === 2. ===
df = movies.merge(shows, on="puNumber", how="left")
df["box_office"] = df["box_office"].fillna(0)

# === 3. Заполнение пропусков ===
text_cols = ["film_studio", "director", "producer", "genres"]
for col in text_cols:
    df[col] = df[col].fillna("unknown")

num_cols = ["budget", "refundable_support", "nonrefundable_support"]
for col in num_cols:
    df[col] = df[col].fillna(0)

# === 4. Обработка дат ===
df["show_start_date"] = pd.to_datetime(df["show_start_date"], errors="coerce")
df["year"] = df["show_start_date"].dt.year.fillna(0).astype(int)
df["month"] = df["show_start_date"].dt.month.fillna(0).astype(int)

Преобразование жанров в мульти-хот код

In [33]:
df["genres"] = df["genres"].apply(lambda x: x.split(",") if isinstance(x, str) else ["unknown"])
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df["genres"])
genres_df = pd.DataFrame(genres_encoded, columns=[f"genre_{g}" for g in mlb.classes_])
df = pd.concat([df, genres_df], axis=1)

One-Hot Encoding для категориальных признаков

In [34]:
df = pd.get_dummies(df, columns=["type", "age_restriction", "financing_source"], drop_first=True)

Обработка целевой переменной

In [35]:
df["ratings"] = pd.to_numeric(df["ratings"], errors="coerce")
df = df.dropna(subset=["ratings"])

Масштабирование числовых признаков

In [36]:
scaler = StandardScaler()
scale_cols = ["budget", "refundable_support", "nonrefundable_support", "box_office"]
df[scale_cols] = scaler.fit_transform(df[scale_cols])

Проверяю и принчу всё подряд

In [37]:

print(df.info())

print("Пропуски")
print(df.isna().sum().sort_values(ascending=False).head(10))

print(df.head(3))

print("\n статиситка")
print(df[scale_cols + ["ratings"]].describe().T)

print("\nПризнаки после кодирования")
print(f"Количество жанровых колонок: {len([c for c in df.columns if c.startswith('genre_')])}")
print(f"Количество категориальных колонок: {len([c for c in df.columns if c.startswith(('type_', 'age_restriction_', 'financing_source_'))])}")
print(f"Итого признаков: {df.shape[1]} и строк: {df.shape[0]}")


<class 'pandas.core.frame.DataFrame'>
Index: 6490 entries, 0 to 7485
Data columns (total 58 columns):
 #   Column                                             Non-Null Count  Dtype              
---  ------                                             --------------  -----              
 0   title                                              6490 non-null   object             
 1   puNumber                                           6490 non-null   object             
 2   show_start_date                                    6490 non-null   datetime64[ns, UTC]
 3   film_studio                                        6490 non-null   object             
 4   production_country                                 6488 non-null   object             
 5   director                                           6490 non-null   object             
 6   producer                                           6490 non-null   object             
 7   refundable_support                                 6490 non-null 

## 2. Построение baseline-модели нейронной сети

Формируем матрицу X и целевую переменную y

In [38]:
# Формируем данные
target = "ratings"

# Исключаем ненужные текстовые столбцы
drop_cols = ["puNumber", "show_start_date", "director", "producer", "film_studio", "genres"]

X = df.drop(columns=drop_cols + [target]).values
y = df[target].values.reshape(-1, 1)

print("Форма X:", X.shape)
print("Форма y:", y.shape)


Форма X: (6490, 51)
Форма y: (6490, 1)


Train / Test split

In [39]:
import numpy as np

# Берем только числовые признаки
num_df = df.select_dtypes(include=["float64", "int64"])

X = num_df.drop(columns=[target]).values
y = num_df[target].values.reshape(-1, 1)

print("X shape:", X.shape)
print("y shape:", y.shape)

# 3. train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Преобразуем в numpy float32
X_train = np.array(X_train, dtype=np.float32)
X_test  = np.array(X_test, dtype=np.float32)
y_train = np.array(y_train, dtype=np.float32)
y_test  = np.array(y_test, dtype=np.float32)


X shape: (6490, 36)
y shape: (6490, 1)


Реализация нейронной сети (numpy)

In [40]:
import numpy as np

# ------------------------
# ACTIVATIONS
# ------------------------

class ReLU:
    def forward(self, x):
        self.x = x
        return np.maximum(0, x)
    
    def backward(self, grad):
        return grad * (self.x > 0)


class Linear:
    def __init__(self, in_f, out_f):
        self.W = np.random.randn(in_f, out_f) * 0.01
        self.b = np.zeros((1, out_f))

    def forward(self, x):
        self.x = x
        return x @ self.W + self.b

    def backward(self, grad, lr):
        dW = self.x.T @ grad
        db = np.sum(grad, axis=0, keepdims=True)
        dx = grad @ self.W.T

        self.W -= lr * dW
        self.b -= lr * db

        return dx


# ------------------------
# LOSS — MSE (регрессия)
# ------------------------

class MSELoss:
    def forward(self, pred, target):
        self.pred = pred
        self.target = target
        return np.mean((pred - target) ** 2)

    def backward(self):
        return 2 * (self.pred - self.target) / len(self.pred)


# ------------------------
# MODEL
# ------------------------

class SimpleRegressor:
    def __init__(self, input_dim, h1, h2):
        self.l1 = Linear(input_dim, h1)
        self.a1 = ReLU()
        self.l2 = Linear(h1, h2)
        self.a2 = ReLU()
        self.l3 = Linear(h2, 1)
        self.loss_fn = MSELoss()

    def forward(self, x):
        out = self.l1.forward(x)
        out = self.a1.forward(out)
        out = self.l2.forward(out)
        out = self.a2.forward(out)
        out = self.l3.forward(out)
        return out

    def backward(self, grad, lr):
        grad = self.l3.backward(grad, lr)
        grad = self.a2.backward(grad)
        grad = self.l2.backward(grad, lr)
        grad = self.a1.backward(grad)
        grad = self.l1.backward(grad, lr)


# ------------------------
# TRAIN LOOP
# ------------------------

def train(model, X, y, epochs=200, lr=0.001):
    losses = []
    for i in range(epochs):
        out = model.forward(X)
        loss = model.loss_fn.forward(out, y)
        grad = model.loss_fn.backward()
        model.backward(grad, lr)

        losses.append(loss)
        if i % 20 == 0:
            print(f"Epoch {i} | loss={loss:.4f}")
    return losses


## Обучение

In [41]:
model = SimpleRegressor(
    input_dim=X_train.shape[1],
    h1=64,
    h2=32
)

losses = train(model, X_train, y_train, epochs=300, lr=0.001)


Epoch 0 | loss=42.8851
Epoch 20 | loss=40.4610
Epoch 40 | loss=37.4419
Epoch 60 | loss=34.6551
Epoch 80 | loss=32.0827
Epoch 100 | loss=29.7083
Epoch 120 | loss=27.5165
Epoch 140 | loss=25.4932
Epoch 160 | loss=23.6256
Epoch 180 | loss=21.9015
Epoch 200 | loss=20.3099
Epoch 220 | loss=18.8405
Epoch 240 | loss=17.4838
Epoch 260 | loss=16.2312
Epoch 280 | loss=15.0745


## Результат

In [42]:
pred = model.forward(X_test)
mse = np.mean((pred - y_test) ** 2)
print("TEST MSE:", mse)


TEST MSE: 13.738407243738711


In [43]:
configs = [
    (64, 32),
    (128, 64),
    (32, 16)
]

for h1, h2 in configs:
    print(f"\nTesting architecture: {h1} → {h2}")
    model = SimpleRegressor(X_train.shape[1], h1, h2)
    train(model, X_train, y_train, epochs=200, lr=0.001)
    pred = model.forward(X_test)
    print("Test MSE:", np.mean((pred - y_test)**2))



Testing architecture: 64 → 32
Epoch 0 | loss=42.9318
Epoch 20 | loss=41.0138
Epoch 40 | loss=37.9522
Epoch 60 | loss=35.1262
Epoch 80 | loss=32.5177
Epoch 100 | loss=30.1100
Epoch 120 | loss=27.8875
Epoch 140 | loss=25.8361
Epoch 160 | loss=23.9426
Epoch 180 | loss=22.1947
Test MSE: 20.272070946713495

Testing architecture: 128 → 64
Epoch 0 | loss=42.1624
Epoch 20 | loss=40.8119
Epoch 40 | loss=37.7629
Epoch 60 | loss=34.9477
Epoch 80 | loss=32.3478
Epoch 100 | loss=29.9458
Epoch 120 | loss=27.7257
Epoch 140 | loss=25.6725
Epoch 160 | loss=23.7722
Epoch 180 | loss=22.0114
Test MSE: 20.06944883696668

Testing architecture: 32 → 16
Epoch 0 | loss=44.0264
Epoch 20 | loss=40.2778
Epoch 40 | loss=37.2722
Epoch 60 | loss=34.4978
Epoch 80 | loss=31.9365
Epoch 100 | loss=29.5718
Epoch 120 | loss=27.3885
Epoch 140 | loss=25.3723
Epoch 160 | loss=23.5102
Epoch 180 | loss=21.7899
Test MSE: 19.892958873885057
