In [5]:
pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [45]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
from google.colab import runtime
def exit_on_oom():
    print('Out of memory! Restarting runtime...')
    runtime.unassign()
%set_env PYTHONFAULTHANDLER=1

env: PYTHONFAULTHANDLER=1


In [47]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
import gc

In [48]:
# Конфигурация типов данных
torch.set_default_dtype(torch.float32)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [49]:
# Загрузка данных с контролем памяти
def load_data(path):
    df = pd.read_csv(path)
    number_cols = [
        'valence', 'year', 'acousticness', 'danceability', 'duration_ms',
        'energy', 'explicit', 'instrumentalness', 'key', 'liveness',
        'loudness', 'mode', 'popularity', 'speechiness', 'tempo'
    ]
    return df[['name', 'artists', 'year'] + number_cols].dropna().reset_index(drop=True)

data = load_data("/content/data.csv")
print(f"Loaded {len(data)} songs")

Loaded 170653 songs


In [50]:
# Класс датасета с контролем типов
class TypeSafeMusicDataset(Dataset):
    def __init__(self, data, top_k=20):
        super().__init__()
        self.data = data
        self.top_k = top_k
        self.node_features, self.edge_index = self._build_graph()

    def _build_graph(self):
        # Нормализация данных
        scaler = StandardScaler()
        features = scaler.fit_transform(self.data.iloc[:, 3:].values)

        # Поиск соседей
        nbrs = NearestNeighbors(n_neighbors=self.top_k+1, metric='cosine')
        nbrs.fit(features)
        _, indices = nbrs.kneighbors(features)

        # Создание edge_index
        rows = np.repeat(np.arange(len(indices)), self.top_k)
        cols = indices[:, 1:].flatten()

        # Создание тензоров с явным указанием типа
        features_tensor = torch.tensor(features, dtype=torch.float32).to(device)
        edge_index = torch.tensor([rows, cols], dtype=torch.long).to(device)

        return features_tensor, edge_index

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        return Data(x=self.node_features, edge_index=self.edge_index)

In [51]:
class TypeSafeGCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # Сохраняем параметры как атрибуты класса
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        # Инициализация слоев
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        x = x.float()
        x = F.relu(self.conv1(x, edge_index))
        return self.conv2(x, edge_index)
"""
# Модель с явным контролем типов
class TypeSafeGCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim).to(device)
        self.conv2 = GCNConv(hidden_dim, output_dim).to(device)

        # Принудительная установка типа параметров
        for param in self.parameters():
            param.data = param.data.float()

    def forward(self, x, edge_index):
        x = x.float()  # Явное преобразование типа
        x = F.relu(self.conv1(x, edge_index))
        return self.conv2(x, edge_index)"""

'\n# Модель с явным контролем типов\nclass TypeSafeGCN(nn.Module):\n    def __init__(self, input_dim, hidden_dim, output_dim):\n        super().__init__()\n        self.conv1 = GCNConv(input_dim, hidden_dim).to(device)\n        self.conv2 = GCNConv(hidden_dim, output_dim).to(device)\n        \n        # Принудительная установка типа параметров\n        for param in self.parameters():\n            param.data = param.data.float()\n\n    def forward(self, x, edge_index):\n        x = x.float()  # Явное преобразование типа\n        x = F.relu(self.conv1(x, edge_index))\n        return self.conv2(x, edge_index)'

In [52]:
# Инициализация
dataset = TypeSafeMusicDataset(data)
loader = DataLoader(dataset, batch_size=1)

model = TypeSafeGCN(
    input_dim=len(data.columns)-3,
    hidden_dim=64,
    output_dim=32).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [24]:
# Функция обучения с контролем типов
def train_safe():
    model.train()
    for epoch in range(30):
        optimizer.zero_grad()

        for batch in loader:
            batch = batch.to(device)

            # Явное преобразование типов
            x = batch.x.float()
            edge_index = batch.edge_index.long()

            # Forward pass
            out = model(x, edge_index)

            # Простой loss для демонстрации
            target = torch.rand_like(out).float().to(device)
            loss = F.mse_loss(out, target)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Очистка памяти
            del batch, x, edge_index, out, target
            torch.cuda.empty_cache()
            gc.collect()

        print(f'Epoch {epoch+1}/30, Loss: {loss.item():.4f}')

In [25]:
# Запуск обучения
train_safe()

Epoch 1/30, Loss: 0.7140
Epoch 2/30, Loss: 0.4755
Epoch 3/30, Loss: 0.3260
Epoch 4/30, Loss: 0.2396
Epoch 5/30, Loss: 0.1941
Epoch 6/30, Loss: 0.1731
Epoch 7/30, Loss: 0.1641
Epoch 8/30, Loss: 0.1594
Epoch 9/30, Loss: 0.1546
Epoch 10/30, Loss: 0.1484
Epoch 11/30, Loss: 0.1414
Epoch 12/30, Loss: 0.1346
Epoch 13/30, Loss: 0.1285
Epoch 14/30, Loss: 0.1235
Epoch 15/30, Loss: 0.1196
Epoch 16/30, Loss: 0.1168
Epoch 17/30, Loss: 0.1144
Epoch 18/30, Loss: 0.1126
Epoch 19/30, Loss: 0.1110
Epoch 20/30, Loss: 0.1095
Epoch 21/30, Loss: 0.1079
Epoch 22/30, Loss: 0.1067
Epoch 23/30, Loss: 0.1057
Epoch 24/30, Loss: 0.1046
Epoch 25/30, Loss: 0.1037
Epoch 26/30, Loss: 0.1029
Epoch 27/30, Loss: 0.1022
Epoch 28/30, Loss: 0.1015
Epoch 29/30, Loss: 0.1008
Epoch 30/30, Loss: 0.1001


In [26]:
# Функция рекомендаций
def safe_recommend(song_list, n=5):
    # Поиск индексов
    indices = []
    for song in song_list:
        match = data[data['name'] == song['name']]
        if not match.empty:
            indices.append(match.index[0])
        else:
            print(f"Song '{song['name']}' not found")
            return []

    # Получение эмбеддингов
    with torch.no_grad():
        model.eval()
        embeddings = model(dataset.node_features.float(), dataset.edge_index)

    # Расчет схожести
    input_emb = embeddings[indices].mean(dim=0)
    cos_sim = F.cosine_similarity(input_emb.unsqueeze(0), embeddings)

    # Фильтрация результатов
    top_indices = cos_sim.topk(n + len(indices))[1].cpu().numpy()
    recs = data.iloc[top_indices]
    recs = recs[~recs['name'].isin([s['name'] for s in song_list])]

    return recs.head(n)[['name', 'artists', 'year']]

In [44]:
# Тестовый пример
recommendations = safe_recommend([{'name': 'Break Stuff', 'artists': 'Limp Bizkit'}], n=5)

print("\nRecommendations:")
print(recommendations)


Recommendations:
                    name                    artists  year  year
169550        Easy Rider         ['Action Bronson']  2015  2015
16049   The Taste of Ink               ['The Used']  2002  2002
13608     Man in the Box        ['Alice In Chains']  1990  1990
38059               Wish  ['Diplo', 'Trippie Redd']  2018  2018


In [37]:
# Сохранение
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'hyperparams': {'input_dim': len(feature_columns), ...}
}, 'model.pth')

# Загрузка
checkpoint = torch.load('model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

SyntaxError: ':' expected after dictionary key (<ipython-input-37-00d5eb382c8d>, line 5)

In [41]:
def save_full_model(model, scaler, feature_columns, filename='music_recommender.h5'):
    with h5py.File(filename, 'w') as f:
        # Сохраняем параметры модели как атрибуты
        model_info = f.create_group('model_info')
        model_info.attrs['input_dim'] = model.input_dim
        model_info.attrs['hidden_dim'] = model.hidden_dim
        model_info.attrs['output_dim'] = model.output_dim

        # Сохраняем веса модели
        weights_grp = f.create_group('model_weights')
        for name, param in model.named_parameters():
            weights_grp.create_dataset(name, data=param.cpu().detach().numpy())

        # Остальные части остаются без изменений
        scaler_grp = f.create_group('scaler')
        scaler_grp.create_dataset('mean_', data=scaler.mean_)
        scaler_grp.create_dataset('scale_', data=scaler.scale_)
        scaler_grp.create_dataset('var_', data=scaler.var_)
        scaler_grp.create_dataset('n_samples_seen_', data=scaler.n_samples_seen_)

        f.create_dataset('feature_columns', data=np.array(feature_columns, dtype='S'))

In [42]:
def load_full_model(filename, device):
    with h5py.File(filename, 'r') as f:
        # Получаем параметры модели
        input_dim = f['model_info'].attrs['input_dim']
        hidden_dim = f['model_info'].attrs['hidden_dim']
        output_dim = f['model_info'].attrs['output_dim']

        # Создаем экземпляр модели
        model = TypeSafeGCN(input_dim, hidden_dim, output_dim).to(device)

        # Загружаем веса
        for name, param in model.named_parameters():
            param.data = torch.tensor(f['model_weights'][name][...], device=device)

        # Загружаем остальные компоненты
        scaler = StandardScaler()
        scaler.mean_ = f['scaler/mean_'][...]
        scaler.scale_ = f['scaler/scale_'][...]
        scaler.var_ = f['scaler/var_'][...]
        scaler.n_samples_seen_ = f['scaler/n_samples_seen_'][()]

        feature_columns = [col.decode('utf-8') for col in f['feature_columns'][...]]

    return model, scaler, feature_columns

In [43]:
# Тест сохранения и загрузки
model = TypeSafeGCN(15, 64, 32).to(device)
save_full_model(model, scaler, feature_columns, 'test_model.h5')
loaded_model, _, _ = load_full_model('test_model.h5', device)

# Проверка атрибутов
print(loaded_model.input_dim)  # Должно вывести 15
print(loaded_model.hidden_dim) # Должно вывести 64
print(loaded_model.output_dim) # Должно вывести 32

15
64
32
