In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from joblib import Parallel, delayed
import lightgbm as lgb
import optuna
import warnings
warnings.filterwarnings('ignore')

In [36]:
def load_and_preprocess():
    users = pd.read_csv('/content/drive/MyDrive/data/users.tsv', sep='\t')
    history = pd.read_csv('/content/drive/MyDrive/data/history.tsv', sep='\t')

    users['age'] = users['age'].replace(0, np.nan)
    users['city_id'] = users['city_id'].replace(0, np.nan)
    users['age'].fillna(users['age'].median(), inplace=True)
    users['city_id'].fillna(-1, inplace=True)  # -1 для неизвестных городов

    # Обработка истории показов
    history['hour'] = history['hour'] % 24
    history['day_part'] = pd.cut(history['hour'],
                               bins=[0, 6, 12, 18, 24],
                               labels=['night', 'morning', 'afternoon', 'evening'])

    # Добавляем временные признаки
    history['hour_sin'] = np.sin(2 * np.pi * history['hour']/24)
    history['hour_cos'] = np.cos(2 * np.pi * history['hour']/24)

    return users, history

In [50]:
class VKAdPredictor:
    def __init__(self, users, history):
        self.users = users
        self.history = history
        self.scaler = MinMaxScaler()
        self.models = {
            'at_least_one': None,
            'at_least_two': None,
            'at_least_three': None
        }
        self.sequence_length = 5
        self.feature_columns = None
        self.user_stats_cache = None

        # Оптимизированная предобработка данных
        self._optimized_preprocessing()

    def _optimized_preprocessing(self):
        """Оптимизированная предварительная обработка данных"""
        # Преобразование типов для экономии памяти
        self.users['age'] = self.users['age'].astype('float32')
        self.users['city_id'] = self.users['city_id'].astype('int32')
        self.history['cpm'] = self.history['cpm'].astype('float32')
        self.history['hour'] = self.history['hour'].astype('int8')

        # Предварительное вычисление статистик
        self._precompute_user_stats()

    def _precompute_user_stats(self):
        """Кэширование статистик пользователей с оптимизацией памяти"""
        stats = self.history.groupby('user_id').agg({
            'cpm': ['mean', 'max'],
            'hour': lambda x: ((x >= 8) & (x <= 20)).mean(),
            'publisher': 'count'
        }).reset_index()

        stats.columns = ['user_id', 'mean_cpm', 'max_cpm', 'daytime_ratio', 'impression_count']

        # Оптимизация типов данных
        stats['mean_cpm'] = stats['mean_cpm'].astype('float32')
        stats['max_cpm'] = stats['max_cpm'].astype('float32')
        stats['daytime_ratio'] = stats['daytime_ratio'].astype('float32')
        stats['impression_count'] = stats['impression_count'].astype('int32')

        self.user_stats_cache = self.users.merge(stats, on='user_id', how='left').fillna(0)

    def prepare_features(self, campaigns_df):
        """Оптимизированная подготовка признаков"""
        if self.feature_columns is None:
            sample_features = self._create_campaign_features(campaigns_df.iloc[0])
            self.feature_columns = list(sample_features.keys())
            self.scaler.fit(pd.DataFrame([sample_features]))

        # Векторизованная обработка
        features = pd.DataFrame(
            [self._create_campaign_features(campaign) for _, campaign in campaigns_df.iterrows()],
            columns=self.feature_columns
        )

        return pd.DataFrame(
            self.scaler.transform(features),
            columns=self.feature_columns
        )

    def _create_campaign_features(self, campaign):
        """Оптимизированное создание признаков для одной кампании"""
        features = {
            'campaign_cpm': float(campaign['cpm']),
            'duration_hours': int(campaign['hour_end']) - int(campaign['hour_start']),
            'publishers_count': len(str(campaign['publishers']).split(',')),
            'audience_size': int(campaign['audience_size'])
        }

        try:
            user_ids = list(map(int, campaign['user_ids'].split(',')))
        except:
            user_ids = []

        if user_ids:
            campaign_users = self.user_stats_cache[self.user_stats_cache['user_id'].isin(user_ids)]

            # Векторизованные вычисления
            for stat in ['age', 'city_id', 'mean_cpm', 'max_cpm', 'impression_count']:
                features[f'user_{stat}_mean'] = campaign_users[stat].mean()
                features[f'user_{stat}_std'] = campaign_users[stat].std()

            # Оптимизированная симуляция аукциона
            features.update(self._simulate_auction_fast(campaign, user_ids))
        else:
            for stat in ['age', 'city_id', 'mean_cpm', 'max_cpm', 'impression_count']:
                features[f'user_{stat}_mean'] = 0.0
                features[f'user_{stat}_std'] = 0.0
            features.update({
                'auction_win_prob_mean': 0.0,
                'auction_win_prob_std': 0.0
            })

        return features

    def _simulate_auction_fast(self, campaign, user_ids):
        """Векторизованная симуляция аукциона"""
        publishers = set(map(int, campaign['publishers'].split(',')))
        campaign_cpm = float(campaign['cpm'])

        user_stats = self.user_stats_cache[
            self.user_stats_cache['user_id'].isin(user_ids)
        ].set_index('user_id')

        max_cpms = user_stats['max_cpm']
        probs = np.where(
            campaign_cpm > max_cpms, 1.0,
            np.where(campaign_cpm == max_cpms, 0.5, 0.0)
        )

        return {
            'auction_win_prob_mean': float(np.mean(probs)),
            'auction_win_prob_std': float(np.std(probs))
        }

    def train(self, X, y):
        """Оптимизированное обучение моделей"""
        # Общие параметры для LightGBM
        common_params = {
            'objective': 'regression',
            'metric': 'mape',
            'verbosity': -1,
            'n_jobs': -1  # Используем все ядра
        }

        # Оптимизация гиперпараметров только один раз
        if not any(isinstance(m, lgb.LGBMRegressor) for m in self.models.values()):
            best_params = self.optimize_hyperparams(X, y['at_least_one'])
            best_params.update(common_params)

            for target in ['at_least_one', 'at_least_two']:
                self.models[target] = lgb.LGBMRegressor(**best_params).fit(X, y[target])

        # LSTM только если достаточно данных
        if len(X) > self.sequence_length and 'at_least_three' in self.models:
            X_seq, y_seq = self._prepare_sequences(X, y['at_least_three'])
            self.models['at_least_three'] = self._build_lstm_model(X_seq.shape[1:])

            # Упрощенный вызов fit без неподдерживаемых параметров
            self.models['at_least_three'].fit(
                X_seq,
                y_seq,
                epochs=10,
                batch_size=32,
                verbose=0
            )

    def optimize_hyperparams(self, X, y):
        """Оптимизация гиперпараметров с ограничением времени"""
        def objective(trial):
            params = {
                'num_leaves': trial.suggest_int('num_leaves', 20, 100),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
                'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
            }

            model = lgb.LGBMRegressor(**params)
            scores = []

            # Упрощенная кросс-валидация для ускорения
            for train_idx, test_idx in TimeSeriesSplit(2).split(X):
                model.fit(X.iloc[train_idx], y.iloc[train_idx])
                preds = model.predict(X.iloc[test_idx])
                scores.append(np.mean(np.abs(preds - y.iloc[test_idx]) / (y.iloc[test_idx] + 0.005)))

            return np.mean(scores)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=20, timeout=600)  # Ограничение 10 минут
        return study.best_params

    def _prepare_sequences(self, X, y):
        """Подготовка последовательностей с проверкой размера"""
        if len(X) <= self.sequence_length:
            return np.empty((0, self.sequence_length, X.shape[1])), np.empty(0)

        X_seq = np.array([
            X.iloc[i:i+self.sequence_length].values
            for i in range(len(X) - self.sequence_length)
        ])
        y_seq = np.array([
            y.iloc[i+self.sequence_length]
            for i in range(len(X) - self.sequence_length)
        ])
        return X_seq, y_seq

    def _build_lstm_model(self, input_shape):
        """Упрощенная LSTM модель"""
        model = Sequential([
            LSTM(32, input_shape=input_shape),
            Dense(16, activation='relu'),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mape')
        return model

    def predict(self, campaign_data):
        """Стабильное предсказание с обработкой ошибок"""
        try:
            X = self.prepare_features(pd.DataFrame([campaign_data]))
        except Exception as e:
            print(f"Ошибка подготовки признаков: {e}")
            return {k: 0.0 for k in self.models.keys()}

        preds = {}
        for target, model in self.models.items():
            try:
                if model is None:
                    preds[target] = 0.0
                elif isinstance(model, Sequential):
                    if len(X) >= self.sequence_length:
                        X_seq = self._prepare_sequences(X, None)[0]
                        if len(X_seq) > 0:
                            pred = model.predict(X_seq[-1:], verbose=0)[0][0]
                        else:
                            pred = 0.0
                    else:
                        pred = 0.0
                    preds[target] = np.clip(pred, 0.0, 1.0)
                else:
                    preds[target] = np.clip(model.predict(X)[0], 0.0, 1.0)
            except Exception as e:
                print(f"Ошибка предсказания для {target}: {e}")
                preds[target] = 0.0

        return preds

In [52]:
users, history = load_and_preprocess()
validate = pd.read_csv('/content/drive/MyDrive/data/validate.tsv', sep='\t')
validate_answers = pd.read_csv('/content/drive/MyDrive/data/validate_answers.tsv', sep='\t')

# Инициализация и обучение модели
predictor = VKAdPredictor(users, history)
X_train = predictor.prepare_features(validate)
predictor.train(X_train, validate_answers)

# Пример прогнозирования
new_campaign = {
    'cpm': 200,
    'hour_start': 10,
    'hour_end': 18,
    'publishers': '1,2,3',
    'user_ids': '123,456,789',
    'audience_size': 3
}

prediction = predictor.predict(new_campaign)
print("Прогноз охвата:", prediction)

# Оценка на валидационных данных
val_predictions = []
for _, row in validate.iterrows():
    pred = predictor.predict(row)
    val_predictions.append(pred)

val_results = pd.DataFrame(val_predictions)
print("Примеры предсказаний:")
print(val_results.head())

[I 2025-05-19 16:45:05,597] A new study created in memory with name: no-name-ecaf850d-5925-4492-aebc-4e56bff08488
[I 2025-05-19 16:45:05,649] Trial 0 finished with value: 1.8916568200861128 and parameters: {'num_leaves': 68, 'learning_rate': 0.07181237532197836, 'feature_fraction': 0.9038504233835056, 'bagging_fraction': 0.7028248637904045, 'bagging_freq': 6, 'min_child_samples': 86}. Best is trial 0 with value: 1.8916568200861128.
[I 2025-05-19 16:45:05,750] Trial 1 finished with value: 1.4059358111270805 and parameters: {'num_leaves': 66, 'learning_rate': 0.11208158941368754, 'feature_fraction': 0.8517107275183424, 'bagging_fraction': 0.7856481341392726, 'bagging_freq': 6, 'min_child_samples': 29}. Best is trial 1 with value: 1.4059358111270805.
[I 2025-05-19 16:45:05,804] Trial 2 finished with value: 1.8977004790526286 and parameters: {'num_leaves': 98, 'learning_rate': 0.07071373213589996, 'feature_fraction': 0.9015252378472595, 'bagging_fraction': 0.8558855026024679, 'bagging_freq

Прогноз охвата: {'at_least_one': np.float64(0.02657287367117644), 'at_least_two': np.float64(0.028052040765976206), 'at_least_three': np.float64(0.0)}
Примеры предсказаний:
   at_least_one  at_least_two  at_least_three
0      0.044212      0.024381             0.0
1      0.000000      0.000000             0.0
2      0.087214      0.015748             0.0
3      0.210954      0.119290             0.0
4      0.383511      0.274947             0.0
