In [1]:
import warnings
# Install a process-wide "ignore" filter: every warning category from every
# module is suppressed from here on.
# NOTE(review): this also hides deprecation notices emitted by the libraries
# used below — consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer

class ModelPipeline:
    """Inference pipeline: parquet time series -> per-id features -> scores.

    The steps, in the order `run_pipeline` executes them:

    1. `load_data`     - read the train and test parquet files.
    2. `format_data`   - expand each row's ``dates``/``values`` sequences into
       one record per observation (long format).
    3. `process_data`  - add rolling statistics and aggregate one feature
       vector per ``id``.
    4. `prepare_data`  - mean-impute missing features (imputer fitted on the
       training features, then applied to the test features).
    5. `load_model`    - load a previously saved model with joblib.
    6. `predict` / `save_submission` - write ``id``/``score`` pairs to CSV.
    """

    # Number of leading FFT magnitude bins exported as ``fft_1`` .. ``fft_N``.
    _FFT_FEATURES = 5
    # Window length (in observations) of the rolling mean / median filters.
    _ROLLING_WINDOW = 5

    def __init__(self, train_path, test_path, model_path, submission_path='submission.csv'):
        """Store file locations and create the (unfitted) mean imputer.

        Args:
            train_path: Parquet file with ``id``, ``dates``, ``values``, ``label``.
            test_path: Parquet file with ``id``, ``dates``, ``values``.
            model_path: File passed to ``joblib.load`` in `load_model`.
            submission_path: Destination CSV written by `save_submission`.
        """
        self.train_path = train_path
        self.test_path = test_path
        self.model_path = model_path
        self.submission_path = submission_path
        self.imputer = SimpleImputer(strategy='mean')
        self.model = None

    def load_data(self) -> None:
        """Read the train and test parquet files into DataFrames."""
        self.train_data = pd.read_parquet(self.train_path)
        self.test_data = pd.read_parquet(self.test_path)

    @staticmethod
    def _expand_rows(frame: pd.DataFrame, columns) -> pd.DataFrame:
        """Turn one-row-per-series data into one-row-per-observation data.

        For every input row a small DataFrame is built from ``columns``;
        scalar entries (``id``, ``label``) are broadcast along the row's
        sequences. The ``dates`` column is parsed to datetimes, with
        unparseable entries coerced to ``NaT``.

        Raises:
            ValueError: If ``frame`` has no rows (nothing to concatenate) or a
                row's sequences have different lengths.
        """
        pieces = [
            pd.DataFrame(dict(zip(columns, record)))
            for record in zip(*(frame[name] for name in columns))
        ]
        long_frame = pd.concat(pieces, ignore_index=True)
        long_frame['dates'] = pd.to_datetime(long_frame['dates'], errors='coerce')
        return long_frame

    def format_data(self) -> None:
        """Build the long-format train and test frames.

        Sets ``self.expanded_data`` (with ``label``) and
        ``self.expanded_test_data`` (without).
        """
        self.expanded_data = self._expand_rows(
            self.train_data, ['id', 'dates', 'values', 'label'])
        self.expanded_test_data = self._expand_rows(
            self.test_data, ['id', 'dates', 'values'])

    def apply_filters(self, group: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``group`` with rolling mean and median columns.

        Adds ``values_ma`` and ``values_median``, both computed over a
        trailing window with ``min_periods=1`` so the first observations
        still get a value.
        """
        # Work on a copy: mutating the object handed out by a groupby is not
        # supported by pandas and would silently modify the caller's data.
        group = group.copy()
        window = group['values'].rolling(window=self._ROLLING_WINDOW, min_periods=1)
        group['values_ma'] = window.mean()
        group['values_median'] = window.median()
        return group

    @staticmethod
    def _summarize(series: pd.Series, suffix: str, length: int) -> dict:
        """Return the eight summary statistics of ``series`` keyed by ``suffix``.

        Key order matters: downstream code feeds the features to the model
        positionally.
        """
        first, last = series.iloc[0], series.iloc[-1]
        return {
            f'mean_{suffix}': series.mean(),
            f'std_{suffix}': series.std(),
            f'min_{suffix}': series.min(),
            f'max_{suffix}': series.max(),
            f'median_{suffix}': series.median(),
            f'first_{suffix}': first,
            f'last_{suffix}': last,
            # Average change per observation between first and last value.
            f'trend_{suffix}': (last - first) / length,
        }

    def extract_features(self, group: pd.DataFrame) -> pd.Series:
        """Aggregate one series (all rows of a single id) into a feature vector.

        Expects the columns ``values``, ``values_ma`` and ``values_median``
        (see `apply_filters`). Features that cannot be computed for short
        series (lags, high FFT bins) are returned as NaN.

        Returns:
            A Series of 28 features in a fixed order.
        """
        values = group['values']
        n_points = len(group)

        # Magnitudes of the lowest-frequency FFT bins. Series shorter than
        # _FFT_FEATURES produce fewer bins, so pad with NaN instead of
        # raising IndexError.
        magnitudes = np.abs(np.fft.fft(values.to_numpy()))[:self._FFT_FEATURES]
        fft_magnitude = np.full(self._FFT_FEATURES, np.nan)
        fft_magnitude[:magnitudes.size] = magnitudes

        features = {}
        features.update(self._summarize(group['values_ma'], 'ma', n_points))
        features.update(self._summarize(group['values_median'], 'median', n_points))

        # lag_k is the value k steps before the last observation.
        for lag in (1, 2, 3):
            features[f'lag_{lag}'] = values.shift(lag).iloc[-1]

        for position, magnitude in enumerate(fft_magnitude, start=1):
            features[f'fft_{position}'] = magnitude

        features['25_percentile'] = np.percentile(values, 25)
        features['75_percentile'] = np.percentile(values, 75)
        features['variance'] = values.var()
        features['autocorr_lag_1'] = values.autocorr(lag=1)
        return pd.Series(features)

    def _filter_by_id(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Apply `apply_filters` to every id and stack the results.

        Groups are iterated explicitly so each sub-frame keeps its ``id``
        column regardless of how ``groupby.apply`` treats grouping columns
        in the installed pandas version.
        """
        filtered = [self.apply_filters(rows) for _, rows in frame.groupby('id')]
        return pd.concat(filtered, ignore_index=True)

    def _features_by_id(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Return one row of features per id, with ``id`` as the first column."""
        # Select the non-key columns explicitly so the grouping column is
        # never passed into extract_features (avoids the pandas deprecation
        # around including grouping columns in apply).
        payload = [name for name in frame.columns if name != 'id']
        return frame.groupby('id')[payload].apply(self.extract_features).reset_index()

    def process_data(self) -> None:
        """Compute per-id feature tables for train and test.

        Sets ``self.features_data`` (features plus ``label``) and
        ``self.features_test_data``.
        """
        train_filtered = self._filter_by_id(self.expanded_data)
        test_filtered = self._filter_by_id(self.expanded_test_data)

        labels = self.expanded_data[['id', 'label']].drop_duplicates()
        self.features_data = self._features_by_id(train_filtered).merge(labels, on='id')
        self.features_test_data = self._features_by_id(test_filtered)

    def prepare_data(self) -> None:
        """Split features/labels and impute missing values.

        The imputer is fitted on the training features only and then reused
        for the test features.
        """
        X_train = self.features_data.drop(columns=['id', 'label'])
        y_train = self.features_data['label']
        X_test = self.features_test_data.drop(columns=['id'])

        self.X_train_imputed = self.imputer.fit_transform(X_train)
        self.y_train = y_train
        self.X_test_imputed = self.imputer.transform(X_test)

    def load_model(self) -> None:
        """Load the saved model from ``self.model_path``."""
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load model files from a trusted source.
        self.model = joblib.load(self.model_path)

    def predict(self) -> None:
        """Score the test set and build the submission frame.

        Uses the second column of ``predict_proba`` as the score.
        """
        y_test_proba = self.model.predict_proba(self.X_test_imputed)[:, 1]
        self.submission = pd.DataFrame({'id': self.features_test_data['id'], 'score': y_test_proba})

    def save_submission(self) -> None:
        """Write the submission frame to CSV and report the path."""
        self.submission.to_csv(self.submission_path, index=False)
        # Message reads: "File <path> successfully created".
        print(f"Файл {self.submission_path} успешно создан")

    def run_pipeline(self) -> None:
        """Run every stage in order, from loading data to saving the CSV."""
        self.load_data()
        self.format_data()
        self.process_data()
        self.prepare_data()
        self.load_model()
        self.predict()
        self.save_submission()

if __name__ == "__main__":
    # Script entry point: build the pipeline from files located in the
    # current working directory and run every stage. The submission path
    # is left at the constructor's default.
    pipeline = ModelPipeline(
        train_path='train.parquet',
        test_path='test.parquet',
        model_path='voting_classifier_model.pkl',
    )
    pipeline.run_pipeline()


Файл submission.csv успешно создан
