In [1]:
import sys
import os

import pandas as pd
import numpy as np
import torch
from glob import glob
from torch.utils.data import DataLoader

from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.category_size_clip import CategorySizeClip
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.target_move import TargetMove
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing import FeatureBinScaler
from ptls.frames.coles import ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader
from sklearn.model_selection import train_test_split

import pytorch_lightning as pl

import logging
import pickle

from itertools import groupby
from functools import reduce
from operator import iadd

from collections import defaultdict

from ptls.data_load.feature_dict import FeatureDict
from ptls.frames.coles.split_strategy import AbsSplit

from functools import partial
from ptls.nn import TrxEncoder
from ptls.nn.seq_encoder.rnn_encoder import RnnEncoder
from ptls.frames.coles.coles_module import CoLESModule
from ptls.frames.inference_module import InferenceModule

import warnings
warnings.filterwarnings("ignore")

from ptls.data_load.utils import collate_feature_dict
from tqdm.auto import tqdm
import lightgbm as ltb
import json

In [2]:
class LateFusion:
    """Late fusion of two per-id embedding tables with one classifier per target.

    Two parquet tables (train and test variants of each) are outer-joined on
    ``col_id``; missing feature values are filled with 0. One
    ``ltb.LGBMClassifier`` is fitted per target column, and column 1 of its
    ``predict_proba`` output is reported as the score for that target.

    Parameters
    ----------
    train1_path, train2_path : str
        Parquet files with the training tables. Both must contain ``col_id``
        and every column listed in ``targets``.
    test1_path, test2_path : str
        Parquet files with the test tables. Both must contain ``col_id``;
        target columns, if present, are ignored.
    params : dict
        Keyword arguments passed unchanged to ``ltb.LGBMClassifier``.
    result_path : str
        Destination of the CSV written by :meth:`run`.
    col_id : str
        Name of the join key column.
    targets : sequence of str
        Names of the target columns; one model is trained for each.
    """

    def __init__(
        self,
        train1_path,
        test1_path,
        train2_path,
        test2_path,
        params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train1_path = train1_path
        self.train2_path = train2_path
        self.test1_path = test1_path
        self.test2_path = test2_path
        self.col_id = col_id
        self.all_targets = targets
        # Columns that must never be fed to the models as features.
        self.drop_feat = list(self.all_targets) + [self.col_id]
        self.result_path = result_path
        self.params = params

    def fit(self):
        """Fit one classifier per target on the fused training table.

        Returns
        -------
        dict
            Maps each target name to its fitted ``ltb.LGBMClassifier``.

        Raises
        ------
        KeyError
            If the second training table lacks ``col_id`` or a target column.
        """
        target_cols = list(self.all_targets)

        print('Read train data...')
        train_embeddings1 = pd.read_parquet(self.train1_path)
        train_embeddings2 = pd.read_parquet(self.train2_path)

        # Remember the labels of the second table before stripping them: ids
        # that exist only there would otherwise come out of the outer join with
        # NaN targets and be silently turned into label 0 by the zero-fill.
        labels2 = (
            train_embeddings2[[self.col_id] + target_cols]
            .drop_duplicates(subset=self.col_id)  # Series.map needs a unique index
            .set_index(self.col_id)
        )
        train_embeddings2 = train_embeddings2.drop(columns=target_cols)

        print('Run train concate...')
        train_embeddings = train_embeddings1.merge(
            train_embeddings2, on=self.col_id, how='outer'
        )
        del train_embeddings1
        del train_embeddings2

        # Labels from the first table win; only gaps are taken from the second.
        for col_target in target_cols:
            recovered = train_embeddings[self.col_id].map(labels2[col_target])
            train_embeddings[col_target] = train_embeddings[col_target].fillna(recovered)

        # Whatever is still missing (features of the absent modality, labels
        # unknown in both tables) is zero-filled.
        train_embeddings = train_embeddings.fillna(0)

        X_train = train_embeddings.drop(columns=self.drop_feat)
        print('Start fit...')
        clfs = dict()
        for col_target in self.all_targets:
            clf = ltb.LGBMClassifier(**self.params)
            y_train = train_embeddings[col_target]
            clf.fit(X_train, y_train)
            print(f'Model fitted, target: {col_target}')
            clfs[col_target] = clf
        return clfs

    def get_scores(
        self,
        clfs
    ):
        """Score the fused test table with the fitted classifiers.

        Parameters
        ----------
        clfs : dict
            Mapping target name -> fitted classifier, as returned by :meth:`fit`.

        Returns
        -------
        pandas.DataFrame
            Column ``col_id`` followed by one score column per target.
        """
        target_cols = list(self.all_targets)

        print('Read test data...')
        # Mirror the training-side handling: target columns are never features.
        # errors='ignore' keeps test tables without targets working unchanged.
        test_embeddings1 = pd.read_parquet(self.test1_path).drop(
            columns=target_cols, errors='ignore'
        )
        test_embeddings2 = pd.read_parquet(self.test2_path).drop(
            columns=target_cols, errors='ignore'
        )

        print('Run test concate...')
        test_embeddings = test_embeddings1.merge(
            test_embeddings2, on=self.col_id, how='outer'
        ).fillna(0)
        del test_embeddings1
        del test_embeddings2

        print('Run testing...')
        X_test = test_embeddings.drop(columns=[self.col_id])
        scores = pd.DataFrame({self.col_id: test_embeddings[self.col_id]})

        for col_target in self.all_targets:
            # Column 1 of predict_proba is used as the score.
            scores[col_target] = clfs[col_target].predict_proba(X_test)[:, 1]

        return scores

    def run(self):
        """Fit the models, score the test table, write the CSV and return the scores."""
        clfs = self.fit()
        scores = self.get_scores(clfs)

        scores.to_csv(self.result_path)

        return scores

In [None]:
# Keyword arguments handed unchanged to every per-target `ltb.LGBMClassifier`.
params = dict(
    n_estimators=500,
    boosting_type="gbdt",
    objective="binary",
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.02,
    feature_fraction=0.75,
    max_depth=6,
    lambda_l1=1,
    lambda_l2=1,
    min_data_in_leaf=50,
    random_state=42,
    n_jobs=8,
)

# Fuse the `trx_*` and `geo_*` embedding tables; scores are written to `result_path`.
dw = LateFusion(
    train1_path="trx_baseline_train.parquet",
    train2_path="geo_baseline_train.parquet",
    test1_path="trx_baseline_test.parquet",
    test2_path="geo_baseline_test.parquet",
    params=params,
    result_path="concate_trx_geo_coles.csv",
)

scores = dw.run()
scores

In [5]:
# Shell out to ../scripts/evaluate.py. `--pred_df` is the CSV named by `result_path` in the
# LateFusion cell above. Presumably the script compares it with the public/private reference
# targets and writes the metrics to the two *_score.txt files — TODO confirm against the script.
! python ../scripts/evaluate.py --ref_df_public public_target.parquet --ref_df_private private_target.parquet --pred_df concate_trx_geo_coles.csv --public_result_path public_concate_trx_geo_score.txt --private_result_path private_concate_trx_geo_score.txt























































































































































































































































































