# **Main CITE Notebook**

This is the main notebook for the CITE part of the project, where the task is to predict normalized surface protein levels from library-size-normalized, log1p-transformed counts (gene expression levels) measured for the same cells.

In this Jupyter notebook, data from several sources is joined together and then used either to cross-validate or to create predictions for the test dataset. The sources are:
1. pre-calculated Truncated SVD values from all the RNA expression levels data (see Prepare_SVD_for_CITE notebook)
2. source data for RNA gene expression levels - for pre-selected important genes this data is going to be put into the model as is
3. metadata - each cell's donor ID and the day the cell was analyzed. A few features are built from the metadata.
4. target values for the train set

In [None]:
# Importing the libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file available under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

import gc
from humanize import naturalsize
#import lightgbm
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Need this library to read *.h5 files
!pip install --quiet tables

[0m

# **Loading the Data**

In [3]:
# Root directory of the competition data; every path below is resolved against it.
DATA_DIR = "/kaggle/input/open-problems-multimodal/"


def _data_file(file_name):
    """Return the path of `file_name` inside DATA_DIR."""
    return os.path.join(DATA_DIR, file_name)


# Cell metadata
FP_CELL_METADATA = _data_file("metadata.csv")

# CITE inputs / targets
FP_CITE_TRAIN_INPUTS = _data_file("train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = _data_file("train_cite_targets.h5")
FP_CITE_TEST_INPUTS = _data_file("test_cite_inputs.h5")
FP_CITE_TEST_INPUTS_FIX = _data_file("test_cite_inputs_day_2_donor_27678.h5")

# Multiome inputs / targets
FP_MULTIOME_TRAIN_INPUTS = _data_file("train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = _data_file("train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = _data_file("test_multi_inputs.h5")

# Submission helpers
FP_SUBMISSION = _data_file("sample_submission.csv")
FP_EVALUATION_IDS = _data_file("evaluation_ids.csv")

In [4]:
# Read the table whose 'important_columns' column names the gene columns to keep.
important_columns_path = '../input/kagglegenes/important_columns.csv'
df_imp_cols = pd.read_csv(filepath_or_buffer=important_columns_path)

In [5]:
%%time

# The train matrix cannot be read column-selectively here, so the whole file is loaded
# first and then narrowed down to the columns named in the important-columns list.
df = pd.read_hdf(FP_CITE_TRAIN_INPUTS)
#constant_cols = df.columns[df.nunique() <= 1]
#df = df.drop(columns=constant_cols)
important_columns = df_imp_cols['important_columns']
df = df[important_columns]
print('import finished')
gc.collect()
df.shape

import finished
CPU times: user 22.8 s, sys: 7.44 s, total: 30.2 s
Wall time: 50.8 s


(70988, 642)

In [6]:
# Check dataframe size. `deep` is a boolean flag; the previous string 'True' only worked
# because any non-empty string is truthy. deep=True also counts object-column contents.
size = df.memory_usage(deep=True).sum()
print(size)
print(naturalsize(size))

187195356
187.2 MB


In [7]:
# Set SUBMIT = False to run cross-validation only: the test-set steps in the cells below
# are guarded by `if SUBMIT:` and are skipped in that case.
SUBMIT = True

In [8]:
%%time
# Load the test data (this step is skipped if notebook runs only for cross-validation)

if SUBMIT:
    df_test = pd.read_hdf(FP_CITE_TEST_INPUTS)
    # Keep exactly the same gene columns as the train frame.
    # The former loop that deleted columns listed in `remove_from_batch` was removed:
    # that name is never defined in this notebook, so the cell raised NameError on a
    # fresh kernel, and the train frame has no matching column removal.
    df_test = df_test[df_imp_cols['important_columns']]
    print('import of test data finished')
    print(df_test.shape)
    

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.63 µs


In [9]:
%%time

# Load metadata and select rows that refer to CITE.
# Built as one chain so that the `day` cast is never written into a filtered slice of the
# original frame (the step-by-step version can raise pandas' SettingWithCopyWarning).
md_df = (
    pd.read_csv(FP_CELL_METADATA, index_col='cell_id')
    .loc[lambda meta: meta['technology'] == "citeseq"]
    .drop(columns='technology')   # constant after the filter above
    .astype({'day': 'int8'})
)
# Join the metadata to train.
df = df.join(md_df, how = 'left', on = 'cell_id')
# Join the metadata to test.
if SUBMIT:
    df_test = df_test.join(md_df, how = 'left', on = 'cell_id')
    print(df_test.shape)
del md_df
gc.collect()
df.shape

CPU times: user 417 ms, sys: 64.8 ms, total: 482 ms
Wall time: 596 ms


(70988, 645)

In [10]:
%%time
# Load the prepared TruncatedSVD data and join it to the rest of train/test datasets.
# There was a mess with organizers initially publishing wrong data as part of test set.
# They've later released a fix (see FP_CITE_TEST_INPUTS_FIX) for information only.
# This means participants didn't have to make predictions for the fixed data, but could use it for their models.
# I used that data to calculate the TruncatedSVD.
# This is why this part of code is not as simple as it could be otherwise.

# Row layout of svd.csv as used below. The values are the formerly hard-coded slice bounds.
# NOTE(review): confirm them against the Prepare_SVD_for_CITE notebook.
N_SVD_TRAIN_ROWS = 70988     # svd rows [0, N_SVD_TRAIN_ROWS) are joined to the train cells
N_TEST_ROWS_SKIPPED = 7476   # leading test rows dropped before the join
                             # (presumably the wrongly published cells - TODO confirm)
SVD_TEST_END_ROW = 112175    # svd rows [N_SVD_TRAIN_ROWS, SVD_TEST_END_ROW) go to the kept test cells

svd_path = '../input/notebookb136a7c42f/svd.csv'
svd = pd.read_csv(svd_path)
svd = svd.add_prefix('svd_')
svd_train = svd.iloc[:N_SVD_TRAIN_ROWS]
print(svd_train.shape)

#join the svd data to train
# pd.concat(axis=1) aligns on the index and silently pads with NaN when the lengths differ,
# so fail loudly instead of training on misaligned rows.
assert len(svd_train) == len(df), (
    f"SVD/train row mismatch: {len(svd_train)} svd rows vs {len(df)} train rows"
)
df = df.reset_index()
df = pd.concat([df, svd_train], axis=1)

#join the svd data to test
if SUBMIT:
    df_test = df_test.iloc[N_TEST_ROWS_SKIPPED:]
    df_test = df_test.reset_index()
    svd_test = svd.iloc[N_SVD_TRAIN_ROWS:SVD_TEST_END_ROW].reset_index(drop = True)
    print(svd_test.shape)
    assert len(svd_test) == len(df_test), (
        f"SVD/test row mismatch: {len(svd_test)} svd rows vs {len(df_test)} test rows"
    )
    df_test = pd.concat([df_test, svd_test], axis=1)
    del svd_test

del svd, svd_train
gc.collect()
print(df.shape)
if SUBMIT:
    print(df_test.shape)

(70988, 512)
(70988, 1158)
CPU times: user 13.3 s, sys: 1.37 s, total: 14.7 s
Wall time: 26 s


In [11]:
%%time
# Read the train targets, mark every target column with a 'y_' prefix and attach them
# to the train dataframe by cell_id.

Y = pd.read_hdf(FP_CITE_TRAIN_TARGETS).add_prefix('y_')
print(f"Y shape: {str(Y.shape):14} {Y.size*4/1024/1024/1024:2.3f} GByte")
df = df.join(Y, on='cell_id', how='left')

# The targets now live inside df; release the standalone copy.
del Y
gc.collect()
df.shape

Y shape: (70988, 140)   0.037 GByte
CPU times: user 384 ms, sys: 142 ms, total: 527 ms
Wall time: 900 ms


(70988, 1298)

In [15]:
# Disabled experiment: the code below is wrapped in a triple-quoted string and is never executed.
'''
# Experimented with some manually crafted features from raw data, but decided not to include them into final 
# submission.
raw_features_path = '../input/values-from-raw/features_from_raw.ftr'
df_raw_features = pd.read_feather(raw_features_path)
df = df.merge(df_raw_features, how = 'left', on = 'cell_id')
del df_raw_features
gc.collect()
print(df.shape)
if SUBMIT:
    test_raw_features_path = '../input/values-from-raw/features_from_raw_test.ftr'
    df_raw_features_test = pd.read_feather(test_raw_features_path)
    df_test = df_test.merge(df_raw_features_test, how = 'left', on = 'cell_id')
    del df_raw_features_test
    gc.collect()
    print(df_test.shape)
'''

(70988, 1304)


In [12]:
# Disabled experiment: the code below is wrapped in a triple-quoted string and is never executed.
# Experimented with a TruncatedSVD built on raw data, but cross-validation showed that it only
# decreases the model's performance, even when only the first 32 components are taken.
'''
svd_raw_path = '../input/svd-from-raw-data/svd_raw.csv'
svd_raw = pd.read_csv(svd_raw_path)
svd_raw = svd_raw.add_prefix('ext_')
svd_train_raw = svd_raw.iloc[:70988, :32]
#svd = pd.DataFrame(svd, dtype='float16')
print(svd_train_raw.shape)

#join the svd data to train
df = pd.concat([df, svd_train_raw], axis=1)

#join the svd data to test
if SUBMIT:
    print(df_test.shape)
    svd_test_raw = svd_raw.iloc[70988:112175, :32].reset_index(drop=True)
    print(svd_test_raw.shape)
    df_test = pd.concat([df_test, svd_test_raw], axis=1)
    del svd_test_raw

del svd_raw, svd_train_raw
gc.collect()
print(df.shape)
if SUBMIT:
    print(df_test.shape)
'''

(70988, 32)
(70988, 1330)


# **Cross-validation**

In [None]:
# LightGBM settings; only referenced by the commented-out LGBMRegressor line in the
# cross-validation loop.
lightgbm_params = dict(
    learning_rate=0.05,
    max_depth=7,
    # num_leaves=200,
    min_child_samples=100,
    # colsample_bytree=0.8,
    subsample=0.6,
    seed=1,
    device="gpu",
    gpu_platform_id=0,
    gpu_device_id=0,
)

In [17]:
# Function to calculate the competition's custom metric.

def correlation_score(y_true, y_pred):
    """Return the mean row-wise Pearson correlation between `y_true` and `y_pred`.

    Parameters
    ----------
    y_true, y_pred : pandas.DataFrame (or subclass), numpy.ndarray or nested sequence
        Two-dimensional inputs of identical shape; row i of `y_true` is correlated
        with row i of `y_pred`.

    Returns
    -------
    float
        Average of the per-row correlation coefficients.

    Raises
    ------
    ValueError
        If the shapes differ or the inputs contain no rows.
    """
    # np.asarray covers DataFrame subclasses too, which `type(x) == pd.DataFrame` missed
    # (such inputs were then indexed by column label instead of by row).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(f"Shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}")
    if len(y_true) == 0:
        raise ValueError("correlation_score needs at least one row")
    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        # Off-diagonal element of the 2x2 correlation matrix of the two rows.
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / len(y_true)

In [18]:
# Switch for the cross-validation part of the notebook.
CROSS_VALIDATE = False
if CROSS_VALIDATE:
    # Metadata-derived feature columns, initialised to 0
    # ('svd_var2' is intentionally left out, as in the earlier experiments).
    for placeholder_col in ('svd_var1', 'svd_var3', 'svd_var4', 'svd_sex'):
        df[placeholder_col] = 0
    #df['svd_var2'] = 0
    df.loc[df['donor'] == 13176, 'svd_sex'] = 1

    # Model inputs: columns starting with 'ENSG' or 'svd'; targets: columns starting with 'y_'.
    x_cols = [col for col in df.columns if col.startswith('ENSG') or col.startswith('svd')]
    y_cols = [col for col in df.columns if col.startswith('y_')]

In [None]:
# Disabled: former single CatBoost configuration. The code below is wrapped in a
# triple-quoted string and is never executed.
'''
# At some point, close to competition end decided to use different catboost parameters depending on target.
cat_params = {
    "random_state": 23,
    "learning_rate" : 0.05,
    "eval_metric" : 'RMSE', 
    "max_depth" : 7,
    "verbose" : 100,
    "task_type" : 'GPU'
    }
'''

In [19]:
# Didn't use very slow and very fast in final submission.

# Settings common to every CatBoost configuration below.
_cat_params_shared = {
    "random_state": 23,
    "eval_metric": 'RMSE',
    "verbose": 100,
    "task_type": 'GPU',
}

# Each configuration adds its own learning rate / depth / tree count to the shared part.
cat_params_very_slow = dict(
    _cat_params_shared, learning_rate=0.02, max_depth=6, reg_lambda=15, n_estimators=300
)
cat_params_slow = dict(_cat_params_shared, learning_rate=0.02, max_depth=7, n_estimators=300)
cat_params_middle = dict(_cat_params_shared, learning_rate=0.03, max_depth=7, n_estimators=400)
cat_params_fast = dict(_cat_params_shared, learning_rate=0.05, max_depth=7, n_estimators=500)
# No explicit n_estimators here.
cat_params_very_fast = dict(_cat_params_shared, learning_rate=0.06, max_depth=7)

In [20]:
# Assign every target to a group according to the row-wise mean of the numeric columns
# in model_groups.csv: mean < 0.25 -> group 1, mean > 0.5 -> group 3, everything else -> 2.
model_groups_path = '../input/model-groups/model_groups.csv'
df_model_groups = pd.read_csv(model_groups_path)
row_mean = df_model_groups.mean(axis=1, numeric_only=True)
df_model_groups['mean'] = row_mean
below_low_threshold = row_mean < 0.25
above_high_threshold = row_mean > 0.5
df_model_groups['group'] = 2
df_model_groups.loc[below_low_threshold, 'group'] = 1
df_model_groups.loc[above_high_threshold, 'group'] = 3


In [21]:
# Target names per group. The grouping cell only ever assigns groups 1, 2 and 3,
# so the lists for groups 0 and 4 come out empty.
_group_id = df_model_groups['group']
_target_name = df_model_groups['columns']
y_very_slow = list(_target_name[_group_id == 0])
y_slow = list(_target_name[_group_id == 1])
y_fast = list(_target_name[_group_id == 3])
y_very_fast = list(_target_name[_group_id == 4])

In [22]:
# Used this to run cross-validation only on 5 targets of 140 - for test.
#y_cols = y_cols[:5]

# Function to perform cv split by day or by donor.
# Closer to the end of competition, cross-validated only on days.
# Also added cross-validation with evaluation set to check how well models converge.
def custom_cv_split(option, d_frame, USE_EVAL_SET=False, feature_cols=None, target_cols=None):
    """Split `d_frame` into train/validation parts, holding out one day or one donor.

    Parameters
    ----------
    option : int
        Value to hold out. Values greater than 20 are treated as donor IDs,
        all other values as days.
    d_frame : pandas.DataFrame
        Frame containing 'day', 'donor', the feature columns and the target columns.
        It is modified in place: 'svd_var1' is overwritten with 'day', and for a
        day-based split 'svd_var3' / 'svd_var4' are set to 1 for donors 31800 / 32606.
        The index is expected to be unique, because targets are looked up by label.
    USE_EVAL_SET : bool, default False
        When True, 10% of the training rows (fixed random_state=25) are split off
        as an evaluation set.
    feature_cols, target_cols : list of str, optional
        Columns used as model inputs / targets. Default to the notebook-level
        `x_cols` / `y_cols`.

    Returns
    -------
    tuple
        (Xc_tr, yc_tr, Xc_va, yc_va), or
        (Xc_tr, yc_tr, Xc_va, yc_va, Xc_tr_cv, yc_tr_cv) when USE_EVAL_SET is True.
    """
    if feature_cols is None:
        feature_cols = x_cols
    if target_cols is None:
        target_cols = y_cols

    d_frame['svd_var1'] = d_frame['day']
    if option > 20:
        cv_col = 'donor'
    else:
        d_frame.loc[d_frame['donor'] == 31800, 'svd_var3'] = 1
        d_frame.loc[d_frame['donor'] == 32606, 'svd_var4'] = 1
        cv_col = 'day'

    Xc_tr = d_frame.loc[d_frame[cv_col] != option, feature_cols]
    Xc_va = d_frame.loc[d_frame[cv_col] == option, feature_cols]
    # Returned as a copy because the caller appends prediction columns to it.
    yc_va = d_frame.loc[d_frame[cv_col] == option, target_cols].copy()

    # Targets are selected by index LABEL (.loc). The previous .iloc[Xc_tr.index] treated
    # labels as positions and was only correct for a 0..n-1 RangeIndex.
    if USE_EVAL_SET:
        Xc_tr_cv = Xc_tr.sample(frac=0.1, random_state=25)
        Xc_tr = Xc_tr.drop(Xc_tr_cv.index)
        yc_tr_cv = d_frame.loc[Xc_tr_cv.index, target_cols]
        yc_tr = d_frame.loc[Xc_tr.index, target_cols]
        return Xc_tr, yc_tr, Xc_va, yc_va, Xc_tr_cv, yc_tr_cv

    yc_tr = d_frame.loc[Xc_tr.index, target_cols]
    return Xc_tr, yc_tr, Xc_va, yc_va

In [27]:
%%time
# Cross-validation with LGBMRegressor or CatboostRegressor in a loop.
# One model is trained per target and per fold; per-target RMSE / correlation and the
# feature importances are stored in df_mse_corr / df_importances.

if CROSS_VALIDATE:
    df_mse_corr = pd.DataFrame({'columns': y_cols})
    df_importances = pd.DataFrame({'columns':x_cols})
    #cv_options = [2,3,4, 32606, 31800]
    cv_options = [2,3,4]
    # NOTE(review): score_list is created here but never filled in this cell.
    score_list = []
    # Names of the prediction columns appended to y_va, one per target.
    y_preds = ['pred_' + y for y in y_cols]
    for cv_option in cv_options:
        X_tr, y_tr, X_va, y_va = custom_cv_split(cv_option, df)
        #X_tr, y_tr, X_va, y_va, X_tr_cv, y_tr_cv = custom_cv_split(cv_option, df)
        mse_list = []
        corr_list = []
        for i, target in enumerate(y_cols):
            print(str(cv_option) + '_' + str(i) + '_' + target)
            column_name = str(cv_option) + '_' + target
            y_column_name = 'pred_' + target
            #model = lightgbm.LGBMRegressor(**lightgbm_params)
            #model = CatBoostRegressor(**cat_params)
            # Pick the CatBoost configuration according to the target's group.
            if target in y_slow:
                model = CatBoostRegressor(**cat_params_slow)
            elif target in y_fast:
                model = CatBoostRegressor(**cat_params_fast)
            else:
                model = CatBoostRegressor(**cat_params_middle)
            #model.fit(X_tr, y_tr.iloc[:,i].copy(), eval_set = [(X_tr_cv, y_tr_cv.iloc[:,i])])
            model.fit(X_tr, y_tr[target].copy())
            y_va[y_column_name] = model.predict(X_va)
            df_importances[column_name] = model.feature_importances_
            # Truth and prediction are addressed by column name. The former positional
            # `i+140` offset broke as soon as y_cols was shortened (e.g. y_cols[:5]).
            # RMSE is computed as sqrt(MSE) because mean_squared_error(squared=False)
            # is deprecated/removed in recent scikit-learn releases.
            mse_value = np.sqrt(mean_squared_error(y_va[target], y_va[y_column_name]))
            corr_value = np.corrcoef(y_va[target], y_va[y_column_name])[1, 0]
            mse_list.append(mse_value)
            corr_list.append(corr_value)
            print(str(mse_value) + '_' + str(corr_value))
        del X_tr, y_tr, X_va
        gc.collect()

        # Save the metric values for all models of this fold
        cv_option_corr = str(cv_option) + '_corr'
        cv_option_mse = str(cv_option) + '_mse'
        df_mse_corr[cv_option_corr] = corr_list
        df_mse_corr[cv_option_mse] = mse_list

        # Validate the model (mse and correlation over all target columns)
        mse = mean_squared_error(y_va[y_cols], y_va[y_preds])
        corrscore = correlation_score(y_va[y_cols], y_va[y_preds])

        print(f"Fold {cv_option}: mse = {mse:.5f}, corr =  {corrscore:.5f}")

2_0_y_CD86
0:	learn: 1.4827079	total: 29.3ms	remaining: 11.7s
100:	learn: 1.2885777	total: 2.74s	remaining: 8.11s
200:	learn: 1.2531742	total: 5.7s	remaining: 5.64s
300:	learn: 1.2320869	total: 8.3s	remaining: 2.73s
399:	learn: 1.2152226	total: 10.9s	remaining: 0us
1.090263699046933_0.28264380664840844
2_1_y_CD274
0:	learn: 0.9074529	total: 41.4ms	remaining: 12.4s
100:	learn: 0.8690662	total: 2.92s	remaining: 5.74s
200:	learn: 0.8621037	total: 5.68s	remaining: 2.8s
299:	learn: 0.8565087	total: 8.46s	remaining: 0us
0.8409932986444513_0.2773651965526991
2_2_y_CD270
0:	learn: 0.9036108	total: 21.4ms	remaining: 8.54s
100:	learn: 0.8445509	total: 2.86s	remaining: 8.47s
200:	learn: 0.8353111	total: 5.66s	remaining: 5.6s
300:	learn: 0.8272392	total: 8.78s	remaining: 2.89s
399:	learn: 0.8197892	total: 11.6s	remaining: 0us
0.8865987578819697_0.4514789673585789
2_3_y_CD155
0:	learn: 2.4160359	total: 186ms	remaining: 1m 32s
100:	learn: 1.7709730	total: 18.7s	remaining: 1m 13s
200:	learn: 1.727589



2.8556708787674023_0.7828976316532993
2_100_y_CD81
0:	learn: 2.1084167	total: 192ms	remaining: 1m 35s
100:	learn: 1.4588512	total: 18.6s	remaining: 1m 13s
200:	learn: 1.4103177	total: 35.9s	remaining: 53.3s
300:	learn: 1.3828852	total: 52.9s	remaining: 35s
400:	learn: 1.3622600	total: 1m 9s	remaining: 17.2s
499:	learn: 1.3461500	total: 1m 26s	remaining: 0us




1.712971095574104_0.7383543102567783
2_101_y_IgD
0:	learn: 1.0528159	total: 60.4ms	remaining: 18.1s
100:	learn: 1.0295667	total: 2.89s	remaining: 5.7s
200:	learn: 1.0192415	total: 5.77s	remaining: 2.84s
299:	learn: 1.0127384	total: 8.71s	remaining: 0us




1.1389600934508266_0.29368894077017693
2_102_y_CD18
0:	learn: 3.6182769	total: 180ms	remaining: 1m 29s
100:	learn: 3.0124073	total: 17.4s	remaining: 1m 8s
200:	learn: 2.9538204	total: 34s	remaining: 50.6s
300:	learn: 2.9155693	total: 50.3s	remaining: 33.3s
400:	learn: 2.8910283	total: 1m 6s	remaining: 16.4s
499:	learn: 2.8695623	total: 1m 22s	remaining: 0us




3.01243267788032_0.4968707641166644
2_103_y_CD28
0:	learn: 1.0710152	total: 29.8ms	remaining: 11.9s
100:	learn: 0.9884262	total: 2.91s	remaining: 8.61s
200:	learn: 0.9764488	total: 5.72s	remaining: 5.66s
300:	learn: 0.9672489	total: 8.83s	remaining: 2.9s
399:	learn: 0.9581569	total: 11.6s	remaining: 0us




0.9899795720259013_0.38069986174030696
2_104_y_CD38
0:	learn: 2.9834220	total: 195ms	remaining: 1m 37s
100:	learn: 1.9308224	total: 18.7s	remaining: 1m 13s
200:	learn: 1.8674442	total: 36s	remaining: 53.6s
300:	learn: 1.8241453	total: 53.3s	remaining: 35.2s
400:	learn: 1.7927538	total: 1m 10s	remaining: 17.3s
499:	learn: 1.7664522	total: 1m 26s	remaining: 0us




2.250113371237254_0.6541312915357675
2_105_y_CD127
0:	learn: 1.1006848	total: 30.1ms	remaining: 12s
100:	learn: 1.0376836	total: 2.74s	remaining: 8.11s
200:	learn: 1.0265806	total: 5.49s	remaining: 5.43s
300:	learn: 1.0176384	total: 8.24s	remaining: 2.71s
399:	learn: 1.0082241	total: 11.3s	remaining: 0us




1.0232798452217633_0.2794785565840184
2_106_y_CD45
0:	learn: 3.1536657	total: 182ms	remaining: 1m 30s
100:	learn: 2.1712725	total: 17.7s	remaining: 1m 10s
200:	learn: 2.1200310	total: 34s	remaining: 50.6s
300:	learn: 2.0963105	total: 49.9s	remaining: 33s
400:	learn: 2.0764249	total: 1m 5s	remaining: 16.3s
499:	learn: 2.0599279	total: 1m 21s	remaining: 0us




2.617082170881344_0.7369670333124014
2_107_y_CD22
0:	learn: 1.4491385	total: 31.1ms	remaining: 12.4s
100:	learn: 1.3492327	total: 2.66s	remaining: 7.88s
200:	learn: 1.3298925	total: 6.17s	remaining: 6.11s
300:	learn: 1.3168008	total: 8.82s	remaining: 2.9s
399:	learn: 1.3052182	total: 11.4s	remaining: 0us




1.5241139175911111_0.4700596384349763
2_108_y_CD71
0:	learn: 3.3815158	total: 200ms	remaining: 1m 39s
100:	learn: 1.7407079	total: 18s	remaining: 1m 11s
200:	learn: 1.6829205	total: 34.5s	remaining: 51.4s
300:	learn: 1.6526923	total: 50.2s	remaining: 33.2s
400:	learn: 1.6312991	total: 1m 6s	remaining: 16.4s
499:	learn: 1.6158841	total: 1m 21s	remaining: 0us




3.3515210232514114_0.790068305013312
2_109_y_CD26
0:	learn: 2.2139804	total: 128ms	remaining: 1m 3s
100:	learn: 1.6670178	total: 17.2s	remaining: 1m 7s
200:	learn: 1.6309312	total: 33.6s	remaining: 50s
300:	learn: 1.6075589	total: 49.3s	remaining: 32.6s
400:	learn: 1.5875839	total: 1m 5s	remaining: 16.2s
499:	learn: 1.5711923	total: 1m 21s	remaining: 0us




1.5148098203558336_0.5229622775091729
2_110_y_CD115
0:	learn: 1.6779290	total: 193ms	remaining: 1m 36s
100:	learn: 1.1719849	total: 17.8s	remaining: 1m 10s
200:	learn: 1.1517782	total: 33.7s	remaining: 50.2s
300:	learn: 1.1413298	total: 49.7s	remaining: 32.8s
400:	learn: 1.1321298	total: 1m 5s	remaining: 16.1s
499:	learn: 1.1241815	total: 1m 21s	remaining: 0us




1.1101266040613968_0.6965303780520175
2_111_y_CD63
0:	learn: 2.2336846	total: 198ms	remaining: 1m 38s
100:	learn: 1.5768430	total: 18.5s	remaining: 1m 13s
200:	learn: 1.5333274	total: 35.3s	remaining: 52.5s
300:	learn: 1.5083012	total: 51.8s	remaining: 34.3s
400:	learn: 1.4927376	total: 1m 8s	remaining: 16.8s
499:	learn: 1.4764332	total: 1m 24s	remaining: 0us




1.7528388111676416_0.7110480686531788
2_112_y_CD304
0:	learn: 1.3034430	total: 116ms	remaining: 57.9s
100:	learn: 0.9666860	total: 16s	remaining: 1m 3s
200:	learn: 0.9567434	total: 31s	remaining: 46.1s
300:	learn: 0.9499410	total: 46.2s	remaining: 30.6s
400:	learn: 0.9441367	total: 1m 1s	remaining: 15.3s
499:	learn: 0.9405320	total: 1m 17s	remaining: 0us




0.9832339946239789_0.49138357778797137
2_113_y_CD36
0:	learn: 8.1586530	total: 186ms	remaining: 1m 32s
100:	learn: 4.3276360	total: 17.2s	remaining: 1m 7s
200:	learn: 4.1579789	total: 33.6s	remaining: 50s
300:	learn: 4.0645118	total: 49.4s	remaining: 32.7s
400:	learn: 3.9802776	total: 1m 5s	remaining: 16.2s
499:	learn: 3.9206788	total: 1m 21s	remaining: 0us




3.5645205106078786_0.8356992607406163
2_114_y_CD172a
0:	learn: 1.0746544	total: 29.1ms	remaining: 11.6s
100:	learn: 0.9976203	total: 2.7s	remaining: 7.98s
200:	learn: 0.9851462	total: 5.96s	remaining: 5.9s
300:	learn: 0.9762251	total: 9s	remaining: 2.96s
399:	learn: 0.9672150	total: 11.7s	remaining: 0us




0.9883234595669934_0.30742436043361865
2_115_y_CD72
0:	learn: 3.2276633	total: 174ms	remaining: 1m 26s
100:	learn: 2.6824131	total: 16.6s	remaining: 1m 5s
200:	learn: 2.6253403	total: 32.9s	remaining: 48.9s
300:	learn: 2.6002413	total: 48.4s	remaining: 32s
400:	learn: 2.5793042	total: 1m 4s	remaining: 15.8s
499:	learn: 2.5594996	total: 1m 19s	remaining: 0us




2.726133584330411_0.41561679233046755
2_116_y_CD158
0:	learn: 1.8571701	total: 201ms	remaining: 1m 40s
100:	learn: 1.4017210	total: 17.1s	remaining: 1m 7s
200:	learn: 1.3769320	total: 33.5s	remaining: 49.8s
300:	learn: 1.3608729	total: 49.2s	remaining: 32.5s
400:	learn: 1.3477620	total: 1m 5s	remaining: 16.1s
499:	learn: 1.3382909	total: 1m 20s	remaining: 0us




1.8992064761552332_0.5734019906046625
2_117_y_CD93
0:	learn: 1.0619369	total: 31ms	remaining: 12.4s
100:	learn: 0.9987083	total: 2.82s	remaining: 8.34s
200:	learn: 0.9873106	total: 6.42s	remaining: 6.35s
300:	learn: 0.9780089	total: 9.46s	remaining: 3.11s
399:	learn: 0.9684364	total: 12.2s	remaining: 0us




0.9875606282808762_0.2387534504592793
2_118_y_CD49a
0:	learn: 1.2880517	total: 59.4ms	remaining: 23.7s
100:	learn: 1.1801306	total: 3.01s	remaining: 8.93s
200:	learn: 1.1607309	total: 5.67s	remaining: 5.61s
300:	learn: 1.1483976	total: 8.36s	remaining: 2.75s
399:	learn: 1.1372896	total: 11.1s	remaining: 0us




1.1527654660312867_0.1821415821472977
2_119_y_CD49d
0:	learn: 3.4093475	total: 188ms	remaining: 1m 33s
100:	learn: 2.4208096	total: 18.1s	remaining: 1m 11s
200:	learn: 2.3472749	total: 35.1s	remaining: 52.2s
300:	learn: 2.3127306	total: 51.4s	remaining: 34s
400:	learn: 2.2842319	total: 1m 7s	remaining: 16.8s
499:	learn: 2.2606895	total: 1m 23s	remaining: 0us




3.332235093843419_0.6280795462944881
2_120_y_CD73
0:	learn: 0.9667856	total: 21.2ms	remaining: 6.33s
100:	learn: 0.9310417	total: 2.64s	remaining: 5.2s
200:	learn: 0.9240974	total: 5.52s	remaining: 2.72s
299:	learn: 0.9191502	total: 8.87s	remaining: 0us




0.9070219415961556_0.3051014163041236
2_121_y_CD9
0:	learn: 4.6954615	total: 177ms	remaining: 1m 28s
100:	learn: 3.1373593	total: 17s	remaining: 1m 7s
200:	learn: 3.0839784	total: 32.9s	remaining: 48.9s
300:	learn: 3.0513325	total: 48.5s	remaining: 32.1s
400:	learn: 3.0210535	total: 1m 4s	remaining: 15.9s
499:	learn: 3.0000735	total: 1m 19s	remaining: 0us




3.3292713182136375_0.6402741568207561
2_122_y_TCRVa7.2
0:	learn: 0.9814801	total: 29.7ms	remaining: 8.89s
100:	learn: 0.9677578	total: 3.53s	remaining: 6.96s
200:	learn: 0.9611941	total: 6.61s	remaining: 3.25s
299:	learn: 0.9554296	total: 9.32s	remaining: 0us




0.9591990025678887_0.16500530841233124
2_123_y_TCRVd2
0:	learn: 1.0701885	total: 28ms	remaining: 8.39s
100:	learn: 1.0579097	total: 3.13s	remaining: 6.18s
200:	learn: 1.0510381	total: 5.88s	remaining: 2.9s
299:	learn: 1.0448275	total: 8.59s	remaining: 0us




1.0509185349508705_0.13437633820046685
2_124_y_LOX-1
0:	learn: 1.0167158	total: 28.8ms	remaining: 11.5s
100:	learn: 0.9487888	total: 2.83s	remaining: 8.37s
200:	learn: 0.9385312	total: 5.73s	remaining: 5.67s
300:	learn: 0.9301777	total: 8.86s	remaining: 2.91s
399:	learn: 0.9217733	total: 11.8s	remaining: 0us




0.9353955637421217_0.3625496157104319
2_125_y_CD158b
0:	learn: 0.9896427	total: 31ms	remaining: 12.4s
100:	learn: 0.9317358	total: 2.87s	remaining: 8.49s
200:	learn: 0.9213283	total: 5.96s	remaining: 5.9s
300:	learn: 0.9123023	total: 8.78s	remaining: 2.89s
399:	learn: 0.9032569	total: 11.5s	remaining: 0us




0.9353942055219535_0.35117407071187495
2_126_y_CD158e1
0:	learn: 0.9062533	total: 20.2ms	remaining: 6.03s
100:	learn: 0.8893826	total: 2.78s	remaining: 5.48s
200:	learn: 0.8837235	total: 6.14s	remaining: 3.02s
299:	learn: 0.8785300	total: 8.87s	remaining: 0us




0.8636052646210368_0.21077617823732422
2_127_y_CD142
0:	learn: 1.2963373	total: 127ms	remaining: 1m 3s
100:	learn: 1.0200214	total: 17.5s	remaining: 1m 9s
200:	learn: 0.9959354	total: 34.3s	remaining: 51s
300:	learn: 0.9800466	total: 50.8s	remaining: 33.6s
400:	learn: 0.9668266	total: 1m 7s	remaining: 16.7s
499:	learn: 0.9556766	total: 1m 23s	remaining: 0us




0.9583253759112103_0.43809168387402425
2_128_y_CD319
0:	learn: 0.8963283	total: 20.5ms	remaining: 6.12s
100:	learn: 0.8715370	total: 2.72s	remaining: 5.36s
200:	learn: 0.8660163	total: 5.62s	remaining: 2.77s
299:	learn: 0.8614349	total: 8.34s	remaining: 0us




0.8643268411246803_0.2909486169304049
2_129_y_CD352
0:	learn: 0.9582752	total: 19ms	remaining: 7.57s
100:	learn: 0.9152742	total: 2.68s	remaining: 7.93s
200:	learn: 0.9051818	total: 5.62s	remaining: 5.56s
300:	learn: 0.8975133	total: 8.28s	remaining: 2.72s
399:	learn: 0.8896769	total: 11s	remaining: 0us




0.8828997604856681_0.3693849180694268
2_130_y_CD94
0:	learn: 0.8360902	total: 29.2ms	remaining: 8.72s
100:	learn: 0.8170664	total: 2.81s	remaining: 5.53s
200:	learn: 0.8110874	total: 5.61s	remaining: 2.77s
299:	learn: 0.8059696	total: 8.65s	remaining: 0us




0.8090629891109368_0.2800290223141384
2_131_y_CD162
0:	learn: 2.9996349	total: 197ms	remaining: 1m 38s
100:	learn: 2.2884177	total: 18.4s	remaining: 1m 12s
200:	learn: 2.2225765	total: 35.7s	remaining: 53.1s
300:	learn: 2.1857601	total: 52.8s	remaining: 34.9s
400:	learn: 2.1561412	total: 1m 9s	remaining: 17.2s
499:	learn: 2.1304734	total: 1m 26s	remaining: 0us




2.768112554711377_0.6184485934248445
2_132_y_CD85j
0:	learn: 1.1328398	total: 31.6ms	remaining: 12.6s
100:	learn: 1.0091538	total: 2.88s	remaining: 8.54s
200:	learn: 0.9961968	total: 5.69s	remaining: 5.63s
300:	learn: 0.9865939	total: 8.42s	remaining: 2.77s
399:	learn: 0.9772936	total: 11.2s	remaining: 0us




1.01237236743409_0.4572455582414868
2_133_y_CD23
0:	learn: 0.7425012	total: 47.4ms	remaining: 18.9s
100:	learn: 0.7082246	total: 2.96s	remaining: 8.76s
200:	learn: 0.7009690	total: 5.71s	remaining: 5.65s
300:	learn: 0.6951742	total: 8.44s	remaining: 2.78s
399:	learn: 0.6894543	total: 11.2s	remaining: 0us




0.6816738013238328_0.3679848270804334
2_134_y_CD328
0:	learn: 1.8585479	total: 164ms	remaining: 1m 21s
100:	learn: 1.2407181	total: 16.8s	remaining: 1m 6s
200:	learn: 1.2007270	total: 32.3s	remaining: 48s
300:	learn: 1.1870716	total: 47.6s	remaining: 31.4s
400:	learn: 1.1801385	total: 1m 2s	remaining: 15.5s
499:	learn: 1.1745685	total: 1m 17s	remaining: 0us




1.4259250430697088_0.49193065271206987
2_135_y_HLA-E
0:	learn: 0.9153594	total: 21.9ms	remaining: 8.75s
100:	learn: 0.8630421	total: 3.22s	remaining: 9.53s
200:	learn: 0.8542416	total: 5.96s	remaining: 5.9s
300:	learn: 0.8469075	total: 8.72s	remaining: 2.87s
399:	learn: 0.8391472	total: 11.5s	remaining: 0us




0.9228181848172432_0.44107041302230987
2_136_y_CD82
0:	learn: 2.8532329	total: 186ms	remaining: 1m 32s
100:	learn: 1.8632098	total: 18s	remaining: 1m 11s
200:	learn: 1.7948463	total: 34.8s	remaining: 51.7s
300:	learn: 1.7635005	total: 51.2s	remaining: 33.9s
400:	learn: 1.7405663	total: 1m 7s	remaining: 16.6s
499:	learn: 1.7211166	total: 1m 23s	remaining: 0us




2.113087033357763_0.6962882984872508
2_137_y_CD101
0:	learn: 1.4442027	total: 173ms	remaining: 1m 26s
100:	learn: 1.0574472	total: 16.2s	remaining: 1m 4s
200:	learn: 1.0445689	total: 31.5s	remaining: 46.9s
300:	learn: 1.0329747	total: 46.9s	remaining: 31s
400:	learn: 1.0250575	total: 1m 2s	remaining: 15.4s
499:	learn: 1.0209238	total: 1m 17s	remaining: 0us




0.9493484172759584_0.473517989219019
2_138_y_CD88
0:	learn: 2.6802838	total: 205ms	remaining: 1m 42s
100:	learn: 1.5195631	total: 18.3s	remaining: 1m 12s
200:	learn: 1.4824864	total: 34.8s	remaining: 51.8s
300:	learn: 1.4594721	total: 50.7s	remaining: 33.5s
400:	learn: 1.4389514	total: 1m 6s	remaining: 16.5s
499:	learn: 1.4254526	total: 1m 22s	remaining: 0us




1.5752035197239032_0.804431105151144
2_139_y_CD224
0:	learn: 2.3871551	total: 31.6ms	remaining: 12.6s
100:	learn: 1.9906253	total: 3.66s	remaining: 10.8s
200:	learn: 1.9424678	total: 6.48s	remaining: 6.41s
300:	learn: 1.9133432	total: 9.29s	remaining: 3.06s
399:	learn: 1.8895090	total: 12.1s	remaining: 0us




2.051236612990146_0.642707868188739
Fold 2: mse = 2.73480, corr =  0.89204
3_0_y_CD86
0:	learn: 1.4325948	total: 26.8ms	remaining: 10.7s
100:	learn: 1.2421897	total: 2.62s	remaining: 7.75s
200:	learn: 1.2054375	total: 5.34s	remaining: 5.29s
300:	learn: 1.1847601	total: 7.87s	remaining: 2.59s
399:	learn: 1.1681811	total: 10.4s	remaining: 0us




1.1996786949625096_0.35390830097279186
3_1_y_CD274
0:	learn: 0.8644669	total: 26.4ms	remaining: 7.89s
100:	learn: 0.8261251	total: 2.63s	remaining: 5.18s
200:	learn: 0.8197614	total: 5.28s	remaining: 2.6s
299:	learn: 0.8149646	total: 7.92s	remaining: 0us




0.9338567399568065_0.23810632897638032
3_2_y_CD270
0:	learn: 0.9205563	total: 20.9ms	remaining: 8.33s
100:	learn: 0.8284982	total: 2.7s	remaining: 8s
200:	learn: 0.8191439	total: 5.38s	remaining: 5.33s
300:	learn: 0.8116649	total: 8.43s	remaining: 2.77s
399:	learn: 0.8040996	total: 11.1s	remaining: 0us




0.913109845211666_0.24261033781308927
3_3_y_CD155
0:	learn: 2.4609340	total: 29.2ms	remaining: 14.6s
100:	learn: 1.7557367	total: 3.17s	remaining: 12.5s
200:	learn: 1.7070827	total: 5.91s	remaining: 8.79s
300:	learn: 1.6738142	total: 8.6s	remaining: 5.68s
400:	learn: 1.6433149	total: 11.3s	remaining: 2.8s
499:	learn: 1.6155289	total: 14.3s	remaining: 0us




1.9614151656344914_0.6526078845301009
3_4_y_CD112
0:	learn: 1.9594940	total: 28.4ms	remaining: 14.1s
100:	learn: 1.2244162	total: 2.88s	remaining: 11.4s
200:	learn: 1.1771706	total: 5.59s	remaining: 8.32s
300:	learn: 1.1532352	total: 8.25s	remaining: 5.45s
400:	learn: 1.1329153	total: 11.6s	remaining: 2.87s
499:	learn: 1.1152063	total: 14.2s	remaining: 0us




1.471267076623945_0.7149913659800167
3_5_y_CD47
0:	learn: 3.5105283	total: 32.4ms	remaining: 16.2s
100:	learn: 2.5448983	total: 2.83s	remaining: 11.2s
200:	learn: 2.4571849	total: 5.5s	remaining: 8.18s
300:	learn: 2.4020081	total: 8.16s	remaining: 5.39s
400:	learn: 2.3585650	total: 10.8s	remaining: 2.67s
499:	learn: 2.3186200	total: 13.5s	remaining: 0us




2.639104401614465_0.6638831072784749
3_6_y_CD48
0:	learn: 5.0777937	total: 29.5ms	remaining: 14.7s
100:	learn: 2.8184411	total: 3.11s	remaining: 12.3s
200:	learn: 2.6769616	total: 6.15s	remaining: 9.15s
300:	learn: 2.5935906	total: 8.83s	remaining: 5.84s
400:	learn: 2.5315141	total: 11.9s	remaining: 2.94s
499:	learn: 2.4753306	total: 14.6s	remaining: 0us




3.0399911238045996_0.8415588481673681
3_7_y_CD40
0:	learn: 1.0540513	total: 26.4ms	remaining: 10.5s
100:	learn: 0.9472609	total: 2.66s	remaining: 7.89s
200:	learn: 0.9341834	total: 5.37s	remaining: 5.32s
300:	learn: 0.9260311	total: 7.92s	remaining: 2.6s
399:	learn: 0.9176678	total: 10.5s	remaining: 0us




0.9872341069124682_0.3315467248278458
3_8_y_CD154
0:	learn: 0.8485182	total: 21.4ms	remaining: 8.55s
100:	learn: 0.7498403	total: 2.77s	remaining: 8.19s
200:	learn: 0.7408267	total: 5.46s	remaining: 5.41s
300:	learn: 0.7340866	total: 8.35s	remaining: 2.74s
399:	learn: 0.7278402	total: 11.1s	remaining: 0us




0.8388987554726303_0.30891126633650884
3_9_y_CD52
0:	learn: 1.2543458	total: 35.6ms	remaining: 14.2s
100:	learn: 1.0406819	total: 2.79s	remaining: 8.26s
200:	learn: 1.0217470	total: 5.66s	remaining: 5.6s
300:	learn: 1.0103961	total: 8.36s	remaining: 2.75s
399:	learn: 0.9999652	total: 11s	remaining: 0us




1.1005340894818927_0.4308833439575905
3_10_y_CD3
0:	learn: 0.8364383	total: 27.3ms	remaining: 8.15s
100:	learn: 0.8232906	total: 2.65s	remaining: 5.23s
200:	learn: 0.8183787	total: 5.29s	remaining: 2.6s
299:	learn: 0.8140427	total: 8.13s	remaining: 0us




0.8649853310801748_0.11798268816680349
3_11_y_CD8
0:	learn: 1.1282937	total: 28ms	remaining: 8.36s
100:	learn: 1.1003416	total: 2.81s	remaining: 5.53s
200:	learn: 1.0914516	total: 5.55s	remaining: 2.73s
299:	learn: 1.0845466	total: 8.31s	remaining: 0us




1.199782929921593_0.1637451475932194
3_12_y_CD56
0:	learn: 1.0233136	total: 26.5ms	remaining: 7.91s
100:	learn: 0.9727196	total: 3.46s	remaining: 6.81s
200:	learn: 0.9608574	total: 6.16s	remaining: 3.03s
299:	learn: 0.9532400	total: 8.73s	remaining: 0us




1.0687768108404794_0.20970917285795665
3_13_y_CD19
0:	learn: 1.1606443	total: 32.8ms	remaining: 13.1s
100:	learn: 1.0124443	total: 2.82s	remaining: 8.35s
200:	learn: 0.9904216	total: 5.33s	remaining: 5.28s
300:	learn: 0.9806243	total: 7.85s	remaining: 2.58s
399:	learn: 0.9724117	total: 10.5s	remaining: 0us




1.0840872475069745_0.38150727016947106
3_14_y_CD33
0:	learn: 4.9354752	total: 30.6ms	remaining: 15.3s
100:	learn: 3.6085164	total: 2.9s	remaining: 11.4s
200:	learn: 3.5031688	total: 6.27s	remaining: 9.33s
300:	learn: 3.4323850	total: 9.18s	remaining: 6.07s
400:	learn: 3.3696020	total: 11.9s	remaining: 2.94s
499:	learn: 3.3110247	total: 14.6s	remaining: 0us




4.053477982061728_0.6119324819392492
3_15_y_CD11c
0:	learn: 2.9130381	total: 28.6ms	remaining: 14.3s
100:	learn: 2.1622015	total: 2.57s	remaining: 10.2s
200:	learn: 2.0834291	total: 5.11s	remaining: 7.6s
300:	learn: 2.0292960	total: 7.63s	remaining: 5.04s
400:	learn: 1.9867415	total: 10.5s	remaining: 2.6s
499:	learn: 1.9515376	total: 13.1s	remaining: 0us




1.9740435031808004_0.47252164838588173
3_16_y_HLA-A-B-C
0:	learn: 3.2314215	total: 30ms	remaining: 15s
100:	learn: 2.2728814	total: 3.08s	remaining: 12.2s
200:	learn: 2.1832745	total: 5.75s	remaining: 8.55s
300:	learn: 2.1280725	total: 8.42s	remaining: 5.57s
400:	learn: 2.0863869	total: 11.1s	remaining: 2.73s
499:	learn: 2.0483687	total: 14s	remaining: 0us




2.616659231077533_0.5939477092375225
3_17_y_CD45RA
0:	learn: 2.5554369	total: 30.6ms	remaining: 15.3s
100:	learn: 1.5345261	total: 2.78s	remaining: 11s
200:	learn: 1.4670021	total: 5.8s	remaining: 8.63s
300:	learn: 1.4255845	total: 8.48s	remaining: 5.61s
400:	learn: 1.3940447	total: 11.9s	remaining: 2.94s
499:	learn: 1.3651428	total: 14.6s	remaining: 0us




1.710639675701993_0.8054715437175336
3_18_y_CD123
0:	learn: 2.5630793	total: 30.2ms	remaining: 15.1s
100:	learn: 1.8831515	total: 2.84s	remaining: 11.2s
200:	learn: 1.8334505	total: 5.61s	remaining: 8.34s
300:	learn: 1.7950030	total: 8.61s	remaining: 5.69s
400:	learn: 1.7598621	total: 11.3s	remaining: 2.79s
499:	learn: 1.7268650	total: 14s	remaining: 0us




2.360118656279388_0.6174362722113143
3_19_y_CD7
0:	learn: 1.9652514	total: 26.6ms	remaining: 10.6s
100:	learn: 1.8676402	total: 3.25s	remaining: 9.63s
200:	learn: 1.8391494	total: 5.81s	remaining: 5.75s
300:	learn: 1.8173387	total: 8.4s	remaining: 2.76s
399:	learn: 1.7963663	total: 11.2s	remaining: 0us




2.0881427379235387_0.2573385290391745
3_20_y_CD105
0:	learn: 0.9154115	total: 26.8ms	remaining: 10.7s
100:	learn: 0.8265237	total: 2.68s	remaining: 7.94s
200:	learn: 0.8133610	total: 5.67s	remaining: 5.61s
300:	learn: 0.8048062	total: 8.34s	remaining: 2.74s
399:	learn: 0.7970588	total: 11s	remaining: 0us




0.8776325243670826_0.3331302568704448
3_21_y_CD49f
0:	learn: 2.2169404	total: 30.3ms	remaining: 15.1s
100:	learn: 1.3483728	total: 3.48s	remaining: 13.7s
200:	learn: 1.3015695	total: 6.16s	remaining: 9.16s
300:	learn: 1.2734993	total: 8.83s	remaining: 5.84s
400:	learn: 1.2490820	total: 11.7s	remaining: 2.89s
499:	learn: 1.2285830	total: 14.4s	remaining: 0us




1.487313649639139_0.7177166686082797
3_22_y_CD194
0:	learn: 1.0174667	total: 27.6ms	remaining: 8.27s
100:	learn: 1.0039302	total: 2.94s	remaining: 5.79s
200:	learn: 0.9972606	total: 5.54s	remaining: 2.73s
299:	learn: 0.9917365	total: 8.11s	remaining: 0us




1.0594389314058978_0.05975386844841544
3_23_y_CD4
0:	learn: 1.2060628	total: 29.6ms	remaining: 11.8s
100:	learn: 1.0983268	total: 2.71s	remaining: 8.04s
200:	learn: 1.0823205	total: 5.34s	remaining: 5.29s
300:	learn: 1.0709774	total: 8.01s	remaining: 2.63s
399:	learn: 1.0606009	total: 11.1s	remaining: 0us




1.2212242539942926_0.3450548403462737
3_24_y_CD44
0:	learn: 4.5867824	total: 31.2ms	remaining: 15.6s
100:	learn: 3.1824844	total: 2.85s	remaining: 11.2s
200:	learn: 3.0893627	total: 5.92s	remaining: 8.81s
300:	learn: 3.0282555	total: 8.62s	remaining: 5.7s
400:	learn: 2.9761489	total: 11.3s	remaining: 2.79s
499:	learn: 2.9271727	total: 14.1s	remaining: 0us




3.159380949348425_0.6444753034500678
3_25_y_CD14
0:	learn: 0.9610471	total: 27.4ms	remaining: 8.2s
100:	learn: 0.9200019	total: 2.56s	remaining: 5.04s
200:	learn: 0.9121440	total: 5.16s	remaining: 2.54s
299:	learn: 0.9075701	total: 7.94s	remaining: 0us




0.977000026084585_0.15723566670908926
3_26_y_CD16
0:	learn: 0.8884516	total: 29.1ms	remaining: 8.7s
100:	learn: 0.8770637	total: 2.67s	remaining: 5.27s
200:	learn: 0.8721182	total: 5.58s	remaining: 2.75s
299:	learn: 0.8675100	total: 8.15s	remaining: 0us




0.936158545404785_0.07390092651133359
3_27_y_CD25
0:	learn: 0.8705016	total: 40.8ms	remaining: 12.2s
100:	learn: 0.8477188	total: 3.34s	remaining: 6.58s
200:	learn: 0.8417636	total: 5.93s	remaining: 2.92s
299:	learn: 0.8374255	total: 8.53s	remaining: 0us




0.9337899273526886_0.1944363885670988
3_28_y_CD45RO
0:	learn: 0.8671376	total: 28.2ms	remaining: 11.2s
100:	learn: 0.7498494	total: 2.71s	remaining: 8.04s
200:	learn: 0.7405529	total: 5.39s	remaining: 5.34s
300:	learn: 0.7336528	total: 8.06s	remaining: 2.65s
399:	learn: 0.7266730	total: 11.1s	remaining: 0us




0.8530774244615961_0.287453097443471
3_29_y_CD279
0:	learn: 0.7794352	total: 27.7ms	remaining: 11s
100:	learn: 0.7251543	total: 3.26s	remaining: 9.65s
200:	learn: 0.7183073	total: 6.15s	remaining: 6.09s
300:	learn: 0.7121481	total: 8.83s	remaining: 2.9s
399:	learn: 0.7059432	total: 11.5s	remaining: 0us




0.8066504544935111_0.2761261039208585
3_30_y_TIGIT
0:	learn: 0.8177205	total: 22.1ms	remaining: 6.62s
100:	learn: 0.7754651	total: 2.74s	remaining: 5.4s
200:	learn: 0.7692347	total: 5.48s	remaining: 2.7s
299:	learn: 0.7644484	total: 8.17s	remaining: 0us




0.8574955659617334_0.21615181784926277
3_31_y_Mouse-IgG1
0:	learn: 0.8525562	total: 20.3ms	remaining: 6.06s
100:	learn: 0.8071755	total: 3.07s	remaining: 6.05s
200:	learn: 0.8005115	total: 6.01s	remaining: 2.96s
299:	learn: 0.7955522	total: 9.04s	remaining: 0us




0.8601104429447857_0.2038156294873451
3_32_y_Mouse-IgG2a
0:	learn: 0.7662086	total: 20.8ms	remaining: 6.22s
100:	learn: 0.7388499	total: 2.61s	remaining: 5.14s
200:	learn: 0.7339496	total: 5.64s	remaining: 2.78s
299:	learn: 0.7296645	total: 8.3s	remaining: 0us




0.8148255919468964_0.14662779815691676
3_33_y_Mouse-IgG2b
0:	learn: 0.7724538	total: 20.2ms	remaining: 6.03s
100:	learn: 0.7504497	total: 2.99s	remaining: 5.88s
200:	learn: 0.7454946	total: 5.67s	remaining: 2.79s
299:	learn: 0.7412662	total: 9.02s	remaining: 0us




0.8101193443200871_0.14259588102617196
3_34_y_Rat-IgG2b
0:	learn: 0.9286297	total: 22ms	remaining: 6.58s
100:	learn: 0.9058279	total: 2.7s	remaining: 5.33s
200:	learn: 0.9000281	total: 5.34s	remaining: 2.63s
299:	learn: 0.8947218	total: 7.98s	remaining: 0us




0.9559819724395239_0.08891732034775565
3_35_y_CD20
0:	learn: 0.8733606	total: 25.7ms	remaining: 7.69s
100:	learn: 0.8366055	total: 2.52s	remaining: 4.96s
200:	learn: 0.8287343	total: 5s	remaining: 2.46s
299:	learn: 0.8241648	total: 7.76s	remaining: 0us




0.8906677719866696_0.19985215374918594
3_36_y_CD335
0:	learn: 1.0318265	total: 29.4ms	remaining: 11.7s
100:	learn: 0.9128222	total: 2.77s	remaining: 8.19s
200:	learn: 0.9023722	total: 5.81s	remaining: 5.75s
300:	learn: 0.8942429	total: 8.52s	remaining: 2.8s
399:	learn: 0.8857875	total: 11.2s	remaining: 0us




1.0052516249545513_0.36591741279543566
3_37_y_CD31
0:	learn: 6.2399909	total: 29.8ms	remaining: 14.9s
100:	learn: 4.2575611	total: 2.81s	remaining: 11.1s
200:	learn: 4.1174767	total: 5.53s	remaining: 8.23s
300:	learn: 4.0309750	total: 8.24s	remaining: 5.45s
400:	learn: 3.9565038	total: 11.2s	remaining: 2.77s
499:	learn: 3.8898792	total: 14.7s	remaining: 0us




4.347840106695576_0.6591447251335629
3_38_y_Podoplanin
0:	learn: 0.9344979	total: 22.5ms	remaining: 8.99s
100:	learn: 0.8337442	total: 3.14s	remaining: 9.3s
200:	learn: 0.8251478	total: 5.83s	remaining: 5.78s
300:	learn: 0.8183346	total: 8.52s	remaining: 2.8s


KeyboardInterrupt: 

In [None]:
# Export the cross-validation predictions for a second-level model.
# Wasn't used in final submission.

if CROSS_VALIDATE & SUBMIT:
    # Remove every column listed in y_cols before exporting.
    # Iterate over a snapshot of the labels because columns are deleted inside the loop.
    # (The loop variable used to be `cols` while the body tested `col`, so the check
    # ran against a stale/undefined name instead of the current column.)
    for col in list(df_cv_pred.columns):
        if col in y_cols:
            del df_cv_pred[col]
    gc.collect()
    # reset_index() turns the current index into regular column(s) ahead of to_feather().
    df_cv_pred = df_cv_pred.reset_index()
    export_file_results = 'results_cite_cv.ftr'
    df_cv_pred.to_feather(export_file_results)
    # The frame is no longer needed once it is written to disk.
    del df_cv_pred


In [None]:
# When CROSS_VALIDATE is set, remove the listed helper columns from df (only those present).
if CROSS_VALIDATE:
    aux_columns = ('svd_var1', 'svd_var2', 'svd_var3', 'svd_var4', 'svd_sex')
    present_columns = [name for name in aux_columns if name in df.columns]
    df.drop(columns=present_columns, inplace=True)

gc.collect()

In [None]:
# Write df_importances to a Feather file when both CROSS_VALIDATE and SUBMIT are set,
# then drop the frame from the namespace.

if CROSS_VALIDATE & SUBMIT:
    export_file_importances = 'imps_cite.ftr'
    df_importances.to_feather(path=export_file_importances)
    del df_importances

gc.collect()

# **Prediction**

In [None]:
# Create features from metadata.
# Note: the get_dummies function can't be used here, because one of the days and one of the donors appear only in the test dataset.

def add_metadata_features(d_frame):
    """Add donor indicator columns and a day column to ``d_frame``.

    For each of the donors 13176, 31800 and 32606 a column named
    ``svd_x_donor_<id>`` is written, holding 1 where ``d_frame['donor']``
    equals that id and 0 elsewhere. The ``day`` column is copied into
    ``svd_x_day``.

    The frame is modified in place and the same object is returned.
    """
    for donor_id in (13176, 31800, 32606):
        flag_column = 'svd_x_donor_' + str(donor_id)
        # Start from all zeros, then flag the rows that belong to this donor.
        d_frame[flag_column] = 0
        is_donor = d_frame['donor'] == donor_id
        d_frame.loc[is_donor, flag_column] = 1
    d_frame['svd_x_day'] = d_frame['day']
    return d_frame


In [None]:
# Predict. Ran a few times with cross-validation on 10% of the data to check whether it makes
# sense to add more iterations.
# NOTE: this cell is disabled: everything below is wrapped in a triple-quoted string, so none
# of it executes.
# NOTE(review): inside the disabled code, `best_cv_result.append(correlation_value)` comes
# before `correlation_value` is assigned for the current column; swap those two lines before
# re-enabling it.
'''
if SUBMIT:
    df = add_metadata_features(df)
    df_test = add_metadata_features(df_test)
    x_cols = [col for col in list(df.columns) if (col.startswith('ENSG')) | (col.startswith('svd'))]
    y_cols = [col for col in list(df.columns) if (col.startswith('y_'))]
    best_iteration = []
    best_cv_result = []
    #y_cols = y_cols[:5]
    df_cv = df.sample(frac=0.1, random_state=25)
    df= df.drop(df_cv.index)
    X = df[x_cols].values
    Y = df[y_cols].values
    X_cv = df_cv[x_cols].values
    Y_cv = df_cv[y_cols].values
    for i in range(len(y_cols)):
        print('Training_column: ' + str(i))
        #model = lightgbm.LGBMRegressor(n_estimators=n_estimators, **lightgbm_params)
        #model = CatBoostRegressor(n_estimators=n_estimators, **cat_params)
        if y_cols[i] in y_very_slow:
            model = CatBoostRegressor(**cat_params_very_slow)
        elif y_cols[i] in y_slow:
            model = CatBoostRegressor(**cat_params_slow)
        elif y_cols[i] in y_fast:
            model = CatBoostRegressor(**cat_params_fast)
        elif y_cols[i] in y_very_fast:
            model = CatBoostRegressor(**cat_params_very_fast)
        else:
            model = CatBoostRegressor(**cat_params_middle)
        model.fit(X, Y[:,i].copy(), eval_set=[(X_cv, Y_cv[:,i].copy())])
        best_iteration.append(model.best_iteration_)
        best_cv_result.append(correlation_value)
        correlation_value = np.corrcoef(model.predict(X_cv),  Y_cv[:,i])[1, 0]
        print(str(correlation_value))
'''

In [None]:
# Make predictions: fit one CatBoost model per target column on the training frame and
# store its predictions for the test frame in df_test under the target's column name.
if SUBMIT:
    df = add_metadata_features(df)
    df_test = add_metadata_features(df_test)

    # Inputs are the columns whose names start with 'ENSG' or 'svd'; targets start with 'y_'.
    x_cols = [name for name in df.columns if name.startswith(('ENSG', 'svd'))]
    y_cols = [name for name in df.columns if name.startswith('y_')]

    X = df[x_cols].values
    Y = df[y_cols].values
    Xt = df_test[x_cols].values

    for i, col_name in enumerate(y_cols):
        print('Training_column: ' + str(i))
        # Pick the parameter set by the group the target is listed in; targets in none of
        # the groups use the "middle" parameters.
        if col_name in y_very_slow:
            params = cat_params_very_slow
        elif col_name in y_slow:
            params = cat_params_slow
        elif col_name in y_fast:
            params = cat_params_fast
        elif col_name in y_very_fast:
            params = cat_params_very_fast
        else:
            params = cat_params_middle
        model = CatBoostRegressor(**params)
        # Pass a copy of the i-th target column to fit().
        model.fit(X, Y[:, i].copy())
        df_test[col_name] = model.predict(Xt)


In [None]:
# Release the training data and strip the model-input columns from df_test,
# leaving every column that is not listed in x_cols.
if SUBMIT:
    del df, X, Y, Xt
    feature_columns = [name for name in df_test.columns if name in x_cols]
    df_test.drop(columns=feature_columns, inplace=True)
    gc.collect()

In [None]:
# Write df_test to a Feather file when SUBMIT is set.
# NOTE(review): the cross-validation export calls reset_index() before to_feather();
# this cell does not. Confirm df_test already has a default index.
if SUBMIT:
    export_file_results = 'results_cite.ftr'
    df_test.to_feather(path=export_file_results)