In [2]:
import numpy as np 
import pandas as pd 
import plotly.express as px
from matplotlib import pyplot as plt


import seaborn as sns
import numpy.matlib
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator
from scipy import stats
from scipy.stats import norm
from joblib import Parallel, delayed
import shutil
import glob
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from sklearn.metrics import mean_squared_error
np.random.seed(0)
import os
import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from scipy.stats import norm, skew
# Load the engineered feature table plus the two GloVe feature tables and
# place them side by side (column-wise concat, rows matched on the index).
all_data = pd.read_csv('../data/feature_data_530.csv')
glove_tags = pd.read_csv('../data/alltags_feature.csv')
glove_title = pd.read_csv('../data/title_feature.csv')
all_data = pd.concat([all_data, glove_tags, glove_title], axis=1)

# Columns that are screened for skewness.
columns = [
    'Title_len', 'Title_number', 'Alltags_len', 'Alltags_number', 'photo_count',
    'totalTags', 'totalGeotagged', 'totalFaves', 'totalInGroup', 'photoCount',
    'meanView', 'meanTags', 'meanFaves', 'followerCount', 'followingCount',
]
skew_features = all_data[columns].apply(skew).sort_values(ascending=False)
high_skew = skew_features[skew_features.abs() > 0.75]
skew_index = high_skew.index
# log1p-transform every screened column whose |skewness| exceeds 0.75.
for col in skew_index:
    all_data[col] = np.log1p(all_data[col])

# Columns removed before modelling.
# NOTE(review): user_fe_* is dropped for indices 0..398 but loc_fe_* for 0..399 —
# confirm the asymmetry is intended.
useless_columns = (
    ['Pid', 'mean_label']
    + ['user_fe_{}'.format(i) for i in range(399)]
    + ['loc_fe_{}'.format(i) for i in range(400)]
)
all_data = all_data.drop(columns=useless_columns)

# Rows with train_type == -1 go to the submission split, all others to training.
# Both splits still carry 'train_type' and 'label'; only all_data loses them below.
is_submit = all_data['train_type'] == -1
train_all_data = all_data[~is_submit].reset_index(drop=True)
submit_all_data = all_data[is_submit].reset_index(drop=True)

# Despite the name, these are the NON-feature columns (split flag and target).
feature_columns = ['train_type', 'label']
all_data = all_data.drop(columns=feature_columns)


In [3]:
# Separate each split into its target column and its model inputs.
train_label_df, submit_label_df = train_all_data[['label']], submit_all_data[['label']]
train_feature_df = train_all_data.drop(columns=feature_columns)
submit_feature_df = submit_all_data.drop(columns=feature_columns)

# Sanity check: row counts of both splits and the shared feature count.
n_features = len(train_feature_df.columns)
print(len(train_feature_df), len(submit_feature_df), n_features)
print(len(train_label_df), len(submit_label_df), n_features)

305613 180581 682
305613 180581 682


In [4]:
# Columns fed to the model as categorical inputs; every other column of
# all_data is treated as numerical.
categories_columns = ['Uid', 'Category', 'Subcategory', 'Concept', 'Mediatype', 'hour', 'day', 'weekday', 'week_hour', 'year_weekday','Geoaccuracy', 'ispro' , 'Ispublic']
CAT_COLS = list(categories_columns)
NUM_COLS = [c for c in all_data.columns if c not in CAT_COLS]
FEATURES = CAT_COLS + NUM_COLS

# Label-encode each categorical column on the training split, then map the
# submission split through the same vocabulary.
encoders = {}
for cat_col in CAT_COLS:
    label_enc = LabelEncoder()
    train_feature_df[cat_col] = label_enc.fit_transform(train_feature_df[cat_col])
    encoders[cat_col] = label_enc

    le_dict = dict(zip(label_enc.classes_, label_enc.transform(label_enc.classes_)))
    # Values unseen during fitting fall back to the "low_frequency" bucket when
    # the column has one, otherwise to the most frequent training code.
    if "low_frequency" in le_dict:
        default_val = le_dict["low_frequency"]
    else:
        default_val = train_feature_df[cat_col].mode().values[0]
    submit_feature_df[cat_col] = submit_feature_df[cat_col].map(
        lambda raw: le_dict.get(raw, default_val)
    )

# Keep submission-set numerical values inside the range observed in training.
for num_col in NUM_COLS:
    train_column = train_feature_df[num_col]
    submit_feature_df[num_col] = submit_feature_df[num_col].clip(
        lower=train_column.min(), upper=train_column.max()
    )





In [5]:
# Embedding layout for the categorical inputs.
#   cat_dims : number of distinct encoded values per categorical column (training split)
#   cat_idxs : position of each categorical column inside FEATURES
cat_dims = train_feature_df[CAT_COLS].nunique().to_list()
cat_idxs = [FEATURES.index(cat_col) for cat_col in CAT_COLS]
# Embedding width grows with log(cardinality). A single-valued column would give
# ceil(log(1)) == 0, i.e. a zero-width embedding, so the width is floored at 1.
cat_emb_dims = np.maximum(np.ceil(np.log(cat_dims)), 1).astype(int).tolist()

# Dense matrices in FEATURES column order, plus the training target vector.
X = train_feature_df[FEATURES].values
y = train_label_df['label'].values

X_test = submit_feature_df[FEATURES].values

In [6]:
from pytorch_tabnet.pretraining import TabNetPretrainer
import os

# Expose only GPUs 2 and 3 to this process.
# NOTE(review): torch is imported earlier in the notebook; this presumably only
# takes effect if CUDA has not been initialised yet — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'

# TabNet architecture / optimisation settings; the supervised training cell
# reuses these names. Trailing notes list values tried previously.
N_D = 64            # previously 32
N_A = 64            # previously 32
N_INDEP = 1         # previously 2
N_SHARED = 1        # previously 2
N_STEPS = 3         # previously 2
MASK_TYPE = "sparsemax"
GAMMA = 1.2
BS = 85600
MAX_EPOCH = 100
PRETRAIN = True     # False skips the unsupervised pre-training below

if PRETRAIN:
    pretrain_params = {
        "n_d": N_D,
        "n_a": N_A,
        "n_steps": N_STEPS,
        "n_independent": N_INDEP,
        "n_shared": N_SHARED,
        "device_name": 'cuda',
        "cat_idxs": cat_idxs,
        "cat_dims": cat_dims,
        "cat_emb_dim": cat_emb_dims,
        "gamma": GAMMA,
        "lambda_sparse": 0.,
        "optimizer_fn": torch.optim.Adam,
        "optimizer_params": {"lr": 2e-2},
        "mask_type": MASK_TYPE,
        "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
        "scheduler_params": {"mode": "min", "patience": 3, "min_lr": 1e-5, "factor": 0.5},
        "verbose": 1,
    }
    pretrainer = TabNetPretrainer(**pretrain_params)

    # The pretrainer is fit on the submission features (X_test) and monitored
    # on the training features (X); no labels are passed.
    pretrainer.fit(
        X_train=X_test,
        eval_set=[X],
        max_epochs=MAX_EPOCH,
        patience=25,
        batch_size=BS,
        virtual_batch_size=BS,
        num_workers=1,
        drop_last=True,
        pretraining_ratio=0.5,  # the bigger the ratio, the harder the reconstruction task
    )



epoch 0  | loss: 9761.31812| val_0_unsup_loss_numpy: 435.54290771484375|  0:00:13s
epoch 1  | loss: 3228.27576| val_0_unsup_loss_numpy: 239.22085571289062|  0:00:26s
epoch 2  | loss: 1578.97198| val_0_unsup_loss_numpy: 42.75197982788086|  0:00:39s
epoch 3  | loss: 1016.66782| val_0_unsup_loss_numpy: 27.963619232177734|  0:00:51s
epoch 4  | loss: 602.55606| val_0_unsup_loss_numpy: 21.5297794342041|  0:01:08s
epoch 5  | loss: 353.57324| val_0_unsup_loss_numpy: 17.94179916381836|  0:01:22s
epoch 6  | loss: 204.92841| val_0_unsup_loss_numpy: 14.631319999694824|  0:01:37s
epoch 7  | loss: 120.13941| val_0_unsup_loss_numpy: 13.140230178833008|  0:01:49s
epoch 8  | loss: 96.00509| val_0_unsup_loss_numpy: 10.76550006866455|  0:02:02s
epoch 9  | loss: 58.71519| val_0_unsup_loss_numpy: 8.765219688415527|  0:02:15s
epoch 10 | loss: 38.74199| val_0_unsup_loss_numpy: 6.794950008392334|  0:02:27s
epoch 11 | loss: 38.02461| val_0_unsup_loss_numpy: 5.68025016784668|  0:02:40s
epoch 12 | loss: 26.5178 



In [7]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error
from scipy import stats

# Supervised training settings.
BS = 85600            # previously 2048
MAX_EPOCH = 100
LAMBDA_SPARSE = 1e-5
LR = 1e-1             # previously 5e-2
N_SPLITS = 5
NB_FOLDS = 5          # number of folds actually trained; at most N_SPLITS

kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)
submit_proba = []     # one flat array of submission-set predictions per fold

fold_nb = 1
for train_index, valid_index in kfold.split(X, y):
    # Targets are reshaped into single-column 2-D arrays before fitting.
    X_train, y_train = X[train_index], y[train_index].reshape(-1, 1)
    X_valid, y_valid = X[valid_index], y[valid_index].reshape(-1, 1)

    # Rebuilt on every fold so each model gets fresh parameter dictionaries.
    tabnet_params = {
        "n_d": N_D,
        "n_a": N_A,
        "n_steps": N_STEPS,
        "gamma": GAMMA,
        "n_independent": N_INDEP,
        "n_shared": N_SHARED,
        "lambda_sparse": LAMBDA_SPARSE,
        "seed": 0,
        "cat_idxs": cat_idxs,
        "cat_dims": cat_dims,
        "cat_emb_dim": cat_emb_dims,
        "mask_type": MASK_TYPE,
        "device_name": 'auto',
        "optimizer_fn": torch.optim.Adam,
        "optimizer_params": {"lr": LR, "weight_decay": 1e-5},
        "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
        "scheduler_params": {
            "mode": 'min',
            "factor": 0.5,
            "patience": 3,
            "is_batch_level": False,
        },
        "verbose": 1,
    }
    model = TabNetRegressor(**tabnet_params)

    model.fit(
        X_train=X_train,
        y_train=y_train,
        # Start from the pretrainer when pre-training is enabled.
        from_unsupervised=pretrainer if PRETRAIN else None,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=["train", "valid"],
        eval_metric=["mae"],
        batch_size=BS,
        virtual_batch_size=256,
        max_epochs=MAX_EPOCH,
        drop_last=True,
        pin_memory=True,
        patience=10,
    )

    # Hold-out metrics for this fold.
    valid_pred = model.predict(X_valid).reshape(-1)
    valid_mse = mean_squared_error(y_valid, valid_pred)
    valid_mae = mean_absolute_error(y_valid, valid_pred)
    valid_src = stats.spearmanr(y_valid, valid_pred)[0]
    print("MSE: %.4f, MAE: %.4f, SRC: %.4f"%(valid_mse, valid_mae, valid_src))

    # Submission-set predictions of this fold's model.
    test_preds = model.predict(X_test)
    submit_proba.append(test_preds.reshape(-1))

    # Stop once NB_FOLDS folds have been trained.
    fold_nb += 1
    if fold_nb > NB_FOLDS:
        break



epoch 0  | loss: 67.83841| train_mae: 3.20218 | valid_mae: 3.21709 |  0:00:07s
epoch 1  | loss: 11.18268| train_mae: 2.2692  | valid_mae: 2.26563 |  0:00:13s
epoch 2  | loss: 7.17016 | train_mae: 1.96228 | valid_mae: 1.96235 |  0:00:20s
epoch 3  | loss: 5.97309 | train_mae: 1.84325 | valid_mae: 1.83898 |  0:00:27s
epoch 4  | loss: 5.33999 | train_mae: 2.02426 | valid_mae: 2.02601 |  0:00:34s
epoch 5  | loss: 4.60857 | train_mae: 1.8602  | valid_mae: 1.86103 |  0:00:41s
epoch 6  | loss: 4.54146 | train_mae: 2.16754 | valid_mae: 2.15888 |  0:00:47s
epoch 7  | loss: 4.87237 | train_mae: 2.02811 | valid_mae: 2.02132 |  0:00:54s
epoch 8  | loss: 4.15161 | train_mae: 1.79249 | valid_mae: 1.79064 |  0:01:01s
epoch 9  | loss: 4.10211 | train_mae: 2.12655 | valid_mae: 2.1242  |  0:01:08s
epoch 10 | loss: 3.75107 | train_mae: 2.59462 | valid_mae: 2.59278 |  0:01:15s
epoch 11 | loss: 3.80192 | train_mae: 2.09271 | valid_mae: 2.09426 |  0:01:21s
epoch 12 | loss: 3.43539 | train_mae: 1.68802 | vali



MSE: 4.6632, MAE: 1.6952, SRC: 0.5214




epoch 0  | loss: 67.3116 | train_mae: 3.53936 | valid_mae: 3.53779 |  0:00:06s
epoch 1  | loss: 11.00432| train_mae: 2.38244 | valid_mae: 2.38545 |  0:00:13s
epoch 2  | loss: 7.07907 | train_mae: 2.06369 | valid_mae: 2.06375 |  0:00:20s
epoch 3  | loss: 6.28045 | train_mae: 1.95002 | valid_mae: 1.95237 |  0:00:27s
epoch 4  | loss: 6.38232 | train_mae: 1.84318 | valid_mae: 1.84882 |  0:00:34s
epoch 5  | loss: 5.96901 | train_mae: 2.70183 | valid_mae: 2.70496 |  0:00:41s
epoch 6  | loss: 7.41409 | train_mae: 1.87729 | valid_mae: 1.88523 |  0:00:47s
epoch 7  | loss: 5.58371 | train_mae: 2.01861 | valid_mae: 2.02211 |  0:00:54s
epoch 8  | loss: 8.63267 | train_mae: 1.89587 | valid_mae: 1.89541 |  0:01:01s
epoch 9  | loss: 5.49021 | train_mae: 2.48204 | valid_mae: 2.47835 |  0:01:08s
epoch 10 | loss: 4.24653 | train_mae: 2.32368 | valid_mae: 2.3259  |  0:01:15s
epoch 11 | loss: 4.1698  | train_mae: 1.72278 | valid_mae: 1.72645 |  0:01:22s
epoch 12 | loss: 3.80027 | train_mae: 1.70526 | vali



MSE: 1.1268, MAE: 0.7350, SRC: 0.9112




epoch 0  | loss: 65.8426 | train_mae: 3.02324 | valid_mae: 3.01568 |  0:00:06s
epoch 1  | loss: 11.15848| train_mae: 2.20465 | valid_mae: 2.21254 |  0:00:13s
epoch 2  | loss: 7.2657  | train_mae: 1.93352 | valid_mae: 1.93892 |  0:00:23s
epoch 3  | loss: 5.81263 | train_mae: 1.98406 | valid_mae: 1.97649 |  0:00:30s
epoch 4  | loss: 5.25038 | train_mae: 1.87929 | valid_mae: 1.87242 |  0:00:37s
epoch 5  | loss: 4.50471 | train_mae: 2.09199 | valid_mae: 2.08674 |  0:00:43s
epoch 6  | loss: 3.93789 | train_mae: 2.41807 | valid_mae: 2.4147  |  0:00:50s
epoch 7  | loss: 3.58211 | train_mae: 2.18575 | valid_mae: 2.18228 |  0:00:57s
epoch 8  | loss: 3.40522 | train_mae: 2.59608 | valid_mae: 2.58829 |  0:01:04s
epoch 9  | loss: 3.03056 | train_mae: 2.04429 | valid_mae: 2.0535  |  0:01:10s
epoch 10 | loss: 2.84695 | train_mae: 2.17895 | valid_mae: 2.19415 |  0:01:17s
epoch 11 | loss: 2.70511 | train_mae: 1.74402 | valid_mae: 1.76291 |  0:01:24s
epoch 12 | loss: 2.43612 | train_mae: 2.24966 | vali



MSE: 1.1750, MAE: 0.7577, SRC: 0.9025




epoch 0  | loss: 66.71686| train_mae: 3.27632 | valid_mae: 3.27941 |  0:00:06s
epoch 1  | loss: 11.0623 | train_mae: 2.0555  | valid_mae: 2.0461  |  0:00:13s
epoch 2  | loss: 7.10822 | train_mae: 1.94152 | valid_mae: 1.94256 |  0:00:20s
epoch 3  | loss: 7.89374 | train_mae: 2.5224  | valid_mae: 2.50989 |  0:00:27s
epoch 4  | loss: 5.58036 | train_mae: 1.7959  | valid_mae: 1.79164 |  0:00:34s
epoch 5  | loss: 5.24072 | train_mae: 1.75531 | valid_mae: 1.74887 |  0:00:41s
epoch 6  | loss: 4.59979 | train_mae: 2.39396 | valid_mae: 2.3852  |  0:00:47s
epoch 7  | loss: 4.12083 | train_mae: 2.19642 | valid_mae: 2.18984 |  0:00:54s
epoch 8  | loss: 3.71537 | train_mae: 2.603   | valid_mae: 2.59615 |  0:01:01s
epoch 9  | loss: 3.34599 | train_mae: 2.0719  | valid_mae: 2.06753 |  0:01:08s
epoch 10 | loss: 2.87115 | train_mae: 2.2732  | valid_mae: 2.26992 |  0:01:14s
epoch 11 | loss: 2.55452 | train_mae: 2.05176 | valid_mae: 2.05198 |  0:01:21s
epoch 12 | loss: 2.29339 | train_mae: 2.14654 | vali



MSE: 4.9217, MAE: 1.7489, SRC: 0.4106




epoch 0  | loss: 66.70995| train_mae: 4.08361 | valid_mae: 4.08767 |  0:00:06s
epoch 1  | loss: 11.05002| train_mae: 2.20612 | valid_mae: 2.20635 |  0:00:14s
epoch 2  | loss: 7.96812 | train_mae: 2.21974 | valid_mae: 2.20939 |  0:00:20s
epoch 3  | loss: 8.0157  | train_mae: 2.00669 | valid_mae: 2.00161 |  0:00:27s
epoch 4  | loss: 6.59382 | train_mae: 2.66987 | valid_mae: 2.68104 |  0:00:34s
epoch 5  | loss: 7.55469 | train_mae: 1.83114 | valid_mae: 1.83376 |  0:00:41s
epoch 6  | loss: 5.13571 | train_mae: 1.98779 | valid_mae: 1.98939 |  0:00:48s
epoch 7  | loss: 4.6001  | train_mae: 1.76128 | valid_mae: 1.76463 |  0:00:55s
epoch 8  | loss: 4.10591 | train_mae: 1.82136 | valid_mae: 1.82445 |  0:01:01s
epoch 9  | loss: 3.60588 | train_mae: 1.75075 | valid_mae: 1.75699 |  0:01:09s
epoch 10 | loss: 3.1282  | train_mae: 1.64606 | valid_mae: 1.66018 |  0:01:16s
epoch 11 | loss: 2.60313 | train_mae: 1.63031 | valid_mae: 1.65217 |  0:01:22s
epoch 12 | loss: 2.19352 | train_mae: 1.56184 | vali



MSE: 1.7271, MAE: 0.9523, SRC: 0.8549


In [8]:
# Display the per-fold submission predictions gathered in the list `submit_proba`.
submit_proba

[array([ 8.489922, 10.063567, 10.48105 , ...,  9.16613 ,  9.477614,
         8.815059], dtype=float32),
 array([ 9.24217  , 12.742173 , 11.854801 , ...,  7.0721893,  7.7995334,
         5.276364 ], dtype=float32),
 array([ 8.973942 , 10.998484 , 10.673437 , ...,  7.9875584,  9.080473 ,
        10.133011 ], dtype=float32),
 array([6.2696753, 9.154137 , 9.3088255, ..., 8.560579 , 9.097906 ,
        8.174707 ], dtype=float32),
 array([ 7.6893454, 11.436663 , 11.60103  , ...,  7.963258 ,  7.764239 ,
         8.322953 ], dtype=float32)]

In [9]:
# Stack the per-fold prediction arrays into a DataFrame (one row per array in
# `submit_proba`) and display it.
submit_proba_df = pd.DataFrame(submit_proba)
submit_proba_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,180571,180572,180573,180574,180575,180576,180577,180578,180579,180580
0,8.489922,10.063567,10.48105,5.970233,5.970233,5.970233,5.897909,5.922754,6.012195,6.022112,...,9.605143,5.260343,9.106287,5.419328,5.477552,9.1082,5.731314,9.16613,9.477614,8.815059
1,9.24217,12.742173,11.854801,6.946651,6.946651,6.946651,6.951292,7.11822,6.998113,6.997842,...,9.631916,7.250736,4.211897,3.483494,8.690977,7.022602,5.709376,7.072189,7.799533,5.276364
2,8.973942,10.998484,10.673437,7.474725,7.474725,7.474725,7.573672,7.546614,7.459363,7.459363,...,7.495007,7.576668,8.202629,5.410357,7.181006,8.593084,7.400333,7.987558,9.080473,10.133011
3,6.269675,9.154137,9.308825,6.172396,6.172396,6.172396,6.950052,6.031546,6.521904,5.989501,...,7.845787,5.868503,8.546341,6.176498,6.311522,8.500408,5.672679,8.560579,9.097906,8.174707
4,7.689345,11.436663,11.60103,7.344453,7.344453,7.344453,7.399371,7.521214,7.598316,7.615616,...,8.084208,6.06668,7.264904,5.262511,9.25678,8.031136,6.213771,7.963258,7.764239,8.322953


In [13]:
# Display the per-fold prediction table again.
# NOTE(review): the saved output of this cell (In [13]) differs from the In [9]
# display of the same name, so it appears to come from a different run — confirm.
submit_proba_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,180571,180572,180573,180574,180575,180576,180577,180578,180579,180580
0,11.735038,11.138809,11.652602,6.768065,6.768065,6.768065,6.765581,6.587955,6.426687,6.693336,...,3.283219,1.82619,2.635335,5.520903,6.485078,6.168549,4.931555,6.509181,5.962986,3.899653
1,9.328691,12.250772,11.780432,7.161756,7.161756,7.161756,7.007789,7.063338,6.860058,7.010128,...,7.014502,6.575831,5.914214,4.218393,8.248645,5.947807,6.504625,6.071412,6.409212,7.973583
2,6.110669,11.153367,10.340147,6.090604,6.090604,6.090604,6.255491,6.101233,6.266757,6.502548,...,4.023444,1.867095,2.946946,6.652179,8.638971,5.207249,5.992674,6.460787,6.175128,3.487654
3,6.604742,11.016546,13.976315,6.619453,6.619453,6.619453,6.699306,6.616934,6.614456,6.626598,...,5.642315,3.230766,3.285082,5.520288,11.582042,5.935596,8.202895,4.673194,5.541387,3.573328
4,11.404734,9.483094,8.322335,6.653327,6.653327,6.653327,6.845078,6.719975,6.579948,6.675615,...,7.835574,7.401578,6.62749,5.588782,10.793053,5.367823,8.223528,6.927838,5.09791,7.506988


In [10]:
from scipy import stats
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error

# Pull rows 0..4 of the prediction table out as plain NumPy arrays.
r0, r1, r2, r3, r4 = (np.array(submit_proba_df.loc[fold]) for fold in range(5))

In [12]:
# Write the predictions of rows 1 and 2 to their own CSV files.
for fold_preds, csv_path in ((r1, './TabNet1.csv'), (r2, './TabNet2.csv')):
    pd.DataFrame(fold_preds).to_csv(csv_path, header=True, index=None)

In [11]:
## r1 and r2 are the ones that perform comparatively well
# Pairwise agreement between fold predictions: mean absolute difference and
# Spearman rank correlation.
mae12, src12 = mean_absolute_error(r1, r2), stats.spearmanr(r1, r2)[0]
print('mae12', mae12, 'src12',src12)

mae13, src13 = mean_absolute_error(r1, r3), stats.spearmanr(r1, r3)[0]
print('mae13', mae13, 'src13',src13)

mae23, src23 = mean_absolute_error(r2, r3), stats.spearmanr(r2, r3)[0]
print('mae23', mae23, 'src23',src23)

mae12 1.4619255 src12 0.5971175538911347
mae13 1.4140396 src13 0.5066433724223843
mae23 1.5974623 src23 0.3176376698819749


In [14]:
# Display the prediction row with index label 3.
submit_proba_df.loc[3]

0          6.604742
1         11.016546
2         13.976315
3          6.619453
4          6.619453
            ...    
180576     5.935596
180577     8.202895
180578     4.673194
180579     5.541387
180580     3.573328
Name: 3, Length: 180581, dtype: float32

In [25]:
# Column-wise mean over the rows of the prediction table, i.e. the average
# over all folds for each submission sample.
submit_ans = submit_proba_df.mean(axis=0)
submit_ans

0          9.536386
1         10.941848
2         11.910572
3          6.955659
4          6.955659
            ...    
180576     6.200135
180577     6.908788
180578     6.703999
180579     5.915728
180580     5.314737
Length: 180581, dtype: float32

In [26]:
# Persist the fold-averaged predictions as a single-column CSV without an index.
averaged_frame = submit_ans.to_frame()
averaged_frame.to_csv('./TabNet.csv', header=True, index=None)

In [None]:
import json  # imported here because no other cell of the notebook imports it

# 'Pid' is dropped from all_data during preprocessing and submit_label_df only
# holds 'label', so the post ids are re-read from the raw feature table.
# NOTE(review): assumes 'Pid' and 'train_type' are columns of feature_data_530.csv
# and that its row order matches the one used during preprocessing — confirm.
id_frame = pd.read_csv('../data/feature_data_530.csv', usecols=['Pid', 'train_type'])
submit_pid = id_frame.loc[id_frame['train_type'] == -1, 'Pid'].reset_index(drop=True)
assert len(submit_pid) == len(submit_ans), \
    "number of submission ids does not match number of predictions"

result = pd.DataFrame()
result['post_id'] = submit_pid.apply(lambda x: 'post' + str(x))
# Cast to float64 before rounding so the JSON contains clean 4-decimal values
# rather than float32 artefacts; assigned positionally (lengths checked above).
result['popularity_score'] = submit_ans.astype('float64').round(decimals=4).to_numpy()

out_json = dict()
out_json["version"] = "VERSION 1.0"
out_json["result"] = result.to_dict(orient='records')
# NOTE(review): "catboost" looks copied from another notebook — confirm what
# should be declared as external data for this model.
out_json["external_data"] = {"used": "true", "details": "catboost"}

# Written under a TabNet-specific name: the previous 'KFold_catboost.json'
# target would overwrite a file named after a different model.
with open('KFold_tabnet.json', "w") as f:
    json.dump(out_json, f)

In [15]:
# Scratch cell: reset the helper list and predict the submission set.
# `model` is whatever the training loop assigned last, so this cell depends on
# state left behind by that loop.
ss = []
test_preds = model.predict(X_test)

In [24]:
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error
from scipy import stats

# Re-score the current `model` on the current X_valid / y_valid (all three are
# whatever the training loop assigned last).
valid_pred = model.predict(X_valid).reshape(-1)
valid_mse, valid_mae = (
    mean_squared_error(y_valid, valid_pred),
    mean_absolute_error(y_valid, valid_pred),
)
valid_src = stats.spearmanr(y_valid, valid_pred)[0]

print("MSE: %.4f, MAE: %.4f, SRC: %.4f"%(valid_mse, valid_mae, valid_src))

MSE: 1.0599, MAE: 0.5958, SRC: 0.9169


In [21]:
# Display the submission-set predictions flattened to one dimension.
test_preds.reshape(-1)

array([ 8.622696 , 12.716267 , 11.696694 , ...,  6.3496876, 10.743238 ,
        2.0040984], dtype=float32)

In [22]:
# Scratch cell: a list holding the flattened submission predictions twice.
ss = [test_preds.reshape(-1) for _ in range(2)]
ss

[array([ 8.622696 , 12.716267 , 11.696694 , ...,  6.3496876, 10.743238 ,
         2.0040984], dtype=float32),
 array([ 8.622696 , 12.716267 , 11.696694 , ...,  6.3496876, 10.743238 ,
         2.0040984], dtype=float32)]

In [20]:
# Display the shape of the raw prediction array returned by model.predict.
test_preds.shape

(180581, 1)