In [1]:
import numpy as np
import itertools
import pandas as pd
import json

from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import calibration_curve, _SigmoidCalibration
from ampligraph.evaluation import evaluate_performance, mr_score, mrr_score, hits_at_n_score, generate_corruptions_for_eval
from sklearn.metrics import brier_score_loss, log_loss, accuracy_score
from scipy.special import expit

from ampligraph.datasets import load_wordnet11
from ampligraph.latent_features.models import TransE, ComplEx, DistMult

In [2]:
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [3]:
X_train = (pd.read_csv("TransC/data/YAGO39K/Train/triple2id.txt", sep=' ', skiprows=1, names=['s', 'o', 'p'])
           [['s', 'p', 'o']]
           .reindex()
           .values)

In [4]:
ent = np.unique(np.concatenate((X_train[:, 0], X_train[:, 2])))

In [5]:
X_valid_pos = (pd.read_csv("TransC/data/YAGO39K/Valid/triple2id_positive.txt", sep=' ', skiprows=1, names=['s', 'o', 'p'])
               [['s', 'p', 'o']]
               .reindex()
               .query("s in @ent and o in @ent")
               .values)

X_valid_neg = (pd.read_csv("TransC/data/YAGO39K/Valid/triple2id_negative.txt", sep=' ', skiprows=1, names=['s', 'o', 'p'])
               [['s', 'p', 'o']]
               .reindex()
               .query("s in @ent and o in @ent")
               .values)

In [6]:
X_test_pos = (pd.read_csv("TransC/data/YAGO39K/Valid/triple2id_positive.txt", sep=' ', skiprows=1, names=['s', 'o', 'p'])
              [['s', 'p', 'o']]
              .reindex()
              .query("s in @ent and o in @ent")
              .values)

X_test_neg = (pd.read_csv("TransC/data/YAGO39K/Valid/triple2id_negative.txt", sep=' ', skiprows=1, names=['s', 'o', 'p'])
              [['s', 'p', 'o']]
              .query("s in @ent and o in @ent")
              .reindex()
              .values)

In [7]:
X = {
    'train': X_train,
    'valid': np.concatenate((X_valid_pos, X_valid_neg)),
    'test': np.concatenate((X_test_pos, X_test_neg)),
    'valid_labels': np.concatenate((np.full(len(X_valid_pos), '1'), np.full(len(X_valid_neg), '-1'))),
    'test_labels': np.concatenate((np.full(len(X_test_pos), '1'), np.full(len(X_test_neg), '-1')))
}

In [8]:
import types
from functools import partial
import tensorflow as tf
from sklearn.utils import check_random_state
from tqdm import tqdm
from ampligraph.datasets import AmpligraphDatasetAdapter, NumpyDatasetAdapter
from ampligraph.evaluation import generate_corruptions_for_fit, to_idx, generate_corruptions_for_eval, \
    hits_at_n_score, mrr_score


def generate_corruptions(self, X_pos, batches_count, epochs):
    try:
        tf.reset_default_graph()
        self.rnd = check_random_state(self.seed)
        tf.random.set_random_seed(self.seed)

        self._load_model_from_trained_params()

        dataset_handle = NumpyDatasetAdapter()
        dataset_handle.use_mappings(self.rel_to_idx, self.ent_to_idx)

        dataset_handle.set_data(X_pos, "pos")

        batch_size_pos = int(np.ceil(dataset_handle.get_size("pos") / batches_count))

        gen_fn = partial(dataset_handle.get_next_train_batch, batch_size=batch_size_pos, dataset_type="pos")
        dataset = tf.data.Dataset.from_generator(gen_fn,
                                                 output_types=tf.int32,
                                                 output_shapes=(None, 3))
        dataset = dataset.repeat().prefetch(1)
        dataset_iter = tf.data.make_one_shot_iterator(dataset)

        x_pos_tf = dataset_iter.get_next()

        e_s, e_p, e_o = self._lookup_embeddings(x_pos_tf)
        scores_pos = self._fn(e_s, e_p, e_o)

        x_neg_tf = generate_corruptions_for_fit(x_pos_tf,
                                                entities_list=None,
                                                eta=1,
                                                corrupt_side='s+o',
                                                entities_size=0,
                                                rnd=self.seed)

        e_s_neg, e_p_neg, e_o_neg = self._lookup_embeddings(x_neg_tf)
        scores = self._fn(e_s_neg, e_p_neg, e_o_neg)

        epoch_iterator_with_progress = tqdm(range(1, epochs + 1), disable=(not self.verbose), unit='epoch')

        scores_list = []
        with tf.Session(config=self.tf_config) as sess:
            sess.run(tf.global_variables_initializer())
            for _ in epoch_iterator_with_progress:
                losses = []
                for batch in range(batches_count):
                    scores_list.append(sess.run(scores))

        dataset_handle.cleanup()
        return np.concatenate(scores_list)
    
    except Exception as e:
        dataset_handle.cleanup()
        raise e

In [9]:
def pos_iso(cal_model, pos_scores, neg_scores, positive_base_rate):
    weigths_pos = len(neg_scores) / len(pos_scores)
    weights_neg = (1.0 - positive_base_rate) / positive_base_rate
    weights = np.concatenate((np.full(pos_scores.shape, weigths_pos),
                              np.full(neg_scores.shape, weights_neg))).astype(float)
    target =  np.concatenate((np.ones(pos_scores.shape), np.zeros(neg_scores.shape))).astype(float)
    x = np.concatenate((pos_scores, neg_scores)).astype(float)
    
    cal_model.fit(x, target, sample_weight=weights)
    return cal_model

In [None]:
losses =  ['self_adversarial', 'pairwise', 'nll', 'multiclass_nll']
models = [TransE, DistMult, ComplEx]

results = []

for m, l in itertools.product(models, losses):
    model = m(batches_count=64, seed=0, epochs=1000, k=100, eta=20,
                   optimizer='adam', optimizer_params={'lr':0.0001},
                   loss=l, verbose=False)

    model.fit(X['train'])
    
    scores = model.predict(X['test'])

    model.calibrate(X_valid_pos, batches_count=10, epochs=1000, positive_base_rate=0.5)
    print("pos", model.calibration_parameters)
    probas1 = model.predict_proba(X['test'])

    model.calibrate(X_valid_pos, X_valid_neg)
    print("pos neg", model.calibration_parameters)
    probas2 = model.predict_proba(X['test'])
    
    val_scores = model.predict(X['valid'])
    ir = IsotonicRegression(out_of_bounds='clip')
    ir.fit(np.squeeze(val_scores).astype(float), (X['valid_labels'] == "1").astype(float))
    probas3 = ir.predict(np.squeeze(scores).astype(float))
    
    model.generate_corruptions = types.MethodType(generate_corruptions, model)
    corruptions = model.generate_corruptions(X_valid_pos, batches_count=10, epochs=1000)
    val_pos_scores = np.squeeze(model.predict(X_valid_pos))
    iso_pos = pos_iso(IsotonicRegression(out_of_bounds='clip'), val_pos_scores, corruptions, positive_base_rate=0.5)
    probas4 = iso_pos.predict(np.squeeze(scores).astype(float))

    sc_pos = pos_iso(_SigmoidCalibration(), val_pos_scores, corruptions, positive_base_rate=0.5)
    print("pos sc", sc_pos.a_, sc_pos.b_)
    probas5 = sc_pos.predict(np.squeeze(scores).astype(float))
    
    val_neg_scores = np.squeeze(model.predict(X_valid_neg))
    sc_pos_neg = pos_iso(_SigmoidCalibration(), val_pos_scores, val_neg_scores, positive_base_rate=0.5)
    print("pos neg sc", sc_pos_neg.a_, sc_pos_neg.b_)
    probas6 = sc_pos_neg.predict(np.squeeze(scores).astype(float))
    
    thresholds = {r: np.median(np.sort(val_scores[X['valid'][:, 1] == r])) for r in np.unique(X['valid'][:, 1])}
    thresholds_test = np.vectorize(thresholds.get)(X['test'][:, 1])
    per_relation_acc = accuracy_score(X['test_labels'] == "1", scores > thresholds_test)

    acc_uncalib = accuracy_score(X['test_labels'] == "1", expit(scores) > 0.5)

    acc1 = accuracy_score(X['test_labels'] == "1", probas1 > 0.5)
    acc2 = accuracy_score(X['test_labels'] == "1", probas2 > 0.5)
    acc3 = accuracy_score(X['test_labels'] == "1", probas3 > 0.5)
    acc4 = accuracy_score(X['test_labels'] == "1", probas4 > 0.5)
    acc5 = accuracy_score(X['test_labels'] == "1", probas5 > 0.5)
    acc6 = accuracy_score(X['test_labels'] == "1", probas6 > 0.5)
    
    filter_triples = np.concatenate((X['train'], X_valid_pos, X_test_pos))
    ranks = evaluate_performance(X_test_pos, 
                                 model=model, 
                                 filter_triples=filter_triples,
                                 use_default_protocol=True, 
                                 verbose=False)

    results.append({
        'model': m.__name__,
        'loss': l,
        'brier_score_scores': brier_score_loss(X['test_labels'] == "1", expit(scores)),
        'log_loss_scores': log_loss(X['test_labels'] == "1", expit(scores), eps=1e-7),
        'brier_score_probas_pos': brier_score_loss(X['test_labels'] == "1", probas1),
        'log_loss_probas_pos': log_loss(X['test_labels'] == "1", probas1, eps=1e-7),
        'brier_score_probas_pos_neg': brier_score_loss(X['test_labels'] == "1", probas2),
        'log_loss_probas_pos_neg': log_loss(X['test_labels'] == "1", probas2, eps=1e-7),
        'brier_score_probas_pos_neg_iso': brier_score_loss(X['test_labels'] == "1", probas3),
        'log_loss_probas_pos_neg_iso': log_loss(X['test_labels'] == "1", probas3, eps=1e-7),
        'brier_score_probas_pos_iso': brier_score_loss(X['test_labels'] == "1", probas4),
        'log_loss_probas_pos_iso': log_loss(X['test_labels'] == "1", probas4, eps=1e-7),
        'brier_score_probas_pos_sc': brier_score_loss(X['test_labels'] == "1", probas5),
        'log_loss_probas_pos_sc': log_loss(X['test_labels'] == "1", probas5, eps=1e-7),
        'brier_score_probas_pos_neg_sc': brier_score_loss(X['test_labels'] == "1", probas6),
        'log_loss_probas_pos_neg_sc': log_loss(X['test_labels'] == "1", probas6, eps=1e-7),
        'metrics_mrr': mrr_score(ranks), 
        'metrics_hits@10': hits_at_n_score(ranks, n=10),
        'metrics_mr': mr_score(ranks),
        'accuracy_per_relation': per_relation_acc,
        'accuracy_uncalib': acc_uncalib,
        'accuracy_pos': acc1,
        'accuracy_pos_neg': acc2,
        'accuracy_pos_neg_iso': acc3,
        'accuracy_pos_iso': acc4,
        'accuracy_pos_sc': acc5,
        'accuracy_pos_neg_sc': acc6
    })
        
    print(json.dumps(results[-1], indent=2))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.random.categorical instead.
Instructions for updating:
Use tf.cast instead.
pos [-1.9906234, -7.067369]
pos neg [-1.8516353, -5.5848007]
pos sc -2.3224234379544813 -8.26226106292651
pos neg sc -1.8523836160560438 -5.596022656508945
{
  "model": "TransE",
  "loss": "self_adversarial",
  "brier_score_scores": 0.3630828071709412,
  "log_loss_scores": 1.0624504733201046,
  "brier_score_probas_pos": 0.105592804643

In [19]:
import pandas as pd

In [20]:
def highlight_min(s):
    is_min = s == s.min()
    return ['font-weight: bold' if v else '' for v in is_min]

In [21]:
df = pd.DataFrame(results).set_index(['model', 'loss'])

In [22]:
bs = df[(c for c in df.columns if c.startswith('brier'))]
bs.columns = [c[len("brier_score_"):] for c in bs.columns]
bs.style.apply(highlight_min, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,probas_pos,probas_pos_iso,probas_pos_neg,probas_pos_neg_iso,probas_pos_neg_sc,probas_pos_sc,scores
model,loss,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TransE,self_adversarial,0.105593,0.10962,0.0949254,0.0927076,0.0948816,0.106362,0.363083
TransE,pairwise,0.152428,0.112938,0.123208,0.103209,0.123322,0.139535,0.497721
TransE,nll,0.235741,0.200139,0.18753,0.170232,0.18759,0.234999,0.217834
TransE,multiclass_nll,0.126562,0.116082,0.111302,0.103911,0.111273,0.122762,0.497795
DistMult,self_adversarial,0.0928978,0.0886699,0.0813233,0.0785058,0.0813693,0.0929009,0.284488
DistMult,pairwise,0.149369,0.133468,0.129219,0.115388,0.129325,0.148927,0.371667
DistMult,nll,0.0768235,0.0761126,0.0717517,0.0690451,0.071781,0.0768545,0.104989
DistMult,multiclass_nll,0.150234,0.134586,0.126563,0.112589,0.126658,0.14929,0.456301
ComplEx,self_adversarial,0.0972693,0.0948809,0.0893608,0.0843688,0.0894297,0.097268,0.264185
ComplEx,pairwise,0.130981,0.12295,0.114808,0.106525,0.114892,0.131137,0.321774


In [23]:
ll = df[(c for c in df.columns if c.startswith('log_loss'))]
ll.columns = [c[len("log_loss_"):] for c in ll.columns]
ll.style.apply(highlight_min, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,probas_pos,probas_pos_iso,probas_pos_neg,probas_pos_neg_iso,probas_pos_neg_sc,probas_pos_sc,scores
model,loss,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TransE,self_adversarial,0.370369,0.376181,0.318729,0.308514,0.318733,0.38832,1.06245
TransE,pairwise,0.486504,0.392935,0.44455,0.351752,0.444555,0.478483,4.92195
TransE,nll,0.680738,0.621722,0.577333,0.518347,0.577329,0.679198,0.626282
TransE,multiclass_nll,0.427793,0.440544,0.392977,0.350676,0.392981,0.474568,7.85788
DistMult,self_adversarial,0.310696,0.307953,0.279444,0.265872,0.279448,0.310797,1.04257
DistMult,pairwise,0.451408,0.398292,0.415302,0.356361,0.415307,0.450355,1.46674
DistMult,nll,0.263119,0.261522,0.247208,0.231766,0.247211,0.263137,0.71899
DistMult,multiclass_nll,0.453266,0.405863,0.407079,0.351802,0.407085,0.450874,4.22456
ComplEx,self_adversarial,0.322934,0.312896,0.305282,0.277734,0.305286,0.322939,1.19863
ComplEx,pairwise,0.404138,0.368594,0.373314,0.329283,0.373318,0.404576,1.06695


In [24]:
print((bs.reset_index()
 .query("loss == 'self_adversarial' ")
 [['model', 'scores', 'probas_pos_neg', 'probas_pos_neg_iso', 'probas_pos', 'probas_pos_iso']]
 .reset_index(drop=True)
 .round(3)
 .to_latex()))

\begin{tabular}{llrrrrr}
\toprule
{} &     model &  scores &  probas\_pos\_neg &  probas\_pos\_neg\_iso &  probas\_pos &  probas\_pos\_iso \\
\midrule
0 &    TransE &   0.363 &           0.095 &               0.093 &       0.106 &           0.110 \\
1 &  DistMult &   0.284 &           0.081 &               0.079 &       0.093 &           0.089 \\
2 &   ComplEx &   0.264 &           0.089 &               0.084 &       0.097 &           0.095 \\
\bottomrule
\end{tabular}



In [25]:
print(ll.reset_index()
 .query("loss == 'self_adversarial' ")
 [['model', 'scores', 'probas_pos_neg', 'probas_pos_neg_iso', 'probas_pos', 'probas_pos_iso']]
 .reset_index(drop=True)
  .round(3)
 .to_latex())

\begin{tabular}{llrrrrr}
\toprule
{} &     model &  scores &  probas\_pos\_neg &  probas\_pos\_neg\_iso &  probas\_pos &  probas\_pos\_iso \\
\midrule
0 &    TransE &   1.062 &           0.319 &               0.309 &       0.370 &           0.376 \\
1 &  DistMult &   1.043 &           0.279 &               0.266 &       0.311 &           0.308 \\
2 &   ComplEx &   1.199 &           0.305 &               0.278 &       0.323 &           0.313 \\
\bottomrule
\end{tabular}



In [33]:
print((acc*100).reset_index()
 .query("loss == 'self_adversarial' ")
 [['model', 'pos_neg', 'pos_neg_iso', 'pos', 'pos_iso',  'uncalib', 'per_relation']]
 .reset_index(drop=True)
  .round(1)
 .to_latex())

\begin{tabular}{llrrrrrr}
\toprule
{} &     model &  pos\_neg &  pos\_neg\_iso &   pos &  pos\_iso &  uncalib &  per\_relation \\
\midrule
0 &    TransE &     87.1 &         87.8 &  86.6 &     84.9 &     50.2 &          88.8 \\
1 &  DistMult &     88.9 &         89.3 &  87.7 &     88.5 &     56.7 &          90.2 \\
2 &   ComplEx &     87.3 &         88.2 &  86.6 &     87.2 &     61.1 &          89.4 \\
\bottomrule
\end{tabular}



In [27]:
metrics = df[(c for c in df.columns if c.startswith('metrics'))]
metrics.columns = [c[len("metrics_"):] for c in metrics.columns]
metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,hits@10,mr,mrr
model,loss,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TransE,self_adversarial,0.319041,244.604493,0.168512
TransE,pairwise,0.648666,500.20526,0.371433
TransE,nll,0.112647,1014.709742,0.063875
TransE,multiclass_nll,0.588346,216.314559,0.323279
DistMult,self_adversarial,0.619883,635.099201,0.30557
DistMult,pairwise,0.631116,648.168485,0.337314
DistMult,nll,0.647262,744.19824,0.330295
DistMult,multiclass_nll,0.585268,397.900475,0.308723
ComplEx,self_adversarial,0.752673,1074.095853,0.530927
ComplEx,pairwise,0.729776,586.873528,0.476573


In [28]:
def highlight_max(s):
    is_min = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_min]

acc = df[(c for c in df.columns if c.startswith('accuracy'))]
acc.columns = [c[len("accuracy_"):] for c in acc.columns]
acc.style.apply(highlight_max, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,per_relation,pos,pos_iso,pos_neg,pos_neg_iso,pos_neg_sc,pos_sc,uncalib
model,loss,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TransE,self_adversarial,0.887581,0.865591,0.84914,0.871344,0.877796,0.871505,0.864892,0.502204
TransE,pairwise,0.898226,0.822796,0.865968,0.862796,0.871398,0.862688,0.823226,0.502204
TransE,nll,0.779624,0.665538,0.749301,0.714409,0.759624,0.713333,0.668602,0.502204
TransE,multiclass_nll,0.909731,0.843441,0.857742,0.866505,0.86957,0.866505,0.842688,0.502204
DistMult,self_adversarial,0.901882,0.877204,0.885376,0.889194,0.893118,0.889032,0.877151,0.567151
DistMult,pairwise,0.901774,0.782258,0.766022,0.818226,0.842097,0.817742,0.782634,0.540806
DistMult,nll,0.915269,0.897151,0.901882,0.903817,0.904462,0.904032,0.897043,0.883763
DistMult,multiclass_nll,0.909247,0.78172,0.786882,0.817312,0.843925,0.817419,0.78328,0.522688
ComplEx,self_adversarial,0.893817,0.866452,0.871559,0.872903,0.881828,0.872849,0.866452,0.610806
ComplEx,pairwise,0.902527,0.816774,0.803387,0.843495,0.852796,0.843441,0.81672,0.560645


In [30]:
df.corr(method='spearman')

Unnamed: 0,accuracy_per_relation,accuracy_pos,accuracy_pos_iso,accuracy_pos_neg,accuracy_pos_neg_iso,accuracy_pos_neg_sc,accuracy_pos_sc,accuracy_uncalib,brier_score_probas_pos,brier_score_probas_pos_iso,...,log_loss_probas_pos,log_loss_probas_pos_iso,log_loss_probas_pos_neg,log_loss_probas_pos_neg_iso,log_loss_probas_pos_neg_sc,log_loss_probas_pos_sc,log_loss_scores,metrics_hits@10,metrics_mr,metrics_mrr
accuracy_per_relation,1.0,0.398601,0.447552,0.405594,0.384615,0.405594,0.41958,0.555248,-0.496503,-0.342657,...,-0.48951,-0.454545,-0.51049,-0.475524,-0.51049,-0.454545,0.06993,0.370629,-0.160839,0.342657
accuracy_pos,0.398601,1.0,0.965035,0.993007,0.986014,0.993007,0.993007,0.619315,-0.93007,-0.986014,...,-0.895105,-0.818182,-0.888112,-0.937063,-0.888112,-0.832168,-0.293706,0.391608,0.174825,0.300699
accuracy_pos_iso,0.447552,0.965035,1.0,0.958042,0.979021,0.958042,0.972028,0.612197,-0.839161,-0.965035,...,-0.804196,-0.797203,-0.811189,-0.888112,-0.811189,-0.741259,-0.195804,0.454545,0.188811,0.363636
accuracy_pos_neg,0.405594,0.993007,0.958042,1.0,0.979021,1.0,0.986014,0.626434,-0.923077,-0.979021,...,-0.888112,-0.811189,-0.881119,-0.93007,-0.881119,-0.825175,-0.300699,0.370629,0.167832,0.265734
accuracy_pos_neg_iso,0.384615,0.986014,0.979021,0.979021,1.0,0.979021,0.993007,0.612197,-0.888112,-0.986014,...,-0.867133,-0.832168,-0.874126,-0.937063,-0.874126,-0.818182,-0.286713,0.398601,0.174825,0.300699
accuracy_pos_neg_sc,0.405594,0.993007,0.958042,1.0,0.979021,1.0,0.986014,0.626434,-0.923077,-0.979021,...,-0.888112,-0.811189,-0.881119,-0.93007,-0.881119,-0.825175,-0.300699,0.370629,0.167832,0.265734
accuracy_pos_sc,0.41958,0.993007,0.972028,0.986014,0.993007,0.986014,1.0,0.612197,-0.923077,-0.979021,...,-0.888112,-0.811189,-0.895105,-0.944056,-0.895105,-0.825175,-0.27972,0.370629,0.146853,0.27972
accuracy_uncalib,0.555248,0.619315,0.612197,0.626434,0.612197,0.626434,0.612197,1.0,-0.733212,-0.626434,...,-0.804398,-0.882702,-0.797279,-0.754568,-0.797279,-0.847109,-0.44491,0.637112,0.558807,0.555248
brier_score_probas_pos,-0.496503,-0.93007,-0.839161,-0.923077,-0.888112,-0.923077,-0.923077,-0.733212,1.0,0.888112,...,0.979021,0.832168,0.972028,0.951049,0.972028,0.93007,0.370629,-0.370629,-0.181818,-0.286713
brier_score_probas_pos_iso,-0.342657,-0.986014,-0.965035,-0.979021,-0.986014,-0.979021,-0.979021,-0.626434,0.888112,1.0,...,0.867133,0.846154,0.86014,0.923077,0.86014,0.818182,0.321678,-0.405594,-0.223776,-0.328671


In [34]:
len(np.unique(X['valid'][:, 1]))

33