In [3]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from numpy.random import seed
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.metrics import roc_auc_score, accuracy_score
import sqlite3

import xgboost as xgb

In [3]:
X = np.load('/kaggle/input/t5embeds/train_embeds.npy')
Y = np.load('/kaggle/input/xgbdata/Y_1499.npy')

In [6]:
X.shape

(142246, 1024)

In [8]:
X_test = np.load('/kaggle/input/t5embeds/test_embeds.npy')


In [3]:
X_test.shape

(141865, 1024)

In [4]:
test_ids = np.load('/kaggle/input/t5embeds/test_ids.npy')

In [5]:
IX = np.arange(len(X_test))

In [7]:
IX_train, IX_test, _,_ = train_test_split( IX, IX, train_size=0.2, random_state=42)

## AUTOENCODER:

In [8]:
encoding_dim = 200
input_dim = Input(shape = (X.shape[1], ))

# Encoder Layers
encoded1 = Dense(3000, activation = 'relu')(input_dim)
encoded2 = Dense(2750, activation = 'relu')(encoded1)
encoded3 = Dense(2500, activation = 'relu')(encoded2)
encoded4 = Dense(2250, activation = 'relu')(encoded3)
encoded5 = Dense(2000, activation = 'relu')(encoded4)
encoded6 = Dense(1750, activation = 'relu')(encoded5)
encoded7 = Dense(1500, activation = 'relu')(encoded6)
encoded8 = Dense(1250, activation = 'relu')(encoded7)
encoded9 = Dense(1000, activation = 'relu')(encoded8)
encoded10 = Dense(750, activation = 'relu')(encoded9)
encoded11 = Dense(500, activation = 'relu')(encoded10)
encoded12 = Dense(250, activation = 'relu')(encoded11)
encoded13 = Dense(encoding_dim, activation = 'relu')(encoded12)

decoded1 = Dense(250, activation = 'relu')(encoded13)
decoded2 = Dense(500, activation = 'relu')(decoded1)
decoded3 = Dense(750, activation = 'relu')(decoded2)
decoded4 = Dense(1000, activation = 'relu')(decoded3)
decoded5 = Dense(1250, activation = 'relu')(decoded4)
decoded6 = Dense(1500, activation = 'relu')(decoded5)
decoded7 = Dense(1750, activation = 'relu')(decoded6)
decoded8 = Dense(2000, activation = 'relu')(decoded7)
decoded9 = Dense(2250, activation = 'relu')(decoded8)
decoded10 = Dense(2500, activation = 'relu')(decoded9)
decoded11 = Dense(2750, activation = 'relu')(decoded10)
decoded12 = Dense(3000, activation = 'relu')(decoded11)
decoded13 = Dense(X.shape[1], activation = 'sigmoid')(decoded12)
# Combine Encoder and Deocder layers
autoencoder = Model(inputs = input_dim, outputs = decoded13)

# Compile the Model
autoencoder.compile(optimizer = 'adadelta', loss = 'binary_crossentropy')

In [9]:
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1024)]            0         
                                                                 
 dense (Dense)               (None, 3000)              3075000   
                                                                 
 dense_1 (Dense)             (None, 2750)              8252750   
                                                                 
 dense_2 (Dense)             (None, 2500)              6877500   
                                                                 
 dense_3 (Dense)             (None, 2250)              5627250   
                                                                 
 dense_4 (Dense)             (None, 2000)              4502000   
                                                                 
 dense_5 (Dense)             (None, 1750)              350175

In [10]:
autoencoder.fit(X_test[IX_train,:], X_test[IX_train,:], epochs = 10, batch_size = 32, shuffle = False, validation_data = (X_test[IX_test,:], X_test[IX_test,:]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ab23635f130>

In [11]:
encoder = Model(inputs = input_dim, outputs = encoded13)
encoded_input = Input(shape = (encoding_dim, ))

In [20]:
encoded_train = pd.DataFrame(encoder.predict(X[IX_train,:]))
encoded_train = encoded_train.add_prefix('feature_')

encoded_test = pd.DataFrame(encoder.predict(X[IX_test,:]))
encoded_test = encoded_test.add_prefix('feature_')



In [12]:
encoded_test_set = pd.DataFrame(encoder.predict(X_test))



In [25]:
encoded_train.to_csv('train_encoded.csv', index=False)
encoded_test.to_csv('test_encoded.csv', index=False)

In [13]:
encoded_test_set.to_csv('test_set_encoded.csv', index=False)

### TRAINING ON REDUCED EMBEDDINGS

In [9]:
encoded_test_set = pd.read_csv('/kaggle/working/test_set_encoded.csv')

In [7]:
encoded_train = pd.read_csv('/kaggle/input/cafa-encoder-200/train_encoded.csv')
encoded_test = pd.read_csv('/kaggle/input/cafa-encoder-200/test_encoded.csv')

In [8]:
encoded_train_array = np.asarray(encoded_train)

In [9]:
encoded_test_array = np.asarray(encoded_test)

In [10]:
encoded_test_set_array = np.asarray(encoded_test_set)

In [11]:
encoded_test_set_array.shape

(141865, 200)

In [10]:
encoded_train_array.shape

(14224, 200)

In [11]:
IX_train.shape

(14224,)

In [12]:
clf_xgb2 = xgb.XGBClassifier(objective="binary:logistic",
                            random_state=42,
                            tree_method = "gpu_hist",
                            verbosity=2)

In [13]:
clf_xgb2.fit(encoded_train_array,Y[IX_train,:])

In [14]:
clf_xgb2.save_model("model_encoder_200.json")

In [11]:
y_pred_test = clf_xgb2.predict(encoded_test)

In [14]:
l = []
for i in range(Y.shape[1]):
    if len(np.unique(Y[IX_test,i]) ) > 1:
        s = roc_auc_score(Y[IX_test,i], y_pred_test[:,i]);
    else:
        s = 0.5
    l.append(s)        
    if i %10 == 0:
        print(i, s)

0 0.5567492120421653
10 0.5850178113702773
20 0.5603748721121858
30 0.5222977582257764
40 0.5234311339541686
50 0.515711562938075
60 0.5389039667431353
70 0.5252167428117986
80 0.5253520487055973
90 0.5134093311746266
100 0.5345458289775508
110 0.5145544951342869
120 0.5069059011577757
130 0.5101472207245163
140 0.5387403583481057
150 0.5094637993339062
160 0.5118716418942365
170 0.5198563506799315
180 0.5124880934883946
190 0.5092148530455994
200 0.5163452445470603
210 0.5098348667550139
220 0.5191573422738698
230 0.514868640264882
240 0.5121243171503156
250 0.509790188896442
260 0.5269388660522586
270 0.5113632725888518
280 0.5115406979193142
290 0.5338702831200955
300 0.5356247440836592
310 0.5108221554399702
320 0.5057870590287372
330 0.5087778050692369
340 0.506357061951778
350 0.5138248564775965
360 0.5092315614531709
370 0.5029379523235463
380 0.5058380628597668
390 0.5173421258264054
400 0.502900841133913
410 0.5183679900076431
420 0.5413128325153767
430 0.5193509559315709
440 

In [15]:
acc = accuracy_score(Y[IX_test,:],y_pred_test)

In [16]:
acc

0.005350642858258737

# PREDICTION FOR COMPLETE TEST SET

In [13]:
clf_xgb2.load_model('/kaggle/working/model_encoder_200.json')

In [14]:
y_pred_raw = clf_xgb2.predict(encoded_test_set,output_margin = True)

In [15]:
np.save("xgb-encoded-prediction-full-raw-20-06",y_pred_raw)

In [16]:
test_data_batch_size = 35000

In [18]:
y_pred_probabilities = 1 / (1 + np.exp(-y_pred_raw[test_data_batch_size,:]))
y_pred_probabilities

array([0.00117109, 0.24234429, 0.21575215, ..., 0.02121002, 0.2706007 ,
       0.02720496], dtype=float32)

In [19]:
np.save("xgb-encoded-prediction-test-set-35000-probabilities-20-06", y_pred_probabilities)