In [9]:
import sys
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [10]:
import gc
import os
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier,CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier


In [11]:
nfolds= 25
kfold = StratifiedKFold(n_splits=nfolds, random_state=42, shuffle=True)
train_df = pd.read_csv("/home/wangjingqi/input/dataset/g2net/train_labels.csv")
use_generated_data = True
if use_generated_data:
    generated_data = pd.read_csv("/home/wangjingqi/input/dataset/g2net/generted_train/generted_train_labels.csv")
    train_df = pd.concat([train_df,generated_data],axis=0).reset_index(drop=True)
test_df = pd.read_csv("/home/wangjingqi/input/dataset/g2net/sample_submission.csv")
train_df = train_df[train_df.target != -1].reset_index(drop=True)
for nfold, (train_idx, val_idx) in enumerate(kfold.split(train_df.id.values, train_df.target.values)):
    train_df.loc[val_idx, 'fold'] = int(nfold)


In [12]:
print(" train_df.shape = ", train_df.shape)

 train_df.shape =  (6600, 3)


**Model**

In [13]:
from torch import nn
import timm
from torch.utils.data import DataLoader, Dataset
class Model(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        encoder = timm.create_model(model_name, pretrained=True, in_chans=2,drop_rate=0., drop_path_rate=0.)
        clsf = encoder.default_cfg['classifier']
        encoder._modules[clsf] = nn.Identity()
        self.encoder = encoder
    def forward(self, x):
        x = self.encoder(x)
        return x
class G2NetDataset(Dataset):
    def __init__(self, dir,df):
        self.dir = dir
        self.ids =  df.id.values
        self.labels = df.target.values
    def __len__(self):
        return len(self.ids)
    @classmethod
    def process(cls, data: np.ndarray) -> np.ndarray:
        data = data* 1e22
        data = data.imag**2 + data.real**2
        data = data/data.mean()
        x = np.zeros((360, 4096))
        x[:, :data.shape[-1]] = data[:,:4096]
        return x
    
    def __getitem__(self, idx):
        #load data
        id = self.ids[idx]
        if os.path.exists(os.path.join(self.dir[0],id+".npy")):
            x = np.load(os.path.join(self.dir[0],id+".npy"),allow_pickle=True).item()
        else:
            x = np.load(os.path.join(self.dir[1],id+".npy"),allow_pickle=True).item()
        #process data
        h1 = self.process(x["H1"])
        l1 = self.process(x["L1"])
        x = np.concatenate((np.expand_dims(h1,axis=0),np.expand_dims(l1,axis=0)),axis=0)
        #data augmentation
        x = np.transpose(x,(1,2,0))
        x = np.mean(x.reshape(360,128,32,2), axis=2)
        x = np.transpose(x,(2,0,1))
        x = torch.from_numpy(x)
        x = x.float()
        return x


In [14]:

import os
import torch
from torch.utils.data import DataLoader
def get_embeddings(model_name='',dir=None,df=None,verbose=True):

    model = Model(model_name)
    model.load_state_dict(torch.load(f'/home/wangjingqi/input/ck/g2net/tf_efficientnet_b7_ns_2000/tf_efficientnet_b7_ns_2000_0.pth',map_location='cpu'),strict=False)
    dataset = G2NetDataset(dir,df)
    dataloader = DataLoader(dataset,batch_size =32,num_workers=8, shuffle = False, pin_memory=True,drop_last=False)
    model.cuda()
    model.eval()
    model.float()

    all_text_feats = []
    for batch in tqdm(dataloader,total=len(dataloader)):
        batch = batch.cuda()
        with torch.no_grad():
            model_output = model(batch).detach().cpu()
        # Normalize the embeddings
        sentence_embeddings = F.normalize(model_output, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings
        all_text_feats.append(sentence_embeddings)
    all_text_feats = torch.cat(all_text_feats,dim=0)
    if verbose:
        print(' embeddings shape',all_text_feats.shape)
        
    return all_text_feats.numpy()

In [15]:
from tqdm import tqdm

In [16]:

models= ["tf_efficientnet_b7_ns",]
all_train_feats = []
for m in models:
    text_feats = get_embeddings(model_name=m,dir=["/home/wangjingqi/input/dataset/g2net/train","/home/wangjingqi/input/dataset/g2net/generted_train"],df=train_df,)
    all_train_feats.append(text_feats)
all_train_feats = np.concatenate(all_train_feats,axis=1)
print(all_train_feats.shape)

all_test_feats = []
for m in models:
    text_feats = get_embeddings(model_name=m,dir=["/home/wangjingqi/input/dataset/g2net/test"],df=test_df,)
    all_test_feats.append(text_feats)
all_test_feats = np.concatenate(all_test_feats,axis=1)
print(all_test_feats.shape)

100%|██████████| 207/207 [01:29<00:00,  2.32it/s]


 embeddings shape torch.Size([6600, 2560])
(6600, 2560)


100%|██████████| 250/250 [00:49<00:00,  5.06it/s]


 embeddings shape torch.Size([7975, 2560])
(7975, 2560)


In [17]:
import os

In [18]:
from sklearn.metrics import mean_squared_error
import torch
from sklearn.metrics import roc_auc_score, accuracy_score

preds = []
scores = []

for fold in range(nfolds):
    print('#'*25)
    print('### Fold',fold+1)
    print('#'*25)
    train_ = train_df[train_df.fold!=fold]
    val_ = train_df[train_df.fold==fold]
    tr_text_feats = all_train_feats[list(train_.index),:]
    val_text_feats = all_train_feats[list(val_.index),:]

    clf = XGBClassifier()
    clf.fit(tr_text_feats, train_["target"].values)
    val_preds = clf.predict(val_text_feats)
    test_preds = clf.predict(all_test_feats)
    labels = np.array(val_["target"].values.tolist())
    score = roc_auc_score(labels,val_preds)
    print('Fold',fold+1,'roc =',score)
    scores.append(score)

    preds.append(test_preds)
print('Overall CV roc =',np.mean(scores))

#########################
### Fold 1
#########################
Fold 1 roc = 0.9446231617647058
#########################
### Fold 2
#########################
Fold 2 roc = 0.9558823529411764
#########################
### Fold 3
#########################
Fold 3 roc = 0.9558823529411764
#########################
### Fold 4
#########################
Fold 4 roc = 0.9705882352941176
#########################
### Fold 5
#########################
Fold 5 roc = 0.9889705882352942
#########################
### Fold 6
#########################
Fold 6 roc = 0.9811580882352942
#########################
### Fold 7
#########################
Fold 7 roc = 0.9632352941176471
#########################
### Fold 8
#########################
Fold 8 roc = 0.9705882352941176
#########################
### Fold 9
#########################
Fold 9 roc = 0.9632352941176471
#########################
### Fold 10
#########################
Fold 10 roc = 0.9740349264705883
#########################
### Fold 11
##########

In [19]:
sub = test_df.copy()
sub.loc[:,"target"] = np.average(np.array(preds),axis=0)

In [20]:
sub.to_csv("ml_submission.csv",index=None)
sub.head()

Unnamed: 0,id,target
0,00054c878,0.0
1,0007285a3,1.0
2,00076c5a6,0.0
3,001349290,0.0
4,001a52e92,0.0
