In [17]:
from IPython.display import clear_output

!pip install lightning timm opendatasets albumentations catboost gdown
clear_output()

In [18]:
!nvidia-smi

Thu Aug 17 17:11:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   32C    P0    52W / 400W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from math import sin,cos,pi
from sklearn.metrics import accuracy_score,f1_score,balanced_accuracy_score
import albumentations as A
from arcface import ArcFaceLoss,ElasticArcFace,CosFace,GeM
from albumentations.pytorch.transforms import ToTensorV2
from catboost import CatBoostClassifier,Pool,cv
from activations import NewGELUActivation
from copy import deepcopy
import timm
pl.seed_everything(56)

Global seed set to 56


56

In [3]:
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [4]:
timm.list_models(pretrained=True)

['bat_resnext26ts.ch_in1k',
 'beit_base_patch16_224.in22k_ft_in22k',
 'beit_base_patch16_224.in22k_ft_in22k_in1k',
 'beit_base_patch16_384.in22k_ft_in22k_in1k',
 'beit_large_patch16_224.in22k_ft_in22k',
 'beit_large_patch16_224.in22k_ft_in22k_in1k',
 'beit_large_patch16_384.in22k_ft_in22k_in1k',
 'beit_large_patch16_512.in22k_ft_in22k_in1k',
 'beitv2_base_patch16_224.in1k_ft_in1k',
 'beitv2_base_patch16_224.in1k_ft_in22k',
 'beitv2_base_patch16_224.in1k_ft_in22k_in1k',
 'beitv2_large_patch16_224.in1k_ft_in1k',
 'beitv2_large_patch16_224.in1k_ft_in22k',
 'beitv2_large_patch16_224.in1k_ft_in22k_in1k',
 'botnet26t_256.c1_in1k',
 'caformer_b36.sail_in1k',
 'caformer_b36.sail_in1k_384',
 'caformer_b36.sail_in22k',
 'caformer_b36.sail_in22k_ft_in1k',
 'caformer_b36.sail_in22k_ft_in1k_384',
 'caformer_m36.sail_in1k',
 'caformer_m36.sail_in1k_384',
 'caformer_m36.sail_in22k',
 'caformer_m36.sail_in22k_ft_in1k',
 'caformer_m36.sail_in22k_ft_in1k_384',
 'caformer_s18.sail_in1k',
 'caformer_s18.s

In [5]:
class CFG:
    train_path='/notebooks/new-train/train'
    test_path = '/notebooks/simple_subv30.csv'
    model = 'tf_efficientnetv2_l.in21k'
    h,w = (512,1024)
    num_workers = 4
    val_split_size = 0.2
    num_labels = 2
    scheduler= False
    warnap = 0.04
    max_epoches=1
    lr = 1e-4
    min_lr= 5e-6
    p=3
    eps=1e-6
    betas=(0.9, 0.999)
    weight_decay = 1e-6
    margin = 0.35 # 0.3
    scale = 30.0
    batch_size= 4
    seed=56

In [6]:
def make_df(path):
    images,labels = [],[]
    for label in os.listdir(path):
        for image_p in os.listdir(path+'/'+label):
            labels += [label]
            images += [path+'/'+label+'/'+image_p]
    df = pd.DataFrame()
    df['image'] = images
    df['label'] = labels
    df['label'] = df['label'].apply(lambda x:1 if x == 'seal' else 0)
    return df

def make_test_df(path,root_path="/notebooks/hac-test/test/"):
    df = pd.read_csv(path)
    df['label'] = 0;
    df['image'] = df['filename'].apply(lambda x: root_path + x)
    return df.drop(['class','filename'],axis=1)

In [7]:
def make_df(path):
    images,labels = [],[]
    for label in os.listdir(path):
        for image_p in os.listdir(path+'/'+label):
            labels += [label]
            images += [path+'/'+label+'/'+image_p]
    df = pd.DataFrame()
    df['image'] = images
    df['label'] = labels
    df['label'] = df['label'].apply(lambda x:1 if x == 'seal' else 0)
    return df

def make_test_df(path,root_path="/notebooks/hac-test/test/"):
    df = pd.read_csv(path)
    df['label'] = 0;
    df['image'] = df['filename'].apply(lambda x: root_path + x)
    return df.drop(['class','filename'],axis=1)

def to_time_stemp(path):
    path = path.split('/')[-1].split('.')[0][:-7]
    time_stemp = pd.Timestamp(path[:4]+'-'+path[4:6]+'-'+path[6:8]+' '+path[9:11]+':' + \
                              path[11:13] + ':' + path[13:15])
    return time_stemp

def make_base_time_features(df):
    df['month'] = df['dt'].apply(lambda x:x.month)
    df['day'] = df['dt'].apply(lambda x:x.day)
    df['week'] = df['dt'].apply(lambda x:x.week)
    df['hour'] = df['dt'].apply(lambda x:x.hour)
    df['minute'] = df['dt'].apply(lambda x:x.minute)
    df['quarter'] = df['dt'].apply(lambda x:x.quarter)
    
    df['weekofyear'] = df['dt'].apply(lambda x:x.weekofyear)
    df['dayofweek'] = df['dt'].apply(lambda x:x.dayofweek)
    df['week'] = df['dt'].apply(lambda x:x.week)
    df['hour'] = df['dt'].apply(lambda x:x.hour)
    df['minute'] = df['dt'].apply(lambda x:x.minute)
    
    df['all_day_time'] = df['dt'].apply(lambda x:x.hour * 3600 + x.minute * 60 + x.second)
    df['all_week_time'] = df['dt'].apply(lambda x:x.dayofweek * 24 + x.hour)
    df['all_time'] = df['dt'].apply(lambda x:x.dayofyear * 24 * 3600 + x.hour * 3600 + x.minute * 60 + x.second)
    df['all_month_time'] = df['dt'].apply(lambda x:x.day * 24 + x.hour + x.minute / 60)
    df['hour_time'] = df['dt'].apply(lambda x:x.minute * 60 + x.second)
    return df

def get_polynoms_from_column(df,col):
    min_v = df[col].min()
    max_v = df[col].max()
    
    df[f'sin_{col}'] = df[col].apply(sin)
    df[f'cos_{col}'] = df[col].apply(cos)
    df[f'sin_{col}^2'] = df[col].apply(sin) * df[col].apply(sin)
    df[f'cos_{col}^2'] = df[col].apply(cos) * df[col].apply(cos)
    
    df[f'{col}_sin'] = df[col].apply(lambda x: sin((x - min_v) / max_v * 2 * pi))
    df[f'{col}_cos'] = df[col].apply(lambda x: cos((x - min_v) / max_v * 2 * pi))
    df[f'{col}_sin^2'] = df[f'{col}_sin'] * df[f'{col}_sin']
    df[f'{col}_cos^2'] = df[f'{col}_cos'] * df[f'{col}_cos']
    return df

def get_dop_features(df):
    day = 24
    year = 365.2425*day
    df['vday_sin'] = (df.hour * 2 * np.pi / day)
    df['vday_sin'] = df['day_sin'].apply(sin)
    df['vday_cos'] = (df.hour * 2 * np.pi / day)
    df['vday_cos'] = df['day_cos'].apply(cos)
    df['vyear_sin'] = (df.hour * 2 * np.pi / year)
    df['vyear_sin'] = df['year_sin'].apply(sin)
    df['vyear_cos'] = (df.hour * 2 * np.pi / year)
    df['vyear_cos'] = df['year_cos'].apply(cos)
    
    return df

In [8]:
class DataColator():
    def __init__(self):
        self.transforms = A.Compose([A.Crop(x_max=4000), # 4000
                                     A.Normalize(),
                                     A.Resize(1024,2048), # 512,1024
                                     ToTensorV2()])
    def read_image(self,path):
        image = cv2.imread(path)
        return self.transforms(image=image)['image']

In [9]:
class PLDataset(Dataset):
    def __init__(self, df):
        super().__init__()
        self.cfg = CFG()
        self.data = df[['image','label']]
        self.data = self.data.values
        self.dc = DataColator()
    def __getitem__(self, index):
        image = self.dc.read_image(self.data[index][0])
        label = self.data[index][1]
        return image,label
    def __len__(self):
        return len(self.data)

In [10]:
class PLDataModule(pl.LightningDataModule):
    def __init__(self,):
        super().__init__()
        self.cfg = CFG()
        self.train_dataset_path = self.cfg.train_path
        self.test_dataset_path = self.cfg.test_path
        self.val_split_size = self.cfg.val_split_size
        self.batch_size = self.cfg.batch_size
        self.num_workers = self.cfg.num_workers
        self.is_setup = False
        
    def prepare_data(self):
        self.train_df = make_df(self.train_dataset_path)
        self.test_df = make_test_df(self.test_dataset_path)
        
    def setup(self, stage: str):
        if self.is_setup:
            return None
        self.train_df, self.val_df = train_test_split(self.train_df, test_size=self.val_split_size,random_state=self.cfg.seed)
        self.train_dataset = PLDataset(self.train_df)
        self.val_dataset = PLDataset(self.val_df)
        self.predict_dataset = PLDataset(self.test_df)
        self.is_setup = True
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                         batch_size=self.batch_size,
                         num_workers=self.num_workers,
                         shuffle=True)
    
    def train_dataloader_for_pred(self):
            return DataLoader(self.train_dataset,
                         batch_size=self.batch_size,
                         num_workers=self.num_workers)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size * 2,
                          num_workers=self.num_workers)
    
    def predict_dataloader(self):
        return DataLoader(self.predict_dataset,
                          batch_size=self.batch_size * 2,
                          num_workers=self.num_workers)

In [11]:
params = {'iterations':800,
          'learning_rate':0.01,
          'loss_function':'CrossEntropy',
          'max_depth':4,
          'l2_leaf_reg':0.7,
          'eval_metric':'F1',
          'random_seed':56}

In [12]:
class PLModule(pl.LightningModule):
    def __init__(self,model):
        super().__init__()
        self.cfg = CFG()
        self.model = model
        self.head = ElasticArcFace(1056,2,m=CFG.margin,s=CFG.scale)
        self.global_pools = nn.ModuleList([GeM(p=CFG.p) for _ in range(5)])
        self.criterion = nn.CrossEntropyLoss()
        self.neck = nn.LayerNorm(1056)
        self.dm = PLDataModule();
        self.dm.prepare_data();
        self.dm.setup(0)
        self.losses = []
        
    def forward(self,x):
        features = self.model(x)
        features = torch.cat([global_pool(m) for m, global_pool in zip(features, self.global_pools)], dim=1)
        return self.neck(features)

    def training_step(self, batch, _):
        x,targets = batch
        logits = self(x)
        logits = self.head(logits, targets)
        loss = self.criterion(logits,targets)
        return loss

    def validation_step(self, batch, _):
        x,targets = batch
        self.losses += [self.criterion(self.head(self(x), targets),targets).tolist() ]
        
    def predict_step(self, batch, _):
        x,targets = batch
        return [self(x).cpu().detach().numpy()]
    
    def calc_metrics(self):
        return np.mean(self.losses)
    
    def get_preds(self,dl):
        preds = []
        for batch in dl:
            batch[0] = batch[0].to('cuda')
            batch[1] = batch[1].to('cuda')
            preds += [self.predict_step(batch,0)]
        return preds
        
    def evaluate_cb(self):
        train_features = self.get_preds(self.dm.train_dataloader_for_pred())
        val_features = self.get_preds(self.dm.val_dataloader())
        
        train_features = np.concatenate(train_features,axis=1)[0].T
        train_df = pd.DataFrame()
        train_df['label'] = self.dm.train_dataset.data[:,1]
        for i in range(len(train_features)):
            train_df[f'feature_{i}'] = train_features[i]
            
        val_features = np.concatenate(val_features,axis=1)[0].T
        val_df = pd.DataFrame()
        val_df['label'] = dm.val_df['label']
        for i in range(len(val_features)):
            val_df[f'feature_{i}'] = val_features[i]
        
        train_pool = Pool(train_df.drop('label',axis=1),label=train_df['label'])
        val_pool = Pool(val_df.drop('label',axis=1),label=val_df['label'])
        
        model = CatBoostClassifier(**params)
        model.fit(train_pool,eval_set=val_pool,verbose=False)
        print('f1: ',model.get_best_score()['validation']['F1'])
        
        
    def on_validation_epoch_end(self):
        print('loss:',self.calc_metrics())
        self.evaluate_cb()
        self.losses = []
            
    def configure_optimizers(self):
        grouped_parametets = [{'params':self.parameters(),
                               'lr': CFG.lr},]
                              #{'params':self.neck.parameters(),
                              # 'lr': CFG.lr},
                              #{'params':self.head.parameters(),
                              # 'lr':CFG.lr * 1}]
        optim = torch.optim.AdamW(grouped_parametets,
                                  betas=CFG.betas,
                                  weight_decay=CFG.weight_decay,
                                  eps=CFG.eps
                                 )
        return optim

In [13]:
dm = PLDataModule()
dm.prepare_data()
dm.setup(0)

In [14]:
model = timm.create_model(CFG.model,
                          pretrained=True,
                          #num_classes=384,
                          #in_chans=3,
                          num_classes=0,
                          features_only=True,
                         )

In [15]:
pl_model = PLModule(model)

In [16]:
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    dirpath='./outputs/',
    #filename='model_{epoch:02d}-{val_acc:.4f}',
    #monitor='val_acc',
    #mode='max',
    save_last=True
)

trainer = pl.Trainer(
    accelerator="gpu",
    precision=32,
    callbacks=[lr_monitor, checkpoint_cb],
    min_epochs=1,
    devices=[0],
    check_val_every_n_epoch=1,
    max_epochs=8
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
trainer.fit(pl_model,datamodule=dm) 

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                 | Params
------------------------------------------------------
0 | model        | EfficientNetFeatures | 116 M 
1 | head         | ElasticArcFace       | 2.1 K 
2 | global_pools | ModuleList           | 5     
3 | criterion    | CrossEntropyLoss     | 0     
4 | neck         | LayerNorm            | 2.1 K 
------------------------------------------------------
116 M     Trainable params
5         Non-trainable params
116 M     Total params
465.667   Total estimated model p

Sanity Checking: 0it [00:00, ?it/s]

loss: 10.287338256835938
f1:  0.74934036939314


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

loss: 1.9188300903886557
f1:  0.9437500000000001


Validation: 0it [00:00, ?it/s]

loss: 1.5843347652948327
f1:  0.9390243902439024


Validation: 0it [00:00, ?it/s]

loss: 1.4450010959591184
f1:  0.9411764705882353


Validation: 0it [00:00, ?it/s]

loss: 1.2241710184141994
f1:  0.9375000000000001


Validation: 0it [00:00, ?it/s]

loss: 1.3080751691811852
f1:  0.934984520123839


Validation: 0it [00:00, ?it/s]

loss: 1.184017769806087
f1:  0.9415384615384614


Validation: 0it [00:00, ?it/s]

loss: 1.356127891011004
f1:  0.9287925696594428


Validation: 0it [00:00, ?it/s]

loss: 1.5714568714145571


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [18]:
train_features = trainer.predict(pl_model,dm.train_dataloader_for_pred())
val_features = trainer.predict(pl_model,dm.val_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [19]:
dm = PLDataModule()
dm.prepare_data()
dm.setup(0)

In [20]:
train_features = np.concatenate(train_features,axis=1)[0].T
train_df = pd.DataFrame()
train_df['label'] = dm.train_df['label']

for i in range(len(train_features)):
    train_df[f'feature_{i}'] = train_features[i]

In [21]:
val_features = np.concatenate(val_features,axis=1)[0].T
val_df = pd.DataFrame()
val_df['label'] = dm.val_df['label']

for i in range(len(val_features)):
    val_df[f'feature_{i}'] = val_features[i]

In [39]:
train_df['dt'] = dm.train_df['image'].apply(to_time_stemp)
train_df = make_base_time_features(train_df)

cols_for_pol = ['month','day','hour']
for col in cols_for_pol:
    train_df = get_polynoms_from_column(train_df,col)

val_df['dt'] = dm.val_df['image'].apply(to_time_stemp)
val_df = make_base_time_features(val_df)

cols_for_pol = ['month','day','hour']
for col in cols_for_pol:
    val_df = get_polynoms_from_column(val_df,col)

#train_df = get_dop_features(train_df)

  train_df['dt'] = dm.train_df['image'].apply(to_time_stemp)
  df['month'] = df['dt'].apply(lambda x:x.month)
  df['day'] = df['dt'].apply(lambda x:x.day)
  df['week'] = df['dt'].apply(lambda x:x.week)
  df['hour'] = df['dt'].apply(lambda x:x.hour)
  df['minute'] = df['dt'].apply(lambda x:x.minute)
  df['quarter'] = df['dt'].apply(lambda x:x.quarter)
  df['weekofyear'] = df['dt'].apply(lambda x:x.weekofyear)
  df['dayofweek'] = df['dt'].apply(lambda x:x.dayofweek)
  df['all_day_time'] = df['dt'].apply(lambda x:x.hour * 3600 + x.minute * 60 + x.second)
  df['all_week_time'] = df['dt'].apply(lambda x:x.dayofweek * 24 + x.hour)
  df['all_time'] = df['dt'].apply(lambda x:x.dayofyear * 24 * 3600 + x.hour * 3600 + x.minute * 60 + x.second)
  df['all_month_time'] = df['dt'].apply(lambda x:x.day * 24 + x.hour + x.minute / 60)
  df['hour_time'] = df['dt'].apply(lambda x:x.minute * 60 + x.second)
  df[f'sin_{col}'] = df[col].apply(sin)
  df[f'cos_{col}'] = df[col].apply(cos)
  df[f'sin_{col}^2']

In [22]:
train_pool = Pool(train_df.drop('label',axis=1),label=train_df['label'])
val_pool = Pool(val_df.drop('label',axis=1),label=val_df['label'])

In [23]:
params = {'iterations':800,
          'learning_rate':0.01,
          'loss_function':'CrossEntropy',
          'max_depth':4,
          'l2_leaf_reg':0.7,
          'eval_metric':'F1',
          'random_seed':56}
model = CatBoostClassifier(**params)
model.fit(train_pool,eval_set=val_pool,verbose=100)

0:	learn: 0.9900839	test: 0.9146341	best: 0.9146341 (0)	total: 19.2ms	remaining: 15.3s
100:	learn: 1.0000000	test: 0.9283489	best: 0.9316770 (3)	total: 1.07s	remaining: 7.39s
200:	learn: 1.0000000	test: 0.9283489	best: 0.9316770 (3)	total: 2.08s	remaining: 6.18s
300:	learn: 1.0000000	test: 0.9283489	best: 0.9316770 (3)	total: 3.06s	remaining: 5.08s
400:	learn: 1.0000000	test: 0.9283489	best: 0.9316770 (3)	total: 4.06s	remaining: 4.04s
500:	learn: 1.0000000	test: 0.9283489	best: 0.9316770 (3)	total: 5.09s	remaining: 3.04s
600:	learn: 1.0000000	test: 0.9283489	best: 0.9316770 (3)	total: 6.12s	remaining: 2.02s
700:	learn: 1.0000000	test: 0.9283489	best: 0.9316770 (3)	total: 7.12s	remaining: 1s
799:	learn: 1.0000000	test: 0.9283489	best: 0.9316770 (3)	total: 8.1s	remaining: 0us

bestTest = 0.9316770186
bestIteration = 3

Shrink model to first 4 iterations.


<catboost.core.CatBoostClassifier at 0x7f4bbf4ad370>

In [24]:
pred_features = trainer.predict(pl_model,dm.predict_dataloader())
pred_features = np.concatenate(pred_features,axis=1)[0].T
pred_df = pd.DataFrame()
pred_df['label'] = dm.test_df['label']

for i in range(len(pred_features)): 
    pred_df[f'feature_{i}'] = pred_features[i]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [24]:
pred_df['dt'] = dm.test_df['image'].apply(to_time_stemp)
pred_df = make_base_time_features(pred_df)

cols_for_pol = ['month','day','hour']
for col in cols_for_pol:
    get_polynoms_from_column(pred_df,col)

#pred_df = get_dop_features(pred_df)

  pred_df['dt'] = dm.test_df['image'].apply(to_time_stemp)
  df['month'] = df['dt'].apply(lambda x:x.month)
  df['day'] = df['dt'].apply(lambda x:x.day)
  df['week'] = df['dt'].apply(lambda x:x.week)
  df['hour'] = df['dt'].apply(lambda x:x.hour)
  df['minute'] = df['dt'].apply(lambda x:x.minute)
  df['quarter'] = df['dt'].apply(lambda x:x.quarter)
  df['weekofyear'] = df['dt'].apply(lambda x:x.weekofyear)
  df['dayofweek'] = df['dt'].apply(lambda x:x.dayofweek)
  df['all_day_time'] = df['dt'].apply(lambda x:x.hour * 3600 + x.minute * 60 + x.second)
  df['all_week_time'] = df['dt'].apply(lambda x:x.dayofweek * 24 + x.hour)
  df['all_time'] = df['dt'].apply(lambda x:x.dayofyear * 24 * 3600 + x.hour * 3600 + x.minute * 60 + x.second)
  df['all_month_time'] = df['dt'].apply(lambda x:x.day * 24 + x.hour + x.minute / 60)
  df['hour_time'] = df['dt'].apply(lambda x:x.minute * 60 + x.second)
  df[f'sin_{col}'] = df[col].apply(sin)
  df[f'cos_{col}'] = df[col].apply(cos)
  df[f'sin_{col}^2'] =

In [26]:
sub = pd.read_csv('/notebooks/simple_subv30.csv')
sub['class'] = model.predict(pred_df.drop(['label'],axis=1)).reshape(-1)

In [27]:
sub

Unnamed: 0,filename,class
0,20200520_194614_01_JPG.rf.cd7d1a3b71f5c71babf2...,0
1,20200520_194657_01_JPG.rf.88149097ed462c6cd423...,0
2,20200520_214839_01_JPG.rf.b4589905cfeaccf5354f...,0
3,20200522_075847_01_JPG.rf.91cd06b18d36e0ecaf57...,1
4,20200522_085904_01_JPG.rf.c21c0c7f4f27a3dee4ab...,1
...,...,...
899,20201009_102214_01_JPG.rf.14bce35ff1d937a2919f...,1
900,20201010_102926_01_JPG.rf.6a8128c4f22ed44c73fb...,1
901,20201010_133018_01_JPG.rf.65d57113ca7b086998f2...,1
902,20201010_193212_01_JPG.rf.e91b7b76f18119302657...,1


In [28]:
sub.to_csv('preds_19.csv',index=False)

In [41]:
### RANDOM FORESR PROB
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(metric='cosine')                             
model.fit(train_df.drop('label',axis=1),train_df['label'])
y_p = model.predict(val_df.drop('label',axis=1)).reshape(-1)
f1_score(val_df['label'],y_p)

0.9299363057324841

In [39]:
val_df

Unnamed: 0,label,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,day_sin^2,day_cos^2,sin_hour,cos_hour,sin_hour^2,cos_hour^2,hour_sin,hour_cos,hour_sin^2,hour_cos^2
150,1,-1.942400,-0.090518,-1.088050,-1.393681,2.249034,0.608549,-1.906063,0.751481,1.259681,...,0.235518,0.764482,0.656987,0.753902,0.431631,0.568369,0.755750,0.654861,0.571157,0.428843
556,1,-2.768139,-0.159800,-1.457292,-1.904798,2.585121,1.328370,-2.300955,1.314742,1.892407,...,0.424286,0.575714,0.989358,-0.145500,0.978830,0.021170,0.909632,0.415415,0.827430,0.172570
273,1,-2.620346,0.019366,-1.359550,-1.699392,2.463940,1.547214,-2.271470,1.498411,1.784108,...,0.977070,0.022930,-0.958924,0.283662,0.919536,0.080464,0.281733,0.959493,0.079373,0.920627
502,1,-2.333223,-0.205717,-1.258998,-1.730341,2.401423,0.832059,-2.162032,1.018466,1.617459,...,0.424286,0.575714,-0.287903,-0.957659,0.082888,0.917112,-0.281733,-0.959493,0.079373,0.920627
6,1,-2.045914,-0.239880,-1.102641,-1.598928,2.202447,0.588213,-1.918903,0.869544,1.295894,...,0.977070,0.022930,0.656987,0.753902,0.431631,0.568369,0.755750,0.654861,0.571157,0.428843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269,0,-2.549015,0.994747,-1.300912,-1.393850,2.048077,1.872280,-1.562316,1.670339,1.398343,...,0.040521,0.959479,0.149877,0.988705,0.022463,0.977537,-0.909632,-0.415415,0.827430,0.172570
927,0,-2.610432,-0.134971,-1.376505,-1.727582,2.560034,1.090055,-2.307785,1.180537,1.608335,...,0.879379,0.120621,-0.958924,0.283662,0.919536,0.080464,0.281733,0.959493,0.079373,0.920627
90,1,-1.563682,-0.334294,-0.788366,-1.221293,1.811331,0.444266,-1.717241,0.601847,1.109403,...,0.326347,0.673653,0.412118,-0.911130,0.169842,0.830158,0.989821,0.142315,0.979746,0.020254
340,1,-1.934112,-0.158853,-0.990914,-1.377731,2.098378,0.518888,-1.812281,0.867932,1.267707,...,0.937173,0.062827,-0.958924,0.283662,0.919536,0.080464,0.281733,0.959493,0.079373,0.920627
