In [None]:
from sklearn.model_selection import StratifiedGroupKFold
import pandas as pd
import numpy as np
import os

from google.colab import drive
# Mount at an absolute path so the Drive paths used later (/content/gdrive/...)
# resolve regardless of the current working directory.
drive.mount('/content/gdrive')

Mounted at gdrive


In [None]:
!unzip /content/gdrive/MyDrive/had.zip

In [None]:
# Load the per-image index CSV. Its integer `label` column is used further down
# as the grouping key for the split (presumably one id per source video - confirm).
video_labels = pd.read_csv('/content/second_dataset_names.csv')
video_labels

Unnamed: 0,image_filename,label
0,V_HELICOPTER_0611_072.png,1
1,V_HELICOPTER_0611_086.png,1
2,V_HELICOPTER_0611_096.png,1
3,V_HELICOPTER_0611_097.png,1
4,V_HELICOPTER_0611_131.png,1
...,...,...
22495,two_uavs_plus_airplane_510.png,753
22496,two_uavs_plus_airplane_522.png,753
22497,two_uavs_plus_airplane_59.png,753
22498,two_uavs_plus_airplane_603.png,753


In [None]:
# Root of the extracted dataset and its two sub-directories.
base_path = '/content/hackathon_additional_dataset'
images_path, labels_path = (
    os.path.join(base_path, subdir) for subdir in ('images', 'labels')
)

In [None]:
# Display the resolved image and label directories.
images_path, labels_path

('/content/hackathon_additional_dataset/images',
 '/content/hackathon_additional_dataset/labels')

In [None]:
# Filenames of every image, in row order, for the label-parsing loop.
image_names = video_labels['image_filename'].values
image_names

array(['V_HELICOPTER_0611_072.png', 'V_HELICOPTER_0611_086.png',
       'V_HELICOPTER_0611_096.png', ..., 'two_uavs_plus_airplane_59.png',
       'two_uavs_plus_airplane_603.png', 'two_uavs_plus_airplane_646.png'],
      dtype=object)

In [None]:
from tqdm.auto import tqdm


def _read_object_label(label_file):
    """Return the integer class id stored as the first token of a label file.

    The file is assumed to be a whitespace-separated annotation whose first
    field is the class id (YOLO-style - TODO confirm). Only the first token is
    used, so any further annotation rows in the same file are ignored.

    Args:
        label_file: Path to the ``.txt`` annotation file.

    Returns:
        The class id as an ``int``.

    Raises:
        ValueError: If the file is empty or its first token is not an integer.
    """
    with open(label_file, 'r') as f:
        tokens = f.read().split(maxsplit=1)
    if not tokens:
        raise ValueError(f'No annotation found in label file: {label_file}')
    # Parse the whole first token rather than its first character, so class
    # ids with more than one digit are not silently truncated.
    return int(tokens[0])


object_labels = []
for image_name in tqdm(image_names):
    # splitext drops only the final extension and leaves dot-less names intact.
    label_path = os.path.splitext(image_name)[0] + '.txt'
    label_abs_path = os.path.join(labels_path, label_path)
    object_labels.append(_read_object_label(label_abs_path))

  0%|          | 0/22500 [00:00<?, ?it/s]

In [None]:
# Preview only the first few parsed labels; the full list has one entry per
# image and would flood the cell output.
object_labels[:10]

In [None]:
# Attach the parsed class ids as a new column; object_labels was built by
# iterating image_names, so it is in the same row order as video_labels.
video_labels['object_label'] = object_labels

In [None]:
# Number of images per object class across the whole dataset.
video_labels['object_label'].value_counts()

object_label
0    5000
1    5000
3    5000
4    5000
2    2500
Name: count, dtype: int64

In [None]:
# Inspect the frame now that the object_label column has been added.
video_labels

Unnamed: 0,image_filename,label,object_label
0,V_HELICOPTER_0611_072.png,1,2
1,V_HELICOPTER_0611_086.png,1,2
2,V_HELICOPTER_0611_096.png,1,2
3,V_HELICOPTER_0611_097.png,1,2
4,V_HELICOPTER_0611_131.png,1,2
...,...,...,...
22495,two_uavs_plus_airplane_510.png,753,0
22496,two_uavs_plus_airplane_522.png,753,0
22497,two_uavs_plus_airplane_59.png,753,0
22498,two_uavs_plus_airplane_603.png,753,0


In [None]:
# Inputs for the grouped, stratified split:
#   X      - image filenames (the samples),
#   y      - object class used for stratification,
#   groups - `label` ids whose rows must stay together in one fold.
X, y, groups = (
    video_labels[column].values
    for column in ('image_filename', 'object_label', 'label')
)

In [None]:
# Split into 4 stratified, group-disjoint folds and keep the held-out indices
# of the first two only (they become the validation and test sets below).
sgkf = StratifiedGroupKFold(n_splits=4, shuffle=False)
fold_iter = sgkf.split(X, y, groups=groups)
# zip against range(2) stops after exactly two folds have been drawn.
test_folds = [held_out_idx for _, (_, held_out_idx) in zip(range(2), fold_iter)]

In [None]:
# Row indices of the two held-out folds collected above.
test_folds

[array([   62,    63,    70, ..., 22001, 22002, 22003]),
 array([    0,     1,     2, ..., 22497, 22498, 22499])]

In [None]:
# Number of rows in each of the two held-out folds.
len(test_folds[0]), len(test_folds[1])

(5625, 5625)

In [None]:
# First held-out fold is used for validation, second one for testing.
val_idx, test_idx = test_folds[0], test_folds[1]

In [None]:
# val_idx / test_idx are positional row numbers produced by the splitter, so the
# training rows are selected positionally as well, instead of matching those
# numbers against index labels (only equivalent for a default RangeIndex).
held_out_mask = np.zeros(len(video_labels), dtype=bool)
held_out_mask[val_idx] = True
held_out_mask[test_idx] = True

# .copy() makes each split an independent frame, so later column edits on a
# split neither raise SettingWithCopyWarning nor depend on video_labels.
train_df = video_labels.iloc[~held_out_mask].copy()
val_df = video_labels.iloc[val_idx].copy()
test_df = video_labels.iloc[test_idx].copy()

# Every row must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(video_labels)

In [None]:
# Number of images per object class in the training split.
train_df['object_label'].value_counts()

object_label
0    2500
1    2500
3    2500
4    2500
2    1250
Name: count, dtype: int64

In [None]:
# Number of images per object class in the validation split.
val_df['object_label'].value_counts()

object_label
0    1250
1    1250
3    1250
4    1250
2     625
Name: count, dtype: int64

In [None]:
# Number of images per object class in the test split.
test_df['object_label'].value_counts()

object_label
0    1250
1    1250
3    1250
4    1250
2     625
Name: count, dtype: int64

In [None]:
# Distinct `label` groups in the train, validation and test splits, in that order.
tuple(split['label'].nunique() for split in (train_df, val_df, test_df))

(377, 188, 188)

In [None]:
# Leakage check: number of training rows whose group id also occurs in the
# validation split (0 means the two splits share no group). The column is still
# named `label` at this point; it is renamed to `video_label` only in a later
# cell, so referencing `video_label` here fails on a fresh top-to-bottom run.
int(train_df['label'].isin(val_df['label']).sum())

0

In [None]:
# Spot-check one group: list every row whose `label` equals 5.
video_labels[video_labels['label'] == 5]

Unnamed: 0,image_filename,label,object_label
62,V_HELICOPTER_0531_101.png,5,2
63,V_HELICOPTER_0531_248.png,5,2


In [None]:
# Rename `label` to `video_label` in all three splits. rename() returns a new
# frame, so rebinding the names avoids mutating a slice of video_labels in
# place, which is what triggers pandas' SettingWithCopyWarning.
train_df, val_df, test_df = (
    split.rename(columns={'label': 'video_label'})
    for split in (train_df, val_df, test_df)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'label': 'video_label'}, inplace=True)


In [None]:
# Inspect the test split after the `label` -> `video_label` rename.
test_df

Unnamed: 0,image_filename,video_label,object_label
0,V_HELICOPTER_0611_072.png,1,2
1,V_HELICOPTER_0611_086.png,1,2
2,V_HELICOPTER_0611_096.png,1,2
3,V_HELICOPTER_0611_097.png,1,2
4,V_HELICOPTER_0611_131.png,1,2
...,...,...,...
22495,two_uavs_plus_airplane_510.png,753,0
22496,two_uavs_plus_airplane_522.png,753,0
22497,two_uavs_plus_airplane_59.png,753,0
22498,two_uavs_plus_airplane_603.png,753,0


In [None]:
# Write each split to its own CSV in the working directory, without the index.
split_outputs = {
    'second_dataset_train.csv': train_df,
    'second_dataset_val.csv': val_df,
    'second_dataset_test.csv': test_df,
}
for csv_name, split_df in split_outputs.items():
    split_df.to_csv(csv_name, index=False)