# Split the original train set into new train and validation sets, and convert the .tsv label files to .csv

In [1]:
# Import packages
import pandas as pd

import os
import shutil
import random
import tqdm

from sklearn.model_selection import train_test_split

## Train split

In [None]:
# Base directory for the train/validation data paths. It is inserted directly
# in front of 'data/...', so it must end with a path separator.
work_dir_path = './'

In [2]:
# Directory prefixes shared by the paths below
original_set_dir = f'{work_dir_path}data/orginal_set/'
split_set_dir = f'{work_dir_path}data/train_val_split_csv/'

# Input: original train images and their tab-separated label file
train_images_path = f'{original_set_dir}train/'
train_labels_path = f'{original_set_dir}train.tsv'

# Output: validation images and labels
val_images_path = f'{split_set_dir}val/'
val_labels_path = f'{split_set_dir}val.csv'

# Output: reduced train images and labels
new_train_images_path = f'{split_set_dir}train/'
new_train_labels_path = f'{split_set_dir}train.csv'

In [5]:
# Seed Python's built-in `random` module. The train/validation split itself is
# made reproducible by the `random_state` passed to train_test_split.
# NOTE(review): no other use of `random` is visible in this file - confirm
# whether this seed is still needed.
random.seed(1234)

In [6]:
# Load the original train labels: a tab-separated file without a header row,
# so the column names are supplied here
df = pd.read_csv(
    train_labels_path,
    sep='\t',
    header=None,
    names=['image', 'label'],
)

In [7]:
# Preview the first rows to check that the columns were parsed as expected
df.head()

Unnamed: 0,image,label
0,aa1.png,Молдова
1,aa1007.png,продолжила борьбу
2,aa101.png,разработанные
3,aa1012.png,Плачи
4,aa1013.png,Гимны богам


In [16]:
# Report the (rows, columns) shape of the original label table
print(f'Original train set size is {df.shape}')

(72286, 2)

In [None]:
# Fraction of the original train set held out for validation (2.2%)
p = 0.022

In [18]:
# Hold out a fraction `p` of the rows for validation; the fixed random_state
# keeps the split identical across runs
train_df, val_df = train_test_split(df, test_size=p, random_state=1234)

In [19]:
# Report the (rows, columns) shape of both splits. The first message used to
# be labelled "validation" as well, although it prints the train split.
print(f'New train set size is {train_df.shape}')
print(f'New validation set size is {val_df.shape}')

(1591, 2)

In [22]:
def _copy_images(labels_df, src_dir, dst_dir):
    """Copy every image listed in ``labels_df['image']`` from src_dir to dst_dir.

    Args:
        labels_df: DataFrame with an 'image' column holding image file names.
        src_dir: Directory that currently contains the image files.
        dst_dir: Directory to copy the images into; created if missing.

    Raises:
        FileNotFoundError: If a listed image does not exist in src_dir.
    """
    # shutil.copy does not create missing directories, so make sure the
    # destination exists before the first copy
    os.makedirs(dst_dir, exist_ok=True)

    # Only the file names are needed, so iterate the column directly; passing
    # the total lets tqdm show a real progress bar with an ETA
    for image_name in tqdm.tqdm(labels_df['image'], total=len(labels_df)):
        shutil.copy(
            os.path.join(src_dir, image_name),
            os.path.join(dst_dir, image_name),
        )


# Copy the images of the new train split into its own directory
_copy_images(train_df, train_images_path, new_train_images_path)

# Copy the images of the validation split into its own directory
_copy_images(val_df, train_images_path, val_images_path)

70695it [03:29, 337.93it/s]
1591it [00:04, 358.50it/s]


In [25]:
# Write the label table of each split to its .csv file, leaving out the
# DataFrame index
for split_df, csv_path in (
    (train_df, new_train_labels_path),
    (val_df, val_labels_path),
):
    split_df.to_csv(csv_path, index=False)

## Convert test label file to .csv

In [34]:
# Test label paths, built from work_dir_path like the train/validation paths
# so that changing the work directory moves all of them together.
# NOTE(review): 'orginal_set' is spelled this way in every path in this file;
# presumably it matches the directory name on disk - confirm before renaming.
test_labels_path = f'{work_dir_path}data/orginal_set/test.tsv'
new_test_labels_path = f'{work_dir_path}data/train_val_split_csv/test.csv'

In [29]:
# Load the test labels: a tab-separated file without a header row, so the
# column names are supplied here
test_df = pd.read_csv(
    test_labels_path,
    sep='\t',
    header=None,
    names=['image', 'label'],
)

In [None]:
# Report the (rows, columns) shape of the test label table
print(f'Test set size is {test_df.shape}')

In [33]:
# Write the test labels to a .csv file, leaving out the DataFrame index
test_df.to_csv(new_test_labels_path, index=False)