In [39]:
import pandas as pd
import numpy as np
import os, shutil

def save_label_to_csv(save_file, df):
    """Dump a two-column frame to ``save_file`` as tab-separated UTF-8 text.

    Every row of ``df`` becomes one line: the first value (expected to be a
    string, since it is concatenated directly), a tab, ``str()`` of the second
    value, and a newline. No header or index is written, and an existing file
    at ``save_file`` is overwritten.
    """
    with open(save_file, 'w', encoding='utf-8') as out:
        out.writelines(
            path + '\t' + str(label) + '\n'
            for path, label in df.values
        )


def move_file(source_folder, destination_folder, file_name):
    """Move ``file_name`` from ``source_folder`` into ``destination_folder``.

    The destination folder is created if it does not exist. Progress and
    failures are reported with ``print`` rather than raised, so a batch of
    moves keeps going after an individual failure.

    Args:
        source_folder: Folder that currently holds the file.
        destination_folder: Folder to move the file into.
        file_name: Name of the file, relative to both folders.

    Returns:
        bool: True if the file was moved; False if the source was missing or
        ``shutil.move`` failed. Callers that ignore the result behave as before.
    """
    source_path = os.path.join(source_folder, file_name)
    if not os.path.exists(source_path):
        print(f"Source: {source_path}, file '{file_name}' not found in '{source_folder}'.")
        return False

    # exist_ok=True covers the folder appearing between the check and the
    # creation (e.g. another process creating it), which would otherwise raise.
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder, exist_ok=True)

    destination_path = os.path.join(destination_folder, file_name)

    try:
        shutil.move(source_path, destination_path)
    except Exception as e:
        # Best effort by design: report the failure and let the caller continue.
        print(f"Failed to move the file: {e}")
        return False

    print(f"File '{file_name}' moved from '{source_folder}' to '{destination_folder}'.")
    return True

In [22]:
# Load the train and validation label files. Each is tab-separated with no
# header row, so columns are addressed by position (0 and 1).
train_label_path = "/work/quang.domanh/datasets/invoices/bank/TrainLabel_invoices.txt"
val_label_path = "/work/quang.domanh/datasets/invoices/bank/ValLabel_invoices.txt"

df_train = pd.read_csv(train_label_path, sep="\t", header=None)
df_val = pd.read_csv(val_label_path, sep="\t", header=None)

# Raw NumPy views of the rows, used by the shuffling/splitting step.
data_train, data_val = df_train.values, df_val.values

In [26]:
# Carve a test set out of the train and validation rows.
#
# Each split gives up its first `len(split) % SPLIT_MODULUS` shuffled rows,
# i.e. the remainder that keeps the split from being a whole multiple of
# SPLIT_MODULUS. A split whose size is already a multiple contributes nothing.
SPLIT_SEED = 42       # fixed seed so Restart & Run All reproduces the same split
SPLIT_MODULUS = 1000

split_rng = np.random.default_rng(SPLIT_SEED)

# `permutation` returns shuffled copies (rows permuted along axis 0), so the
# source frames are never mutated and re-running this cell gives the same
# result instead of reshuffling already-shuffled data.
data_train = split_rng.permutation(df_train.values)
data_val = split_rng.permutation(df_val.values)

n_test_from_train = len(data_train) % SPLIT_MODULUS
n_test_from_val = len(data_val) % SPLIT_MODULUS

data_test = np.concatenate(
    (data_train[:n_test_from_train], data_val[:n_test_from_val]),
    axis=0,
)

# Sanity check: the test set is exactly the two held-out remainders.
assert len(data_test) == n_test_from_train + n_test_from_val

len(data_test), n_test_from_val, n_test_from_train

(1000, 203, 797)

In [28]:
# Rows before these offsets were handed to the test set; the rest stay put.
train_offset = len(data_train) % 1000
val_offset = len(data_val) % 1000

# np.array(...) copies each slice so the new frames do not share memory
# with the shuffled source arrays.
train_rows = np.array(data_train[train_offset:])
val_rows = np.array(data_val[val_offset:])
test_rows = np.array(data_test)

df_train_new = pd.DataFrame(train_rows)
df_val_new = pd.DataFrame(val_rows)
df_test_new = pd.DataFrame(test_rows)

In [34]:
# Write the reduced training labels as tab-separated text, then preview them.
train_label_out = "/work/quang.domanh/datasets/invoices/bank/TrainLabel_invoices_new.txt"
save_label_to_csv(train_label_out, df_train_new)
df_train_new.head()

Unnamed: 0,0,1
0,train/88ef70eec9ecdfb4cb1786b84b76c50a.jpg,"[{""transcription"": ""21:25"", ""points"": [[36, 11..."
1,train/d9d1050d8dc369be0bf112b7a6dbbb0b.jpg,"[{""transcription"": ""18:45"", ""points"": [[16, 0]..."
2,train/aa6f3e8481e6c3eda335445e06336981.jpg,"[{""transcription"": ""18:22"", ""points"": [[26, 7]..."
3,train/4f35cc99d535f6ad27bdd59d85ee8012.jpg,"[{""transcription"": ""19:18"", ""points"": [[28, 10..."
4,train/b00bf2257a83d47dcfe28fd7c9a5a5ef.jpg,"[{""transcription"": ""21.01"", ""points"": [[140, 3..."


In [35]:
# Write the reduced validation labels as tab-separated text, then display the frame.
val_label_out = "/work/quang.domanh/datasets/invoices/bank/ValLabel_invoices_new.txt"
save_label_to_csv(val_label_out, df_val_new)
df_val_new

Unnamed: 0,0,1
0,val/debe3fc4f6fc46e0ea53a3911cf7bc11.jpg,"[{""transcription"": ""MSB"", ""points"": [[132, 14]..."
1,val/30b43fa13ea1500684e8f1167b0f2300.jpg,"[{""transcription"": ""4855"", ""points"": [[28, 8],..."
2,val/623b9ca426056fb936e86d30d51199dc.jpg,"[{""transcription"": ""CHUYENKHOAN THANH CONG"", ""..."
3,val/61715ca45eba4b1a777c8d075a56228a.jpg,"[{""transcription"": ""Hoa don"", ""points"": [[139,..."
4,val/9fe64d9d4c2658230d7c2e39b556a741.jpg,"[{""transcription"": ""18:40"", ""points"": [[47, 9]..."
...,...,...
995,val/45f707ce651de5e0e0971e8d56b10a18.jpg,"[{""transcription"": ""19.16"", ""points"": [[27, 6]..."
996,val/f1a0263eaf9865d6965019401b674bfa.jpg,"[{""transcription"": ""1613"", ""points"": [[1, 5], ..."
997,val/79d0f2e00b29b22a6692bacb3c3f9ce9.jpg,"[{""transcription"": ""MSB"", ""points"": [[183, 0],..."
998,val/0492f1ce69cd5b2526433daaa64ae60e.jpg,"[{""transcription"": ""9:20"", ""points"": [[38, 15]..."


In [36]:
# Write the held-out test labels as tab-separated text, then display the frame.
test_label_out = "/work/quang.domanh/datasets/invoices/bank/TestLabel_invoices_new.txt"
save_label_to_csv(test_label_out, df_test_new)
df_test_new

Unnamed: 0,0,1
0,train/5027bb64470e2d403d14a24acf2f672d.jpg,"[{""transcription"": ""ch.P"", ""points"": [[48, 21]..."
1,train/9726ddf07ccb723c70f22cf9f2e14800.jpg,"[{""transcription"": ""Giao dich thanh cong!"", ""p..."
2,train/c9c4dcf1ec5711cd965a0d18cedea91d.jpg,"[{""transcription"": ""Chi tiet giao djch"", ""poin..."
3,train/a7202b41f84e14685c0627218dd386cf.jpg,"[{""transcription"": ""MSB"", ""points"": [[176, 11]..."
4,train/18b77416aff1820dbd5c56f604cea85d.jpg,"[{""transcription"": ""19:32"", ""points"": [[19, 8]..."
...,...,...
995,val/e45275574e0ab2dd448aa608d5dd17d2.jpg,"[{""transcription"": ""17:10"", ""points"": [[24, 13..."
996,val/9df5d07aef46b7fe6dd802d90dab348a.jpg,"[{""transcription"": ""7"", ""points"": [[13, 0], [4..."
997,val/ef0241ffa1861a5443a00043ea3434f0.jpg,"[{""transcription"": ""13:30"", ""points"": [[11, 8]..."
998,val/0ed67c4cc895171a68904cc0fb512f1d.jpg,"[{""transcription"": ""VCBDigibank"", ""points"": [[..."


In [37]:
# Read the saved test label file back (tab-separated, no header) to confirm
# it round-trips, and preview the first rows.
test_label_path = "/work/quang.domanh/datasets/invoices/bank/TestLabel_invoices_new.txt"
df_test = pd.read_csv(test_label_path, sep="\t", header=None)
df_test.head()

Unnamed: 0,0,1
0,train/5027bb64470e2d403d14a24acf2f672d.jpg,"[{""transcription"": ""ch.P"", ""points"": [[48, 21]..."
1,train/9726ddf07ccb723c70f22cf9f2e14800.jpg,"[{""transcription"": ""Giao dich thanh cong!"", ""p..."
2,train/c9c4dcf1ec5711cd965a0d18cedea91d.jpg,"[{""transcription"": ""Chi tiet giao djch"", ""poin..."
3,train/a7202b41f84e14685c0627218dd386cf.jpg,"[{""transcription"": ""MSB"", ""points"": [[176, 11]..."
4,train/18b77416aff1820dbd5c56f604cea85d.jpg,"[{""transcription"": ""19:32"", ""points"": [[19, 8]..."


In [None]:
# Move every image listed in the test label file out of its original
# sub-folder (the directory part of the stored relative path) into a single
# flat `test/` folder under the dataset root.
bank_dir = "/work/quang.domanh/datasets/invoices/bank"  # avoids shadowing the builtin `dir`
test_dir = os.path.join(bank_dir, "test")               # loop-invariant destination

test_rel_paths = list(df_test[0].values)
test_basenames = [os.path.basename(rel_path) for rel_path in test_rel_paths]

# The source folders are flattened into one destination, so two images that
# share a file name would overwrite each other when moved. Stop before
# touching any file if that would happen.
if len(set(test_basenames)) != len(test_basenames):
    raise ValueError(
        "Duplicate file names in the test split; moving them into one folder "
        "would overwrite images."
    )

# NOTE(review): the saved test label file still stores the original
# sub-folder prefixes in column 0; after this move the images live under
# `test/`. Confirm whether the label paths need to be rewritten as well.
for rel_path, base_name in zip(test_rel_paths, test_basenames):
    move_file(source_folder=os.path.join(bank_dir, os.path.dirname(rel_path)),
              destination_folder=test_dir,
              file_name=base_name)