In [10]:
import pandas as pd

In [11]:
# Read the list of small-image file names. The text file has no header row,
# so the column is given the name "image" explicitly.
small_images_file = "all_small_images.txt"
all_download_small_images = pd.read_csv(small_images_file, names=["image"], header=None)
all_download_small_images

Unnamed: 0,image
0,a0000001-2.png
1,a0000002-2.png
2,a0000003-2.png
3,a0000004-2.png
4,a0000005-2.png
...,...
189840,b0547524-3.png
189841,b0547524-4.png
189842,b0547525-1.png
189843,b0547525-2.png


In [12]:
# Load the totas_id -> image file-name mapping; the CSV supplies its own header.
mapping_csv = "mover_to_image_mapping.csv"
raw_mover_image_mapping = pd.read_csv(mapping_csv)
raw_mover_image_mapping

Unnamed: 0,totas_id,image
0,504421,b0504421-2.png
1,504421,b0504421-3.png
2,504422,b0504422-1.png
3,504422,b0504422-2.png
4,504423,b0504423-1.png
...,...,...
154963,547524,b0547524-3.png
154964,547524,b0547524-2.png
154965,547525,b0547525-4.png
154966,547525,b0547525-2.png


In [20]:
# Load the existing training table (its mover_id column is compared against
# the candidate test movers further down).
training_csv = "../../data/csv/all_movers.csv"
training_data = pd.read_csv(training_csv)
training_data

Unnamed: 0,mover_id,file_name,label
0,TO18060,b0545775-1.png,1
1,TO18060,b0545775-2.png,1
2,TO18060,b0545775-3.png,1
3,TO18060,b0545775-4.png,1
4,TO18059,b0545771-1.png,1
...,...,...,...
77931,M322883,b0524086-4.png,0
77932,M322882,b0524085-1.png,0
77933,M322882,b0524085-2.png,0
77934,M322882,b0524085-3.png,0


In [13]:
# Keep only the totas_ids that appear exactly four times in the raw mapping.
# A vectorised group-size lookup replaces the per-group Python lambda of
# `groupby(...).filter(...)`; it selects the same rows in the same order
# without calling back into Python once per group.
IMAGES_PER_TOTAS_ID = 4
rows_per_totas_id = raw_mover_image_mapping.groupby("totas_id")["totas_id"].transform(
    "size"
)
mover_image_mapping = raw_mover_image_mapping[rows_per_totas_id == IMAGES_PER_TOTAS_ID]
mover_image_mapping

Unnamed: 0,totas_id,image
98,504481,b0504481-1.png
99,504481,b0504481-2.png
100,504481,b0504481-4.png
101,504481,b0504481-3.png
102,504482,b0504482-1.png
...,...,...
154773,547447,b0547447-3.png
154774,547448,b0547448-4.png
154775,547448,b0547448-1.png
154776,547448,b0547448-2.png


In [14]:
# Count the distinct totas_ids directly rather than dividing the row count by
# four: the result no longer depends on float division or on every id having
# exactly four rows.
n_movers = mover_image_mapping["totas_id"].nunique()
print(f"Number of movers: {n_movers:_}")

Number of movers: 32_224


In [16]:
# Pull out just the image file-name column of the filtered mapping.
test_images_raw = mover_image_mapping.loc[:, "image"]
test_images_raw

98        b0504481-1.png
99        b0504481-2.png
100       b0504481-4.png
101       b0504481-3.png
102       b0504482-1.png
               ...      
154773    b0547447-3.png
154774    b0547448-4.png
154775    b0547448-1.png
154776    b0547448-2.png
154777    b0547448-3.png
Name: image, Length: 128896, dtype: object

In [23]:
# Collapse the image names into a set so they can be compared with set algebra.
# (Message typo fixed: "Lenght" -> "Length".)
test_images_raw = set(test_images_raw)
print(f"Length of test images: {len(test_images_raw):_}")

Lenght of test images: 128_896


In [22]:
# Set of every image file name listed in the downloaded-images table.
# (Message typo fixed: "Lenght" -> "Length".)
already_downloaded_images = set(all_download_small_images["image"])
print(f"Length of already downloaded images: {len(already_downloaded_images):_}")

Lenght of already downloaded images: 189_845


In [24]:
# Images that are in the test selection and also listed among the already
# downloaded images (both operands are sets at this point).
test_images_already_downloaded = test_images_raw & already_downloaded_images
n_test_images_already_downloaded = len(test_images_already_downloaded)
print(f"Length of test images already downloaded: {n_test_images_already_downloaded:_}")

Length of test images already downloaded: 128_896


In [25]:
# Show the current mover_image_mapping frame again for reference.
mover_image_mapping

Unnamed: 0,totas_id,image
98,504481,b0504481-1.png
99,504481,b0504481-2.png
100,504481,b0504481-4.png
101,504481,b0504481-3.png
102,504482,b0504482-1.png
...,...,...
154773,547447,b0547447-3.png
154774,547448,b0547448-4.png
154775,547448,b0547448-1.png
154776,547448,b0547448-2.png


In [26]:
# Load the lookup table that is later joined onto the image mapping via
# totas_id (the file is read as CSV despite its .txt extension).
movers_lookup_file = "movers_2021-2024_cleaned.txt"
totas_id_to_mover = pd.read_csv(movers_lookup_file)
totas_id_to_mover

Unnamed: 0,totas_id,mover_id,label
0,504421,TKCCA25,1
1,504422,TKCCA26,1
2,504423,TKCCA27,1
3,504424,TKCCA28,1
4,504425,TKCCA29,1
...,...,...,...
42935,547521,TO2C148,1
42936,547522,TO2C149,1
42937,547523,TO2C150,1
42938,547524,TO2C151,1


In [27]:
# Now lets get the mover ids of the mover_image_mapping
mover_image_mapping = mover_image_mapping.merge(totas_id_to_mover, on="totas_id")
mover_image_mapping

Unnamed: 0,totas_id,image,mover_id,label
0,504481,b0504481-1.png,TL28001,1
1,504481,b0504481-2.png,TL28001,1
2,504481,b0504481-4.png,TL28001,1
3,504481,b0504481-3.png,TL28001,1
4,504482,b0504482-1.png,OG21023,1
...,...,...,...,...
128891,547447,b0547447-3.png,M332757,0
128892,547448,b0547448-4.png,TO2C076,1
128893,547448,b0547448-1.png,TO2C076,1
128894,547448,b0547448-2.png,TO2C076,1


In [42]:
# Distinct mover ids present in the merged mapping.
# The printed label now says "movers": the set holds mover ids, not images.
test_data_movers = set(mover_image_mapping["mover_id"])
print(f"Length of test data movers: {len(test_data_movers):_}")

Length of test data images: 32_224


In [43]:
# Distinct mover ids present in the training table.
# The printed label now says "movers": the set holds mover ids, not images.
training_data_movers = set(training_data["mover_id"])
print(f"Length of training data movers: {len(training_data_movers):_}")

Length of training data images: 19_484


In [44]:
# Mover ids that occur both in the candidate test movers and in the training
# data (both operands are sets).
test_data_movers_already_downloaded = test_data_movers & training_data_movers
n_movers_already_downloaded = len(test_data_movers_already_downloaded)
print(f"Length of test data movers already downloaded: {n_movers_already_downloaded:_}")

Length of test data movers already downloaded: 19_482


In [45]:
# Make the test data of movers that are not in the test_data_movers_already_downloaded
test_data_movers_to_download = test_data_movers - test_data_movers_already_downloaded
print(f"Length of test data movers to download: {len(test_data_movers_to_download):_}")

Length of test data movers to download: 12_742


In [46]:
# Keep only the mapping rows whose mover_id is in the to-download set.
is_mover_to_download = mover_image_mapping["mover_id"].isin(test_data_movers_to_download)
test_data_set = mover_image_mapping.loc[is_mover_to_download]
test_data_set

Unnamed: 0,totas_id,image,mover_id,label
0,504481,b0504481-1.png,TL28001,1
1,504481,b0504481-2.png,TL28001,1
2,504481,b0504481-4.png,TL28001,1
3,504481,b0504481-3.png,TL28001,1
4,504482,b0504482-1.png,OG21023,1
...,...,...,...,...
128871,547442,b0547442-4.png,TO2C075,1
128892,547448,b0547448-4.png,TO2C076,1
128893,547448,b0547448-1.png,TO2C076,1
128894,547448,b0547448-2.png,TO2C076,1


In [47]:
# Write the selected rows to disk; the DataFrame index is not written.
test_data_set.to_csv(path_or_buf="test_data_set.csv", index=False)

In [50]:
# Distinct totas_ids in the test set, in order of first appearance.
pd.unique(test_data_set["totas_id"])

array([504481, 504482, 504483, ..., 547432, 547442, 547448])

In [51]:
# Distinct mover_ids in the test set, in order of first appearance.
pd.unique(test_data_set["mover_id"])

array(['TL28001', 'OG21023', 'TL28002', ..., 'TO2C074', 'TO2C075',
       'TO2C076'], dtype=object)