# Tackling the error
**Problems:**
- There are 7 missing images
- There is a corrupt image

### Import the needed packages

In [1]:
# handling files
import os
# handling the python run
import subprocess as sp
import sys

# handling metadata
import pandas as pd
# handling the environment variables
from dotenv import load_dotenv
# handling images
from PIL import Image
# handling the pandas warnings
pd.options.mode.copy_on_write = True
# loading the environment variables
load_dotenv()

True

### Global variables

In [2]:
# Project root, taken from the ORI_PATH environment variable (which may come
# from the .env file loaded in the import cell).
path_full = os.environ.get("ORI_PATH")
if path_full is None:
    # Fail here with an explicit message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise on the next statement.
    raise RuntimeError("ORI_PATH is not set; define it in the environment or in the .env file")

# Folder whose <image type>/<label> sub-folders are listed later in the notebook.
dataset_path = os.path.join(path_full, 'datasets/preprocessed')
# Metadata tables used throughout the notebook.
metadata = pd.read_csv(os.path.join(path_full, "data/refactored_metadata.csv"))
src_metadata = pd.read_csv(os.path.join(path_full, "data/raw_metadata.csv"))

### Analyse the data

In [3]:
# Show the metadata with the new_path column left out of the display
# (drop returns a new frame; metadata itself is unchanged).
metadata.drop("new_path", axis="columns")

Unnamed: 0,id,file_name
0,64992,fff_1_64992_r_n.jpg
1,64992,fff_1_64992_l_n.jpg
2,63298,fff_1_63298_r_1.jpg
3,63298,fff_1_63298_r_3.jpg
4,63298,fff_1_63298_r_2.jpg
...,...,...
560,121355,oct_0_121355_l_).jpg
561,121355,oct_0_121355_l_).jpg
562,121856,oct_0_121856_u_y.jpg
563,48763,oct_0_48763_r_d.jpg


In [4]:
# Empty single-column frame that will hold the names of the image files
# actually present on disk.
real_data = pd.DataFrame(data=None, columns=['file_name'])

In [5]:
# Walk the dataset folders and collect the name of every file found, then build
# the frame in one step. Rebuilding real_data from scratch (instead of
# appending row by row to the existing frame) keeps this cell safe to re-run
# and avoids the quadratic cost of growing a DataFrame inside a loop.
found_files = [
    filename
    for img_type in ['fundus_image', 'oct_image']
    for label in ['glaucoma', 'non_glaucoma']
    for filename in os.listdir(os.path.join(dataset_path, img_type, label))
]
real_data = pd.DataFrame({'file_name': found_files})

# the id is the third underscore-separated field of the file name
real_data['id'] = real_data['file_name'].apply(lambda x: x.split('_')[2])
# keep the positional index as an explicit 'index' column
real_data = real_data.reset_index(names='index')
real_data

Unnamed: 0,index,file_name,id
0,0,fff_1_100287_l_).jpg,100287
1,1,fff_1_100287_r_).jpg,100287
2,2,fff_1_10207_l_1.jpg,10207
3,3,fff_1_10207_l_2.jpg,10207
4,4,fff_1_10207_r_1.jpg,10207
...,...,...,...
553,553,oct_0_83528_r_l.jpg,83528
554,554,oct_0_84739_r_s.jpg,84739
555,555,oct_0_87588_r_s.jpg,87588
556,556,oct_0_89703_r_a.jpg,89703


#### Search for corrupt images

In [6]:
# File names of every metadata row whose image fails one of the checks below.
corrupt_files = []
for row in metadata.itertuples(index=False):
    try:
        image_path = os.path.join(path_full, row.new_path)
        # check that the file exists and that PIL accepts it as an image
        with Image.open(image_path) as img:
            img.verify()
        # check that the image is complete: a JPEG must end with the
        # end-of-image marker FF D9. Only the last two bytes are read, so the
        # whole file never has to be loaded into memory.
        with open(image_path, 'rb') as raw:
            raw.seek(-2, os.SEEK_END)
            tail = raw.read(2)
        if tail != b'\xff\xd9':
            print(f"image not complete error : {row.file_name}")
            corrupt_files.append(row.file_name)
    except Exception as e:
        # Any failure (missing file, unreadable image, ...) marks the row as
        # corrupt so that the scan continues with the remaining rows.
        print(f"Raised an exception: {e}",
                f"file_name: {row.file_name}",
                sep="\n")
        corrupt_files.append(row.file_name)

image not complete error : fff_0_122451_l_1.jpg


### Comparing information from metadata and real image file

In [7]:
# Left-join the on-disk listing onto the metadata by file name: every metadata
# row is kept, and rows without a matching file get NaN in the real_data columns.
validated_data = pd.merge(metadata, real_data, how='left', on='file_name')
# display without the new_path column
validated_data.drop("new_path", axis="columns")

Unnamed: 0,id_x,file_name,index,id_y
0,64992,fff_1_64992_r_n.jpg,141,64992
1,64992,fff_1_64992_l_n.jpg,140,64992
2,63298,fff_1_63298_r_1.jpg,137,63298
3,63298,fff_1_63298_r_3.jpg,139,63298
4,63298,fff_1_63298_r_2.jpg,138,63298
...,...,...,...,...
560,121355,oct_0_121355_l_).jpg,506,121355
561,121355,oct_0_121355_l_).jpg,506,121355
562,121856,oct_0_121856_u_y.jpg,532,121856
563,48763,oct_0_48763_r_d.jpg,545,48763


### Identify the problem

In [8]:
# Keep only the rows of validated_data that repeat an earlier row in every
# column, then reshape them into an error table:
#   - the merge helper columns are dropped and id_x / new_path are renamed,
#   - the original row label becomes error_id, extended with the first two
#     underscore-separated fields of the file name,
#   - path is prefixed with './../../'.
miss_data = (
    validated_data.loc[validated_data.duplicated()]
    .drop(columns=['index', 'id_y'])
    .rename(columns={'id_x': 'id', 'new_path': 'path'})
    .reset_index(names='error_id')
    .assign(
        error_id=lambda frame: (
            frame['error_id'].astype('str') + '_'
            + frame['file_name'].map(lambda name: name.split('_')[0]) + '_'
            + frame['file_name'].map(lambda name: name.split('_')[1])
        ),
        path=lambda frame: frame['path'].map(
            lambda rel_path: os.path.join('./../../', rel_path)
        ),
    )
)
# display without the path column
miss_data.drop("path", axis="columns")

Unnamed: 0,error_id,id,file_name
0,28_fff_1,120793,fff_1_120793_r_).jpg
1,29_fff_1,120793,fff_1_120793_r_).jpg
2,30_fff_1,120793,fff_1_120793_r_).jpg
3,31_fff_1,120793,fff_1_120793_r_).jpg
4,33_fff_1,120793,fff_1_120793_l_).jpg
5,34_fff_1,120793,fff_1_120793_l_).jpg
6,561_oct_0,121355,oct_0_121355_l_).jpg


**Found the reason why 7 images are missing:**
There are 7 duplicated rows in the metadata.

### Start Solving The Problem

#### Solve 553_oct_0

In [9]:
# Full paths of the raw OCT entries recorded for id 121355, built by joining
# each stored path onto the project root.
miss_553_oct_0 = [
    os.path.join(path_full, rel_path)
    for rel_path in src_metadata.loc[
        (src_metadata.id == 121355) & (src_metadata.img_type == 'oct'),
        'path',
    ]
]

In [10]:
# Rename the first matching OCT file so that 'OS (121356)' in its path becomes
# 'OD (121355)'. A missing source file is reported rather than raised, which
# keeps the cell harmless when the rename has already been done.
first_oct_path = miss_553_oct_0[0]
try:
    os.rename(first_oct_path, first_oct_path.replace('OS (121356)', 'OD (121355)'))
except FileNotFoundError:
    print('File not found or already renamed')

#### Solve 28_fff_1, 29_fff_1, 30_fff_1, 31_fff_1, 33_fff_1, 34_fff_1

In [11]:
# 'src': paths recorded in the raw metadata for the fundus images of id 120793.
miss_rest = {
    'src': list(
        src_metadata.loc[
            (src_metadata.id == 120793) & (src_metadata.img_type == 'fundus'),
            'path',
        ]
    )
}

# 'new': the same paths with the parentheses removed - the '(' that follows
# '3 ' or '3    ' and the ')' that precedes a dot - joined onto the project root.
miss_rest['new'] = [
    os.path.join(
        path_full,
        rel_path.replace('3 (', '3 ').replace('3    (', '3    ').replace(').', '.'),
    )
    for rel_path in miss_rest['src']
]
# Turn the source paths into full paths as well (done last so that 'new' is
# derived from the untouched relative paths).
miss_rest['src'] = [os.path.join(path_full, rel_path) for rel_path in miss_rest['src']]

In [12]:
# Apply the planned renames one pair at a time. A missing source is only
# reported, so the remaining pairs are still processed and the cell can be
# run again after a successful rename.
rename_plan = zip(miss_rest['src'], miss_rest['new'])
for srcfile, newfile in rename_plan:
    try:
        os.rename(srcfile, newfile)
    except FileNotFoundError:
        print(f'{srcfile} not found or already renamed')

### Rerun The Scripts
* b. create_metadata.py
* c. refactor_dataset.py

In [13]:
# Re-run the metadata script with the same interpreter that runs this kernel.
# The command is passed as an argument list without shell=True: a list
# combined with shell=True only works on Windows, while on POSIX just the
# first item ("python") would be executed and the script name ignored.
sp.run([sys.executable, "b. create_metadata.py"], capture_output=True)

CompletedProcess(args=['python', 'b. create_metadata.py'], returncode=0, stdout=b'completed create metadata.\r\n', stderr=b'Corrupt JPEG data: premature end of data segment\r\n')

In [14]:
# Re-run the dataset refactoring script with the same interpreter that runs
# this kernel. The command is passed as an argument list without shell=True:
# a list combined with shell=True only works on Windows, while on POSIX just
# the first item ("python") would be executed and the script name ignored.
sp.run([sys.executable, "c. refactor_dataset.py"], capture_output=True)

CompletedProcess(args=['python', 'c. refactor_dataset.py'], returncode=0, stdout=b'completed refactoring dataset.\r\n', stderr=b'')

#### Solve the corrupt image

In [15]:
# Names to delete: for every corrupt image found earlier, its stem (the part of
# the file name before the first dot) and the matching "<stem>_mask" name.
# Every entry of corrupt_files is handled, and an empty list simply results in
# nothing being deleted instead of an IndexError.
corrupt_stems = set()
for corrupt_name in corrupt_files:
    stem = corrupt_name.split(".")[0]
    corrupt_stems.update((stem, f"{stem}_mask"))

# The dataset is laid out as <image type>/<label>/<file>; visit every file and
# remove the ones whose stem is in the set built above.
for dir_img_type in os.listdir(dataset_path):
    for dir_label in os.listdir(os.path.join(dataset_path, dir_img_type)):
        label_dir = os.path.join(dataset_path, dir_img_type, dir_label)
        for file in os.listdir(label_dir):
            if file.split(".")[0] not in corrupt_stems:
                continue
            try:
                os.remove(os.path.join(label_dir, file))
                print(f"Deleted {file}")
            except OSError as e:
                # Report the failure and continue with the remaining files.
                print(f"Error: {e}")

Deleted fff_0_122451_l_1.jpg
