### Image extraction from tar files

1. Extract images from the tar files
2. Rename the images to their respective "Roco Names"
3. Transfer the images to the "images" folder
4. Delete all temporary directories used for the image extractions




**- Note: The tar files were downloaded via the command line.**

- Also, `extract_images.py` was run from the command line to perform the steps listed above

In [1]:
import os
import pandas as pd
import numpy as np
import re
from os import walk
import math
from collections import Counter

import shutil
import sys
import tarfile

In [2]:
### Show the current working directory; the relative paths used in the cells below are resolved against it
os.getcwd()

'/home/ec2-user/SageMaker'

In [3]:
# List the contents of the training split's radiology folder
os.listdir('roco-dataset/data/train/radiology')

['.ipynb_checkpoints',
 'roco_data',
 'captions.txt',
 'ext_list.txt',
 'semtypes.txt',
 'dlinks.txt',
 'licences.txt',
 'images',
 'url_list.txt',
 'cuis.txt',
 'keywords.txt']

In [4]:
# Paths (relative to the working directory) of the dlinks.txt download table of each dataset split
train_download_links = 'roco-dataset/data/train/radiology/dlinks.txt'
test_download_links = 'roco-dataset/data/test/radiology/dlinks.txt'
val_download_links = 'roco-dataset/data/validation/radiology/dlinks.txt'

### Train Dataset --- get the url_links for the training dataset

In [5]:
# Training download table: tab-separated, no header row (columns are labelled 0, 1, 2)
df_train = pd.read_csv(train_download_links, sep='\t', header=None)

In [6]:
# Preview the first rows of the raw training table
df_train.head()

Unnamed: 0,0,1,2
0,ROCO_00002,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,AMHSR-4-14-g002.jpg
1,ROCO_00003,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,IJD2009-150251.001.jpg
2,ROCO_00004,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,11999_2007_30_Fig6_HTML.jpg
3,ROCO_00005,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,IJD2013-683423.005.jpg
4,ROCO_00007,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,amjcaserep-17-301-g001.jpg


In [7]:
# (number of rows, number of columns) of the training table
df_train.shape

(65450, 3)

In [8]:
# Count the missing values in each column
df_train.isnull().sum()

0    0
1    0
2    0
dtype: int64

In [9]:
# Column 1 holds a "wget -r <url>" command; its third whitespace-separated token is the archive URL
df_train['url_list'] = [command.split()[2] for command in df_train[1]]
# Last path component of the URL minus its final 7 characters (presumably the ".tar.gz" suffix) is the PMC id
df_train['pmc_id'] = [url.split('/')[-1][:-7] for url in df_train['url_list']]
# "<pmc_id>/<image file name>"
df_train['img'] = df_train['pmc_id'] + '/' + df_train[2]

In [10]:
# Order the rows by PMC id
df_train = df_train.sort_values(by='pmc_id')

In [11]:
# Renumber the rows 0..n-1; the previous index is kept as a new "index" column
df_train = df_train.reset_index()

In [12]:
# Discard the "index" column that holds the pre-sort row numbers
df_train = df_train.drop(columns=['index'])

In [13]:
# Preview the training table after sorting and adding the url_list / pmc_id / img columns
df_train.head()

Unnamed: 0,0,1,2,url_list,pmc_id,img
0,ROCO_35947,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,1471-2482-2-1-4.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC101389,PMC101389/1471-2482-2-1-4.jpg
1,ROCO_38384,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,neh055f1.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1062148,PMC1062148/neh055f1.jpg
2,ROCO_56680,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,cc2926-3.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1065025,PMC1065025/cc2926-3.jpg
3,ROCO_03822,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,cc2926-1.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1065025,PMC1065025/cc2926-1.jpg
4,ROCO_09480,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,cc2940-1.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1065094,PMC1065094/cc2940-1.jpg


In [215]:
###os.chdir('/home/ec2-user/SageMaker')

In [244]:
### Write the training archive URLs to ext_list.txt, one URL per line, for the downloads
with open('ext_list.txt', 'w') as url_file:
    url_file.writelines(
        "{}\n".format(url) for url in df_train['url_list'].values.tolist()
    )

In [120]:
####THIS FUNCTION IS == extract_images.py ---- IT IS RECOMMENDED TO BE RUN THROUGH COMMAND LINE
def extract_images(dir_links):

    """
    Extract the images listed in a download-link table from the tar files in "roco_data".

    This function perform the following tasks (every path is relative to the
    current working directory):
    -- Extract images from the tar files
    -- Rename the images to their respective "Roco Names"
    -- Transfer the images to the "images" folder
    -- Delete all temporary directories used for the image extractions

    Parameters
    ----------
    dir_links : str
        Path of a tab-separated file without header row whose three columns are
        the ROCO name, the wget command containing the archive URL, and the
        image file name.

    Returns
    -------
    None
        The result is the set of "<ROCO name>.jpg" files copied into "images".

    Notes
    -----
    * Rows are matched to tar files by PMC id and images are matched to ROCO
      names through their table row, so the result does not depend on the
      order in which the file system lists files. Every listed image of an
      article is extracted, also when several rows share one tar file.
    * Rows whose tar file is not present in "roco_data" are skipped.
    * A tar file from which none of its listed images could be extracted is
      deleted from "roco_data".
    * The temporary folders "extract_tarfiles" and "images_folder2" are removed
      even when the function fails part-way.
    """

    ##prepare the dataframe that contains the pmc_id column to be used to tar extraction
    df = _load_download_table(dir_links)

    ##get the roco_data -- tar files, keyed by their name without the ".tar..." suffix
    roco_data = sorted(os.listdir('roco_data'))
    tar_by_pmc = {re.sub(r'\.tar.*', '', x): x for x in roco_data}

    ##remove the rows whose tar file was not downloaded
    df = df[df['pmc_id'].isin(list(tar_by_pmc))].reset_index(drop=True)

    folder_list = ['extract_tarfiles', 'images_folder2']
    try:
        ##extract_tarfiles dir created (a leftover from an interrupted run is reused)
        os.makedirs('extract_tarfiles', exist_ok=True)

        extracted = []  ##archive members ("<pmc_id>/<file name>") that were extracted
        idx_list = []   ##tar files that do not contain images to be extracted
        ##open every tar file once and extract all of its listed images
        for pmc_id, rows in df.groupby('pmc_id', sort=True):
            x = tar_by_pmc[pmc_id]
            done = _extract_listed_images(x, rows['img'].tolist())
            if done:
                extracted.extend(done)
            else:
                idx_list.append(x)

        ##drop the rows whose image could not be extracted
        df = df[df['img'].isin(extracted)].reset_index(drop=True)

        ##delete tar files in roco_data dir that do not contain images to be extracted
        for x in idx_list:
            print('tar file to delete: {0}'.format(x))
            os.unlink('roco_data/' + x)

        print('\n Amount of files, where images were successfully extracted from {0}'.format(df['pmc_id'].nunique()))

        ##load the extracted images into a specific folder -- "images_folder2",
        ##giving each one its "ROCO" name taken from the same table row
        os.makedirs('images_folder2', exist_ok=True)
        staged = []
        for roco_name, image in zip(df[0], df['img']):
            print(image)
            new_name = roco_name + '.jpg'
            shutil.copy(os.path.join('extract_tarfiles', image),
                        os.path.join('images_folder2', new_name))
            staged.append(new_name)
        print('\n Finished loading images into images_folder2 \n')
        print('\n Amount of images successfully extracted == {0} \n'.format(len(staged)))

        ##check if the images folder already exist...if not create a new one
        os.makedirs('images', exist_ok=True)

        ###Finally,  lets copy the images to the original "images" folder ----
        print('\n Copying the new extracted images into the "images" folder \n')
        for new_name in staged:
            imgs = os.path.join('images_folder2/', new_name)
            shutil.copy(imgs, 'images')
            print(imgs)

        img_list = os.listdir('images')
        print('\n Amount of images in the images folder: {0} \n'.format(len(img_list)))
    finally:
        ###delete the previous/initial files and folders created  --- they consume a large space
        _remove_temp_dirs(folder_list)


def _load_download_table(dir_links):
    """Read the download table and add the url_list, pmc_id and img columns, sorted by pmc_id."""
    df = pd.read_csv(dir_links, delimiter='\t', header=None)
    ##third token of the "wget -r <url>" command is the archive url
    df['url_list'] = df[1].apply(lambda x: x.split()[2])
    ##archive file name without its last 7 characters (assumed to be the ".tar.gz" suffix)
    df['pmc_id'] = df['url_list'].apply(lambda x: x.split('/')[-1][:-7])
    df['img'] = df['pmc_id'] + '/' + df[2]
    return df.sort_values(by='pmc_id').reset_index(drop=True)


def _extract_listed_images(x, images):
    """Extract the members `images` of tar file `x` (in "roco_data") into "extract_tarfiles".

    Returns the list of members that were extracted; failures are reported and skipped.
    """
    done = []
    try:
        ##the with-block closes the archive even when an extraction fails
        with tarfile.open(os.path.join('./roco_data', x)) as archive_tarfile:
            for image in images:
                try:
                    archive_tarfile.extract(image, 'extract_tarfiles')
                except Exception:
                    ##e.g. KeyError when the member is not in the archive
                    print('Could not extract images from {0}...'.format(x))
                    continue
                print(image, x)
                done.append(image)
    except Exception:
        ##the archive itself could not be opened or read
        print('Could not extract images from {0}...'.format(x))
    return done


def _remove_temp_dirs(folder_list):
    """Delete the temporary folders in `folder_list`, reporting the outcome for each one."""
    for item in folder_list:
        try:
            shutil.rmtree(item)
        except FileNotFoundError:
            print('{0} already deleted'.format(item))
        except OSError as err:
            print('Could not delete {0}: {1}'.format(item, err))
        else:
            print('Deleted {0}'.format(item))

In [14]:
## Validation download table: tab-separated, no header row (columns are labelled 0, 1, 2)
df_val = pd.read_csv(val_download_links, sep='\t', header=None)

In [15]:
# Preview the first rows of the raw validation table
df_val.head()

Unnamed: 0,0,1,2
0,ROCO_00020,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,CRIONM2014-931546.003.jpg
1,ROCO_00027,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,cios-1-176-g005.jpg
2,ROCO_00059,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,poljradiol-78-3-35-g001.jpg
3,ROCO_00062,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,p147_fig4a.jpg
4,ROCO_00068,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,CRIGM2017-1710501.002.jpg


In [16]:
# (number of rows, number of columns) of the validation table
df_val.shape

(8180, 3)

In [17]:
# Count the missing values in each column
df_val.isnull().sum()

0    0
1    0
2    0
dtype: int64

In [18]:
# Column 1 holds a "wget -r <url>" command; its third whitespace-separated token is the archive URL
df_val['url_list'] = [command.split()[2] for command in df_val[1]]
# Last path component of the URL minus its final 7 characters (presumably the ".tar.gz" suffix) is the PMC id
df_val['pmc_id'] = [url.split('/')[-1][:-7] for url in df_val['url_list']]
# "<pmc_id>/<image file name>"
df_val['img'] = df_val['pmc_id'] + '/' + df_val[2]

In [19]:
# Order the rows by PMC id
df_val = df_val.sort_values(by='pmc_id')

In [20]:
# Renumber the rows 0..n-1; the previous index is kept as a new "index" column
df_val = df_val.reset_index()

In [21]:
# Discard the "index" column that holds the pre-sort row numbers
df_val = df_val.drop(columns=['index'])

In [22]:
# Preview the validation table after sorting and adding the url_list / pmc_id / img columns
df_val.head()

Unnamed: 0,0,1,2,url_list,pmc_id,img
0,ROCO_70277,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,cc2926-2.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1065025,PMC1065025/cc2926-2.jpg
1,ROCO_39258,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,1471-2296-3-6-2.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC107839,PMC107839/1471-2296-3-6-2.jpg
2,ROCO_49332,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,pmed.0020079.g002.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1087206,PMC1087206/pmed.0020079.g002.jpg
3,ROCO_65330,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,pmed.0020154.g005.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1160569,PMC1160569/pmed.0020154.g005.jpg
4,ROCO_50186,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,1471-2334-5-42-3.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1174869,PMC1174869/1471-2334-5-42-3.jpg


In [21]:
# Write the validation archive URLs to url_list.txt, one URL per line.
# NOTE(review): the test-split cell further down writes to the same file name; if both
# run from the same working directory this file is overwritten -- confirm the intended cwd.
with open('url_list.txt', 'w') as url_file:
    url_file.writelines(
        "{}\n".format(url) for url in df_val['url_list'].values.tolist()
    )

### --- Get the url_links for the test dataset

In [137]:
###os.chdir('/home/ec2-user/SageMaker')

In [23]:
# Test download table: tab-separated, no header row (columns are labelled 0, 1, 2)
df_test = pd.read_csv(test_download_links, sep='\t', header=None)

In [24]:
# Preview the first rows of the raw test table
df_test.head()

Unnamed: 0,0,1,2
0,ROCO_00001,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,cro-0008-0385-g01.jpg
1,ROCO_00006,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,ol-11-05-3298-g02.jpg
2,ROCO_00016,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,cureus-0009-00000001639-i01.jpg
3,ROCO_00025,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,EJD-10-188-g001.jpg
4,ROCO_00031,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,PWKI-10-23517-g001.jpg


In [25]:
# (number of rows, number of columns) of the test table
df_test.shape

(8179, 3)

In [26]:
## Check for null values: count the missing entries in each column
df_test.isnull().sum()

0    0
1    0
2    0
dtype: int64

In [27]:
# Column 1 holds a "wget -r <url>" command; its third whitespace-separated token is the archive URL
df_test['url_list'] = [command.split()[2] for command in df_test[1]]
# Last path component of the URL minus its final 7 characters (presumably the ".tar.gz" suffix) is the PMC id
df_test['pmc_id'] = [url.split('/')[-1][:-7] for url in df_test['url_list']]
# "<pmc_id>/<image file name>"
df_test['img'] = df_test['pmc_id'] + '/' + df_test[2]

In [28]:
# Order the rows by PMC id
df_test = df_test.sort_values(by='pmc_id')

In [29]:
# Renumber the rows 0..n-1; the previous index is kept as a new "index" column
df_test = df_test.reset_index()

In [30]:
# Discard the "index" column that holds the pre-sort row numbers
df_test = df_test.drop(columns=['index'])

In [31]:
# Preview the test table after sorting and adding the url_list / pmc_id / img columns
df_test.head()

Unnamed: 0,0,1,2,url_list,pmc_id,img
0,ROCO_78739,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,1471-2296-3-6-3.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC107839,PMC107839/1471-2296-3-6-3.jpg
1,ROCO_66766,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,1477-7819-3-19-2.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1087895,PMC1087895/1477-7819-3-19-2.jpg
2,ROCO_23290,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,1477-7819-3-29-2.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1156959,PMC1156959/1477-7819-3-29-2.jpg
3,ROCO_59505,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,1471-2334-5-42-2.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1174869,PMC1174869/1471-2334-5-42-2.jpg
4,ROCO_81227,wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_...,1472-6815-5-4-3.jpg,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/...,PMC1180430,PMC1180430/1472-6815-5-4-3.jpg


In [87]:
# Write the test archive URLs to url_list.txt, one URL per line.
# NOTE(review): the validation cell above writes to the same file name; if both run
# from the same working directory that file is overwritten here -- confirm the intended cwd.
with open('url_list.txt', 'w') as url_file:
    url_file.writelines(
        "{}\n".format(url) for url in df_test['url_list'].values.tolist()
    )