# ETD Conversions - Color

###### To Check Cheat Sheet
  
If more than one variable, the total numbers must match expected values

- BitsPerSample: 3 - DONE
- ColorSpace: 1 - DONE 
- Compression: 2 - ERROR, 96 bitonal need group 4, FIXED 8/2/18
- Image Height + Width: 2-4, - ERROR, incorrect dimensions, FIXED 7/31/18
- X & Y Resolution: 2 - DONE
- Filename: check for anomalies to fix - ERROR, incorrect names, FIXED 7/31/18
- ICCProfile: no Dot Gain 20% - ERROR, 286 with Dot Gain 20%, FIXED 8/2/18

Also check
- that 4400 and 6600 width are rotated correctly - DONE
- that all directories have a .mrc file - DONE
- pagination - DONE
- for 400 ppi bitonal and 600 ppi gray or color files - DONE
- manual check of all gray and color files - PROBLEM - bad resolution of grayscale on Christenson_Christopher_2004_93-95, 98,99, moved to problems and recorded in spreadsheet
- file size match - DONE

In [1]:
current_project = '2020-Batch1-2_problems_HW-working-files/0.Complete'

In [2]:
# == display 100% width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [28]:
# == Options and Importing

# == display 100% width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# ===================================================================== IMPORT
# built-in
import logging
from math import isnan
from pathlib import Path
import shutil

# 3rd party
import cv2
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

# jeremy's
import img_qc.img_qc as img_qc

# ===================================================================== DIRECTORIES

# set current directory
current_directory = Path.cwd()

# in case we get fancy and change our current directory later
start_directory = current_directory

# set home directory
home_directory = Path.home()

# set etd directory
data_harddrive = Path('C:/00_DigitalImaging/ETD_quality-control/')

# set quality control directory as hard-coded: Path.cwd()/data/demo-images_no-crop/{current_project}
qc_directory = data_harddrive.joinpath(current_project)

# ===================================================================== OPTIONS

# MatPlotLib options
# parameters for matplotlib to increase our default figure size -- NOTE: figure sizes are in INCHES
plt.rcParams["figure.figsize"] = (20, 20)  # set as needed for your screen and eyes

# on a high-dpi monitor this will increase the quality of plots on-screen
# %config InlineBackend.figure_format = 'retina'

# Pandas options
pd.set_option('display.max_colwidth', -1)  # won't truncate output such as Path names

# Logging
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)

logging.info(f'QC Directory: {qc_directory}')

# ===================================================================== Set CSV info

# set CSV filename
csv_name = current_project + '.csv'

# set CSV path
csv_path = qc_directory.joinpath(csv_name)

# ===================================================================== LOAD IMAGE METADATA
if csv_path.is_file():
    logging.info(f'{csv_path} exists')
    logging.info(f'loading metadata from {csv_name}')
    
    # create DataFrame from CSV
    images_df = pd.read_csv(csv_path)

else:
    logging.info(f'{csv_name} does NOT exist')
    logging.info(f'loading metdata from {qc_directory}')
    
    # create DataFrame with Exiftools
    images_df = img_qc.get_images_df(str(qc_directory), 'tif')
    
    # write metadata to CSV file
    images_df.to_csv(csv_path)
    if csv_path.is_file():
        print(f'{csv_path} NOW EXISTS')

2020-12-18 18:53:00,596 - root - INFO - QC Directory: C:\00_DigitalImaging\ETD_quality-control\2020-Batch1-2_problems_HW-working-files\0.Complete
2020-12-18 18:53:00,597 - root - INFO - 2020-Batch1-2_problems_HW-working-files/0.Complete.csv does NOT exist
2020-12-18 18:53:00,598 - root - INFO - loading metdata from C:\00_DigitalImaging\ETD_quality-control\2020-Batch1-2_problems_HW-working-files\0.Complete
2020-12-18 18:54:02,406 - root - INFO - directory: C:\00_DigitalImaging\ETD_quality-control\2020-Batch1-2_problems_HW-working-files\0.Complete
2020-12-18 18:54:02,407 - root - INFO - number of images: 5235


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\00_DigitalImaging\\ETD_quality-control\\2020-Batch1-2_problems_HW-working-files\\0.Complete\\2020-Batch1-2_problems_HW-working-files\\0.Complete.csv'

In [29]:
# === DATAFRAME DROPDOWNS===
df_columns = list(images_df.columns.values)

def print_column(value, column):
    value = str(value)
    column = str(column)
    if value == 'SELECT A VALUE BELOW':
        pass
    else:
        print(f"# of images in {column_widget.kwargs['column']} == {value_widget.kwargs['value']}")
        print(len(images_df[images_df[column_widget.kwargs['column']]==value_widget.kwargs['value']]))
        new_dataframe = images_df[images_df[column_widget.kwargs['column']]==value_widget.kwargs['value']]
        return new_dataframe    
    
def select_column(column):
    column_values_list = []
    if column != 'SELECT A COLUMN':
        
        for column_value in images_df[column].unique():
            # check if it's a string
            if isinstance(column_value, str):
                # add to values list
                column_values_list.append(column_value)
            # skip it if it's a nan value
            # nan, or null, from Pandas breaks in the widgets as it's not
            # actually selectable because there's nothing there
            # Can rule it out by testing with the function math.isnan(x)
            elif isnan(column_value):
                pass
            # add whatever else there is
            else:
                column_values_list.append(column_value)
    # sort the list
    # print(column_values_list)
    print(f'# of Unique Values: {len(column_values_list)}')
    try:
        column_values_list = sorted(column_values_list)
    except TypeError:
        print('')
        print('DID NOT SORT VALUES -- MIXTURE OF STRING AND INTEGER IN COLUMN VALUES')
    column_values_list.insert(0, 'SELECT A VALUE BELOW')
    select2.options = column_values_list   

    
select1 = widgets.Dropdown(options=df_columns)
initial_value = 'SourceFile'
select2 = widgets.Dropdown(options=images_df[initial_value])
column_widget = widgets.interactive(select_column, column=select1)
value_widget = widgets.interactive(print_column, value=select2, column=select1)
display(column_widget)
display(value_widget)

# to access the column, use : column_widget.kwargs['column']
# to access the value, use: value_widget.kwargs['value']
# to access the dataframe where column == value: value_widget.result

interactive(children=(Dropdown(description='column', options=('SourceFile', 'ExifTool:ExifToolVersion', 'File:…

interactive(children=(Dropdown(description='value', options=('SELECT A VALUE BELOW', 'C:/00_DigitalImaging/ETD…

In [12]:
directory_list = sorted(ppi_400_color_df['File:Directory'].unique())

In [13]:
directory_list

['e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006',
 'e:/Batch2_2.toQC-Color_2/Coats_Christine_1997/Coats_Christine_1997',
 'e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012',
 'e:/Batch2_2.toQC-Color_2/Cody_Melissa_2002/Cody_Melissa_2002',
 'e:/Batch2_2.toQC-Color_2/Cohen_Katrina_2008/Cohen_Katrina_2008',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005',
 'e:/Batch2_2.toQC-Color_2/Collins_Margaret_2009/Collins_Margaret_2009',
 'e:/Batch2_2.toQC-Color_2/Colvin_Corrie_2007/Colvin_Corrie_2007',
 'e:/Batch2_2.toQC-Color_2/Connolly_Michael_2007/Connolly_Michael_2007',
 'e:/Batch2_2.toQC-Color_2/Contreras_Guadalupe_2001/Contreras_Guadalupe_2001',
 'e:/Batch2_2.toQC-Color_2/Conyers_Mindy_2003/Conyers_Mindy_2003',
 'e:/Batch2_2.toQC-Color_2/Cooke_Maria_2012/Cooke_Maria_2012',
 'e:/Batch2_2.toQC-Color_2/Cooksey_Matthew_2007/Cooksey_Matthew_2007',
 'e:/Batch2_2.toQC-Color_2/Copeland_Chad_2005/Copeland_Chad_2005',
 'e:/Batch2_2.toQC-Color_2/Crittenden_Elizab

In [5]:
# dataframe where XResolution == 400
ppi_400_df = images_df[images_df['EXIF:XResolution'] == 400]
                       
# dataframe where bits per sample is gray
ppi_400_gray_df = ppi_400_df[ppi_400_df['EXIF:BitsPerSample'] == '8']

In [6]:
ppi_400_color_df = ppi_400_df[ppi_400_df['EXIF:BitsPerSample'] == '8 8 8']

In [7]:
# === ppi_400_color_df DATAFRAME DROPDOWNS===
df_columns = list(ppi_400_color_df.columns.values)

def print_column(value, column):
    value = str(value)
    column = str(column)
    if value == 'SELECT A VALUE BELOW':
        pass
    else:
        print(f"# of images in {column_widget.kwargs['column']} == {value_widget.kwargs['value']}")
        print(len(ppi_400_color_df[ppi_400_color_df[column_widget.kwargs['column']]==value_widget.kwargs['value']]))
        new_dataframe = ppi_400_color_df[ppi_400_color_df[column_widget.kwargs['column']]==value_widget.kwargs['value']]
        return new_dataframe    
    
    
def select_column(column):
    column_values_list = []
    if column != 'SELECT A COLUMN':
        
        for column_value in ppi_400_color_df[column].unique():
            # check if it's a string
            if isinstance(column_value, str):
                # add to values list
                column_values_list.append(column_value)
            # skip it if it's a nan value
            # nan, or null, from Pandas breaks in the widgets as it's not
            # actually selectable because there's nothing there
            # Can rule it out by testing with the function math.isnan(x)
            elif isnan(column_value):
                pass
            # add whatever else there is
            else:
                column_values_list.append(column_value)
    # sort the list
    # print(column_values_list)
    print(f'# of Unique Values: {len(column_values_list)}')
    try:
        column_values_list = sorted(column_values_list)
    except TypeError:
        print('')
        print('DID NOT SORT VALUES -- MIXTURE OF STRING AND INTEGER IN COLUMN VALUES')
    column_values_list.insert(0, 'SELECT A VALUE BELOW')
    select2.options = column_values_list   

    
select1 = widgets.Dropdown(options=df_columns)
initial_value = 'SourceFile'
select2 = widgets.Dropdown(options=ppi_400_color_df[initial_value])
column_widget = widgets.interactive(select_column, column=select1)
value_widget = widgets.interactive(print_column, value=select2, column=select1)
display(column_widget)
display(value_widget)

# to access the column, use : column_widget.kwargs['column']
# to access the value, use: value_widget.kwargs['value']
# to access the dataframe where column == value: value_widget.result
# === DATAFRAME DROPDOWNS===


interactive(children=(Dropdown(description='column', options=('Composite:ImageSize', 'Composite:Megapixels', '…

interactive(children=(Dropdown(description='value', options=('SELECT A VALUE BELOW', '3400x4400', '4400x3400')…

In [5]:
# create a list template
show_color_list = []

# creates a list of images that fit requirements of one of my list options above
# only run this widget when the correct values are selected in the dataframe above
# these lists are below the dataframe because they don't need multiple selections
for image_path in value_widget.result['SourceFile']:

    if image_path not in show_color_list:
        show_color_list.append(image_path)

In [6]:
show_color_list

['e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0019.tif',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0020.tif',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0021.tif',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0022.tif',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0023.tif',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0024.tif',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0026.tif',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0027.tif',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006/Clanton_Greg_2006_0031.tif',
 'e:/Batch2_2.toQC-Color_2/Coats_Christine_1997/Coats_Christine_1997/Coats_Christine_1997_0038.tif',
 'e:/Batch2_2.toQC-Color_2/Coats_Christine_1997/Coats_Christine_1997/Co

In [8]:
3651+257+454

4362

In [9]:
257+454

711

In [49]:
names_list = []

for filefilename in filefilename_list:
    if filefilename not in names_list:
        names_list.append(filefilename)
    else:
        print(filefilename)

In [50]:
names_list

[]

## ------- Math for quickly checking numbers -------

In [6]:
# first three are checking bit depth
# mismatch is in unique names, several directories have incorrect naming
# mismatch here, FileNames lists 4593, Unamed:0 is 4622, 4622 would be accurate.  Potential unnamed files?
4494-3743

751

In [20]:
# color + gray
291 + 460

751

In [19]:
# compression check
847 + 3647

4494

In [18]:
# how many bitonal need group 4 compression
847 - 751

96

## ------- Horizontal Page Rotation Check -------

In [5]:
width_4400_df = value_widget.result

### ask Jeremy why this works tomorrow 6/19/18

In [9]:
height_3400_df = value_widget.result

In [10]:
# https://stackoverflow.com/questions/26921943/pandas-intersection-of-two-data-frames-based-on-column-entries/26921975
image_list = pd.merge(width_3400_df, height_4400_df, how='inner', on=['SourceFile'])

NameError: name 'width_3400_df' is not defined

In [None]:
if len(image_list) == len(width_4400_df) and len(image_list) == len(height_3400_df):
    print('All images accounted for')

In [11]:
len(height_3400_df)

32

In [None]:
height_5100_df = value_widget.result

In [12]:
for image_path in height_3400_df['SourceFile']:
    path = Path(image_path)
    print(image_path)

e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0057.tif
e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0058.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0125.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0130.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0134.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0135.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0138.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0140.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0141.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0142.tif
e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012/Coder_Shelby_2012_0147.tif
e:/Batch2_2.toQC-Color_2/Colvin_Corrie_2007/Colvin

###########################################################

## ------- Dataframes -------

 The dataframes below separate bitonal, grayscale and color so they can be checked individually as needed.

To find and replace a dataframe below, select the cell but don't enter into it and click F to activate find and replace.  So ppi_400_color_df can be changed quickly to ppi_400_gray_df

In [7]:
# sorted list of SourceFile paths
file_list = sorted(list(images_df[images_df['EXIF:XResolution'] == 600]['SourceFile']))

# dataframe where XResolution == 600
ppi_600_df = images_df[images_df['EXIF:XResolution'] == 600]

In [8]:
# dataframe where XResolution == 400
ppi_400_df = images_df[images_df['EXIF:XResolution'] == 400]
                       
# dataframe where bits per sample is color
ppi_400_color_df = ppi_400_df[ppi_400_df['EXIF:BitsPerSample'] == '8 8 8']

In [9]:
# dataframe where XResolution == 400
ppi_400_df = images_df[images_df['EXIF:XResolution'] == 400]
                       
# dataframe where bits per sample is gray
ppi_400_gray_df = ppi_400_df[ppi_400_df['EXIF:BitsPerSample'] == '8']

In [10]:
# this listmust be called before the dataframe below in order to add multiple selections, otherwise will erase previous additions
# use this to fix height and width
images_to_fix_list = []

In [11]:
# dataframe where XResolution == 400
gray_gamma_df = images_df[images_df['XMP:ICCProfileName'] == 'Gray Gamma 2.2']

In [24]:
# === ppi_400_color_df DATAFRAME DROPDOWNS===
df_columns = list(ppi_400_color_df.columns.values)

def print_column(value, column):
    value = str(value)
    column = str(column)
    if value == 'SELECT A VALUE BELOW':
        pass
    else:
        print(f"# of images in {column_widget.kwargs['column']} == {value_widget.kwargs['value']}")
        print(len(ppi_400_color_df[ppi_400_color_df[column_widget.kwargs['column']]==value_widget.kwargs['value']]))
        new_dataframe = ppi_400_color_df[ppi_400_color_df[column_widget.kwargs['column']]==value_widget.kwargs['value']]
        return new_dataframe    
    
    
def select_column(column):
    column_values_list = []
    if column != 'SELECT A COLUMN':
        
        for column_value in ppi_400_color_df[column].unique():
            # check if it's a string
            if isinstance(column_value, str):
                # add to values list
                column_values_list.append(column_value)
            # skip it if it's a nan value
            # nan, or null, from Pandas breaks in the widgets as it's not
            # actually selectable because there's nothing there
            # Can rule it out by testing with the function math.isnan(x)
            elif isnan(column_value):
                pass
            # add whatever else there is
            else:
                column_values_list.append(column_value)
    # sort the list
    # print(column_values_list)
    print(f'# of Unique Values: {len(column_values_list)}')
    try:
        column_values_list = sorted(column_values_list)
    except TypeError:
        print('')
        print('DID NOT SORT VALUES -- MIXTURE OF STRING AND INTEGER IN COLUMN VALUES')
    column_values_list.insert(0, 'SELECT A VALUE BELOW')
    select2.options = column_values_list   

    
select1 = widgets.Dropdown(options=df_columns)
initial_value = 'SourceFile'
select2 = widgets.Dropdown(options=ppi_400_color_df[initial_value])
column_widget = widgets.interactive(select_column, column=select1)
value_widget = widgets.interactive(print_column, value=select2, column=select1)
display(column_widget)
display(value_widget)

# to access the column, use : column_widget.kwargs['column']
# to access the value, use: value_widget.kwargs['value']
# to access the dataframe where column == value: value_widget.result
# === DATAFRAME DROPDOWNS===


interactive(children=(Dropdown(description='column', options=('Unnamed: 0', 'Composite:ImageSize', 'Composite:…

interactive(children=(Dropdown(description='value', options=('SELECT A VALUE BELOW', 288, 289, 290, 291, 292, …

## ------- A List of Lists -------

Things I've used before and may want to again, plus a template

In [8]:
group_5_compression = []

In [5]:
# Use this to find files with an ICC profile of Dot Gain 20%
dot_gain_list = []

In [None]:
# Use this to find files that are 400 ppi and bitonal
bitonal_400_list = []

In [None]:
# Use this to find files that should be group 4 compression
uncompressed_bitonal_list = []

In [25]:
# Use this to find files that are grayscale to manually check
move_gray_list = []

# Use this to find files that are color to manually check
move_color_list = []

In [14]:
#list of grayscale images
gray_images_list = [filename for filename in ppi_400_gray_df['SourceFile']]
gray_images_list

['e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0050.tif',
 'e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0051.tif',
 'e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0052.tif',
 'e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0054.tif',
 'e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0056.tif',
 'e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0057.tif',
 'e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003/Chhopel_G_2003_0058.tif',
 'e:/Batch2_2.toQC-Color_2/Chiroro_Dominic_2012/Chiroro_Dominic_2012/Chiroro_Dominic_2012_0016.tif',
 'e:/Batch2_2.toQC-Color_2/Chiroro_Dominic_2012/Chiroro_Dominic_2012/Chiroro_Dominic_2012_0017.tif',
 'e:/Batch2_2.toQC-Color_2/Chiroro_Dominic_2012/Chiroro_Dominic_2012/Chiroro_Dominic_2012_0019.tif',
 'e:/Batch2_2.toQC-Color_2/Chiroro_Dominic_2012/Chiroro_Dominic_2012/Chiroro_Dominic_2012_0020.tif',
 'e:/Batch2_2.t

In [25]:
what_is_not_gray_gamma_list = [filename for filename in gray_images_list if filename not in gray_gamma_list]
what_is_not_gray_gamma_list

NameError: name 'gray_images_list' is not defined

In [27]:
images_uncompressed_bitonal_list

['e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0001.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0002.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0003.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0004.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0005.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0006.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0007.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0008.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0009.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0010.tif',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0011.tif',
 'e:/Batch2_2.toQC-Co

In [9]:
# create a list template
show_color_list = []

# creates a list of images that fit requirements of one of my list options above
# only run this widget when the correct values are selected in the dataframe above
# these lists are below the dataframe because they don't need multiple selections
for image_path in value_widget.result['SourceFile']:

    if image_path not in show_color_list:
        show_color_list.append(image_path)

In [10]:
show_color_list

['e:/Batch2_2.toQC-Color_2/Denena_Melissa_2002/Denena_Melissa_2002/Denena_Melissa_2002_0012.tif']

In [28]:
len(move_color_list)

460

####################################################################################################

## ------- Tools -------

In [None]:
# This resizes 400 ppi images to the correct height and weight and saves them
# Check orientation before running this code
for image_path in images_to_fix_list:
    path = Path(image_path)
    
    print(f'opening {path.name}')
    
    image = Image.open(image_path)
    
    image = image.crop(box=(0, 0, 4400, 3400))
    image.save(image_path, compression='None', dpi=(400.0, 400.0))

In [31]:
#saves the images in the list with group 4 compression at 600 ppi
for image_path in images_uncompressed_bitonal_list:
    path = Path(image_path)
    print(image_path)
    
    image = Image.open(image_path)
    image = image.resize((5100, 6600), resample=Image.LANCZOS)
    print(f'saving {path.name}')
    image.save(image_path, compression='group4', dpi=(600.0, 600.0))

e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0001.tif
saving Coker_Linda_2005_0001.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0002.tif
saving Coker_Linda_2005_0002.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0003.tif
saving Coker_Linda_2005_0003.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0004.tif
saving Coker_Linda_2005_0004.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0005.tif
saving Coker_Linda_2005_0005.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0006.tif
saving Coker_Linda_2005_0006.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0007.tif
saving Coker_Linda_2005_0007.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0008.tif
saving Coker_Linda_2005_0008.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_20

e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0077.tif
saving Coker_Linda_2005_0077.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0078.tif
saving Coker_Linda_2005_0078.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0079.tif
saving Coker_Linda_2005_0079.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0080.tif
saving Coker_Linda_2005_0080.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0081.tif
saving Coker_Linda_2005_0081.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0082.tif
saving Coker_Linda_2005_0082.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0083.tif
saving Coker_Linda_2005_0083.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005/Coker_Linda_2005_0086.tif
saving Coker_Linda_2005_0086.tif
e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_20

In [None]:
len(bitonal_400_list)

In [None]:
# creates a list of images where height and width are NOT the expected 3400 or 4400
# run this widget when EXIF height or width values are selected in a ppi_400_gray_df dataframe above
for image_path in value_widget.result['SourceFile']:
    #image = Image.open(image_path)
    #plt.figure()
    #plt.title(image_path)
    #plt.imshow(image)
    
    if image_path not in images_to_fix_list:
        images_to_fix_list.append(image_path)

In [17]:
# lets me know if if the number of files in my list match
len(bitonal_400_list), len(bitonal_400_list_copied)

NameError: name 'bitonal_400_list_copied' is not defined

In [None]:
# crops an image to the correct dimensions and saves at the correct resolution
# important to remember, the crop is from the upper right corner, not evenly on both sides.
# use another method for cropping more than a few pixels, bitonal method is above
for image_path in images_to_fix_list:
    path = Path(image_path)
    
    print(f'opening {path.name}')
    
    image = Image.open(image_path)
    
    image = image.crop(box=(0, 0, 4400, 3400))
    image.save(image_path, compression='None', dpi=(400.0, 400.0))

In [None]:
# moves pdfs to separate folder
pdf_directory = 'e:Batch2_2.toQC-Color_1/000_pdfs/'
pdf_list = list(qc_directory.glob('**/*.pdf'))
for pdf in pdf_list:
    shutil.move(str(pdf), pdf_directory)

In [29]:
out_path = 'e:/Batch2_2.toQC-Color_2/0_to_fix/'

In [30]:
check = Path(out_path)
check

WindowsPath('e:/Batch2_2.toQC-Color_2/0_to_fix')

In [31]:
# don't run again -- copies images!
for image in move_color_list:
    shutil.copy(image, out_path)

In [None]:
# find bitonal images that are 400ppi
ppi_400_bitonal_df = ppi_400_df[ppi_400_df['EXIF:BitsPerSample'] == 1]

# list unique directories with 400 ppi bitonal
ppi_400_bitonal_df['File:Directory'].unique()

In [5]:
# okay, looking at this from a QC perspective:
# letter-size 8-bit 400ppi color: 42mb < x < 45mb
# letter-size 8-bit 400ppi grayscale: 13mb < x < 16mb
# letter-size 1-bit 600ppi bitonal: 0b < x < 1mb

# therefore
assume_bitonal_df = images_df[images_df['File:FileSize'] < 1048576 ]
# grayscale and color; just do the range as 2 processes because I know it works
assume_gray_df = images_df[images_df['File:FileSize'] > 13631488]
assume_gray_df = assume_gray_df[assume_gray_df['File:FileSize'] < 16777216]
assume_color_df = images_df[images_df['File:FileSize'] > 44040192]
assume_color_df = assume_color_df[assume_color_df['File:FileSize'] < 47185920]

total_assumed_bitonal = len(assume_bitonal_df)
total_assumed_gray = len(assume_gray_df)
total_assumed_color = len(assume_color_df)
total_assumed = total_assumed_bitonal + total_assumed_color + total_assumed_gray

print(f"""assume bitonal (0-1mb): {total_assumed_bitonal}
assume gray (13-16mb): {total_assumed_gray}
assume color (42-45mb): {total_assumed_color}
*************************
total assumed: {total_assumed}
total images: {len(images_df)}
*************************
images to classify: {len(images_df) - total_assumed}""")

assume bitonal (0-1mb): 3743
assume gray (13-16mb): 293
assume color (42-45mb): 458
*************************
total assumed: 4494
total images: 4494
*************************
images to classify: 0


#####################################################################################################

## To fix file naming errors


In [6]:
# dropdown to see purpose
filenaming_error_list = []
def check_directory_for_first_tiff(directory,
                                   with_extension,
                                   zeropad=4,
                                  ):
    """-- Purpose --
    Check that directory contains 1 tif per number in the form:
    <directory_name>_0001.tif
    <directory_name>_nnnn.tif
    
    -- Arguments --
    in_directory: the directory to check for one image per name
    with_extensions: takes a list of extensions; must be 1+
    zeropad: number of digits to zeropad
    
    -- Returns -- 
    TODO"""
    
    # get Path to directory
    directory_to_check = Path(directory)
    
    if directory_to_check.is_dir():
    
        # get directory name by indexing the last part of the Path before the name
        directory_name = directory_to_check.parts[-1]
    
        # Log the directory name
        logging.info(f'... checking {directory_name} . . .')
        
        # get formatted extension
        extension = img_qc.get_formatted_extension(with_extension)
        
        # create file name in form <directory_name>_<zeropad><n>.suffix
        file_name_to_test = directory_name + '_' + str(1).zfill(zeropad) + extension
        
        # create file path
        file_path_to_test = Path(directory).joinpath(file_name_to_test)
    
        # create list of all files in directory
        directory_contents_list = list(directory_to_check.glob('**/*'))
    
        if file_path_to_test.is_file():
            # can process image futher here if necessary
            pass
        else:
            # log error 
            logging.error(f'Filenaming ERROR in {directory}')
            # MAJOR ERROR: extra/missing files in the folder with correct extension
            # this could be where we move this folder into a new location
            # or create/add to a problems.txt for tracking
            filenaming_error_list.append(directory)
    
    # else we error
    else:
        logging.error(f'NOT A DIRECTORY: {in_directory}')

In [7]:
directory_list = sorted(images_df['File:Directory'].unique())
for directory in directory_list:
    check_directory_for_first_tiff(directory, 'tif')

2018-08-29 12:10:22,281 - root - INFO - ... checking Chhopel_G_2003 . . .
2018-08-29 12:10:22,323 - root - INFO - ... checking Chiroro_Dominic_2012 . . .
2018-08-29 12:10:22,356 - root - INFO - ... checking Christenson_Christopher_2004 . . .
2018-08-29 12:10:22,457 - root - INFO - ... checking Clanton_Greg_2006 . . .
2018-08-29 12:10:22,478 - root - INFO - ... checking Coats_Christine_1997 . . .
2018-08-29 12:10:22,551 - root - INFO - ... checking Coder_Shelby_2012 . . .
2018-08-29 12:10:22,619 - root - INFO - ... checking Cody_Melissa_2002 . . .
2018-08-29 12:10:22,624 - root - INFO - ... checking Cohen_Katrina_2008 . . .
2018-08-29 12:10:22,645 - root - INFO - ... checking Coker_Linda_2005 . . .
2018-08-29 12:10:22,666 - root - INFO - ... checking Cole_Dennis_2003 . . .
2018-08-29 12:10:22,702 - root - INFO - ... checking Collins_Brian_1997 . . .
2018-08-29 12:10:22,756 - root - INFO - ... checking Collins_Margaret_2009 . . .
2018-08-29 12:10:22,774 - root - INFO - ... checking Colvi

In [8]:
filenaming_error_list

[]

In [None]:
def rename_directory(directory, extension, zeropad=4):
    
    # get Path to directory
    directory_to_rename = Path(directory)
    
    # get formatted extension
    extension = img_qc.get_formatted_extension(extension)
    
    if directory_to_rename.is_dir():
    
        # get directory name by indexing the last part of the Path before the name
        directory_name = directory_to_rename.parts[-1]
        
        print(directory_name)
        
        # create glob abbreviation
        files_to_glob = '*' + extension
        
        # get list of files in directory
        file_list = directory_to_rename.glob(files_to_glob)
        
        # sort file list
        file_list = sorted(file_list)
        
        # iterate over file list
        for index, file in enumerate(file_list, start=1):
            
            # set new filename
            new_filename = directory_name + '_' + str(index).zfill(zeropad*2) + extension
            
            # get old path
            old_path = Path(file)
            
            # set new path
            new_path = directory_to_rename.joinpath(new_filename)
            
            # rename files
            old_path.rename(new_path)
        
        # get list of files in directory
        file_list = directory_to_rename.glob(files_to_glob)
        
        # sort file list
        file_list = sorted(file_list)
            
        # iterate over file list
        for index, file in enumerate(file_list, start=1):
            
            # set new filename
            new_filename = directory_name + '_' + str(index).zfill(zeropad) + extension
            
            # get old path
            old_path = Path(file)
            
            # set new path
            new_path = directory_to_rename.joinpath(new_filename)
            
            # rename files
            old_path.rename(new_path)

In [None]:
for directory in filenaming_error_list:
    rename_directory(directory, 'tif')

#####################################################################################################

## MRC record and Pagination check

In [28]:
directories_list = sorted(images_df['File:Directory'].unique())
directories_list

['e:/Batch2_2.toQC-Color_2/Chhopel_G_2003/Chhopel_G_2003',
 'e:/Batch2_2.toQC-Color_2/Chiroro_Dominic_2012/Chiroro_Dominic_2012',
 'e:/Batch2_2.toQC-Color_2/Christenson_Christopher_2004/Christenson_Christopher_2004',
 'e:/Batch2_2.toQC-Color_2/Clanton_Greg_2006/Clanton_Greg_2006',
 'e:/Batch2_2.toQC-Color_2/Coats_Christine_1997/Coats_Christine_1997',
 'e:/Batch2_2.toQC-Color_2/Coder_Shelby_2012/Coder_Shelby_2012',
 'e:/Batch2_2.toQC-Color_2/Cody_Melissa_2002/Cody_Melissa_2002',
 'e:/Batch2_2.toQC-Color_2/Cohen_Katrina_2008/Cohen_Katrina_2008',
 'e:/Batch2_2.toQC-Color_2/Coker_Linda_2005/Coker_Linda_2005',
 'e:/Batch2_2.toQC-Color_2/Cole_Dennis_2003/Cole_Dennis_2003',
 'e:/Batch2_2.toQC-Color_2/Collins_Brian_1997/Collins_Brian_1997',
 'e:/Batch2_2.toQC-Color_2/Collins_Margaret_2009/Collins_Margaret_2009',
 'e:/Batch2_2.toQC-Color_2/Colvin_Corrie_2007/Colvin_Corrie_2007',
 'e:/Batch2_2.toQC-Color_2/Connell_Wesley_2009/Connell_Wesley_2009',
 'e:/Batch2_2.toQC-Color_2/Connolly_Michael_2007

In [29]:
for directory in directories_list:
    img_qc.check_directory_for_one_number_per_file(directory, '.tif')

2018-08-15 16:36:34,130 - root - INFO - ... checking Chhopel_G_2003 . . .
2018-08-15 16:36:34,137 - root - INFO - ... checking Chiroro_Dominic_2012 . . .
2018-08-15 16:36:34,143 - root - INFO - ... checking Christenson_Christopher_2004 . . .
2018-08-15 16:36:34,153 - root - INFO - ... checking Clanton_Greg_2006 . . .
2018-08-15 16:36:34,158 - root - INFO - ... checking Coats_Christine_1997 . . .
2018-08-15 16:36:34,170 - root - INFO - ... checking Coder_Shelby_2012 . . .
2018-08-15 16:36:34,181 - root - INFO - ... checking Cody_Melissa_2002 . . .
2018-08-15 16:36:34,188 - root - INFO - ... checking Cohen_Katrina_2008 . . .
2018-08-15 16:36:34,193 - root - INFO - ... checking Coker_Linda_2005 . . .
2018-08-15 16:36:34,201 - root - INFO - ... checking Cole_Dennis_2003 . . .
2018-08-15 16:36:34,215 - root - INFO - ... checking Collins_Brian_1997 . . .
2018-08-15 16:36:34,228 - root - INFO - ... checking Collins_Margaret_2009 . . .
2018-08-15 16:36:34,233 - root - INFO - ... checking Colvi

2018-08-15 16:36:34,366 - root - INFO - ... checking DeLeon_Robert_2010 . . .
2018-08-15 16:36:34,379 - root - INFO - ... checking DeSimone_Ivy_2001 . . .
2018-08-15 16:36:34,384 - root - INFO - ... checking Deal_Julia_2002 . . .
2018-08-15 16:36:34,392 - root - INFO - ... checking Dean_Roy_2002 . . .
2018-08-15 16:36:34,397 - root - INFO - ... checking Dekar_Aicha_2000 . . .
2018-08-15 16:36:34,403 - root - INFO - ... checking Denena_Melissa_2002 . . .
2018-08-15 16:36:34,409 - root - INFO - ... checking Dennis_Kristine_2003 . . .
2018-08-15 16:36:34,415 - root - INFO - ... checking DiMichele_Daniel_2010 . . .
2018-08-15 16:36:34,420 - root - INFO - ... checking Diaz_Peter_2010 . . .
2018-08-15 16:36:34,426 - root - INFO - ... checking Dickerson_Brian_2010 . . .
2018-08-15 16:36:34,433 - root - INFO - ... checking Dickerson_Candice_2008 . . .
2018-08-15 16:36:34,442 - root - INFO - ... checking Dix_Ross_2008 . . .
2018-08-15 16:36:34,448 - root - INFO - ... checking Dixson_Jamie_2001 

In [20]:
for directory in directories_list:
    directory_path = Path(directory)
    mrc_path = directory_path.joinpath(directory_path.name + '.mrc')
    if not mrc_path.is_file():
        print(f'directory {directory} missing mrc record')
    else:
        print('YAY!!')

YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!
YAY!!


In [30]:
# === DATAFRAME DROPDOWNS===
df_columns = list(images_df.columns.values)

def print_column(value, column):
    value = str(value)
    column = str(column)
    if value == 'SELECT A VALUE BELOW':
        pass
    else:
        print(f"# of images in {column_widget.kwargs['column']} == {value_widget.kwargs['value']}")
        print(len(images_df[images_df[column_widget.kwargs['column']]==value_widget.kwargs['value']]))
        new_dataframe = images_df[images_df[column_widget.kwargs['column']]==value_widget.kwargs['value']]
        return new_dataframe    
    
    
def select_column(column):
    column_values_list = []
    if column != 'SELECT A COLUMN':
        
        for column_value in images_df[column].unique():
            # check if it's a string
            if isinstance(column_value, str):
                # add to values list
                column_values_list.append(column_value)
            # skip it if it's a nan value
            # nan, or null, from Pandas breaks in the widgets as it's not
            # actually selectable because there's nothing there
            # Can rule it out by testing with the function math.isnan(x)
            elif isnan(column_value):
                pass
            # add whatever else there is
            else:
                column_values_list.append(column_value)
    # sort the list
    # print(column_values_list)
    print(f'# of Unique Values: {len(column_values_list)}')
    column_values_list = sorted(column_values_list)
    column_values_list.insert(0, 'SELECT A VALUE BELOW')
    select2.options = column_values_list   

    
select1 = widgets.Dropdown(options=df_columns)
initial_value = 'SourceFile'
select2 = widgets.Dropdown(options=images_df[initial_value])
column_widget = widgets.interactive(select_column, column=select1)
value_widget = widgets.interactive(print_column, value=select2, column=select1)
display(column_widget)
display(value_widget)

# to access the column, use : column_widget.kwargs['column']
# to access the value, use: value_widget.kwargs['value']
# to access the dataframe where column == value: value_widget.result

interactive(children=(Dropdown(description='column', options=('Unnamed: 0', 'Composite:ImageSize', 'Composite:…

interactive(children=(Dropdown(description='value', options=('SELECT A VALUE BELOW', 0, 1, 2, 3, 4, 5, 6, 7, 8…

## Tips

When showing an image, if the output cell shows a compressed scroll window instead of a full page view, move the mouse to the left below the Out cell and click when a gray rectangle is highlighted in the margin.  It will expand the cell to show the full page.

In [None]:
# check images for first page and pagination stuff

directory_df = value_widget.result

image_list = sorted(directory_df['SourceFile'])
print(f'Images in List: {len(image_list)}')
x=12
for index, image_path in enumerate(image_list, start=1):
    if index > 5 and index < x:
        # open image
        image = Image.open(image_path)
    
        plt.figure()
        plt.title(image_path)
        plt.imshow(image)

image_path = image_list[-3]
image = Image.open(image_path)
plt.figure()
plt.title(image_path)
plt.imshow(image)
        
image_path = image_list[-2]
image = Image.open(image_path)
plt.figure()
plt.title(image_path)
plt.imshow(image)
        
image_path = image_list[-1]
image = Image.open(image_path)
plt.figure()
plt.title(image_path)
plt.imshow(image)