# In [None]:  (Jupyter notebook cell marker left over from export)
from PIL import Image, ImageOps
from pathlib import Path
import numpy as np
import skimage.filters
import scipy.ndimage
import pandas as pd
import skimage.transform
import more_itertools as mit
import math
import os

###################
# pot_scan_to_pkl #
###################
# A function to convert scanned pages into a pkl file of separate Image Arrays

# Parameters:
#              input_page = .png or .tiff file to be processed
#               threshold = Value from 0-255, all pixels < the value go to 0, all pixels >= the value go to
#                           255. If value is not specified it will be automatically determined using Otsu
#                           thresholding. Manual input of threshold is only recommended if the Otsu
#                           threshold doesn't provide satisfactory results.
#   diagonal_pixel_groups = If False, only horizontally and vertically connected pixels form labels,
#                           if True, horizontally, vertically, and diagonally connected pixels can.
#         viable_pot_size = The minimum number of pixels required for a label to be recognised as a
#                           potential pot. By default it is 4000, though this may need to be raised or
#                           lowered manually depending on the nature of the input image.
#              output_pkl = If True, the output data frame is also saved as a .pkl file.
#                           The data frame is returned by the function either way.
#                   nomen = If nomen is given, it is used to name the outputs, otherwise the name is
#                           based on the input page filename


def pot_scan_to_pkl(input_page, threshold=None, diagonal_pixel_groups=True, viable_pot_size=4000, output_pkl=False, nomen=None):
    """Split a scanned page into one cropped, binary image array per pot.

    The page is loaded as 8-bit greyscale, inverted, given a 10 pixel black
    border and binarised to 0/255. Connected groups of 255 pixels are then
    labelled and every group of at least ``viable_pot_size`` pixels is treated
    as a pot. A reference image showing only those groups is always written to
    'reference_image_<nomen>.png' in the current working directory.

    Parameters:
        input_page: Path of the scanned image file to process.
        threshold: Pixels >= this value become 255 and pixels below it become
            0. When omitted (None, or 0) the value is chosen by Otsu
            thresholding and printed.
        diagonal_pixel_groups: If truthy, diagonally touching pixels belong to
            the same group; otherwise only horizontal/vertical neighbours do.
        viable_pot_size: Minimum number of pixels a group needs to count as a
            pot.
        output_pkl: If truthy, the data frame is also pickled to '<nomen>.pkl'.
        nomen: Name used for the output files; defaults to the stem of
            ``input_page``.

    Returns:
        A data frame with one row per pot and a single 'Image Array' column
        holding 0/255 integer arrays cropped to each pot. The frame is empty
        (and a warning is printed) when no group is large enough.
    """

    # The context manager releases the file handle; convert() returns an
    # independent in-memory copy, so the image stays usable afterwards.
    with Image.open(input_page) as page:
        scanned_page = page.convert('L')
    scanned_page = ImageOps.invert(scanned_page)
    scanned_page = ImageOps.expand(scanned_page, border=10, fill='black')
    scanned_page = np.array(scanned_page)

    if not threshold:
        threshold = skimage.filters.threshold_otsu(scanned_page)
        print ('otsu_threshold =', threshold)
    binary_page = np.where(scanned_page >= threshold, 255, 0).astype(np.uint8)

    # Any truthy value selects 8-connectivity, so the structure is always defined.
    if diagonal_pixel_groups:
        structure = [[1, 1, 1],
                     [1, 1, 1],
                     [1, 1, 1]]
    else:
        structure = [[0, 1, 0],
                     [1, 1, 1],
                     [0, 1, 0]]

    labelled_array, _ = scipy.ndimage.label(binary_page, structure=structure)

    # pixels_per_label[k] is the size of label k; index 0 is the background and is
    # skipped explicitly rather than assumed to pass the size test.
    pixels_per_label = np.bincount(labelled_array.ravel())
    pot_labels = np.flatnonzero(pixels_per_label[1:] >= viable_pot_size) + 1

    print ("Pots found =", len(pot_labels))

    # Membership testing (rather than recolouring labels to 255 one at a time)
    # keeps a small label whose id happens to be 255 out of the reference image.
    reference_image = np.where(np.isin(labelled_array, pot_labels), 255, 0).astype(np.uint8)
    if nomen is None:
        nomen = Path(input_page).stem
    Image.fromarray(reference_image).save('reference_image_%s.png' % nomen)

    # A connected group has no empty rows or columns inside its bounding box, so
    # slicing the bounding box is equivalent to stripping blank rows/columns while
    # avoiding a full-page scan for every pot.
    bounding_boxes = scipy.ndimage.find_objects(labelled_array)
    output_pots = []
    for label in pot_labels:
        window = labelled_array[bounding_boxes[label - 1]]
        output_pots.append(np.where(window == label, 255, 0).astype(labelled_array.dtype))

    # Filling an object array cell by cell stops pandas/numpy from trying to stack
    # the 2-D images, so one pot and many pots are handled the same way.
    image_cells = np.empty(len(output_pots), dtype=object)
    for position, pot in enumerate(output_pots):
        image_cells[position] = pot
    pot_data_frame = pd.DataFrame({'Image Array': image_cells})

    if not output_pots:
        print ("SOMETHING HAS CLEARLY GONE WRONG - LESS THAN ONE POT FOUND")

    if output_pkl:
        pot_data_frame.to_pickle('%s.pkl' % nomen)
    return pot_data_frame



######################    
# new_autorotate_all #
######################
# A function to automatically rotate pots to an upright position, or 90/180 degrees from an upright position.
# It may be advisable to run this function several times, starting with a broad degree range and high degree
# increment, then lowering both values once a more accurate range has been established
#
#
# This new version takes into account cases with thick centre lines where there
# might be multiple rotations that offer a max col value by reiterating over the
# values, rather than just selecting the first one.
#
# NOTE - At some point you should do something vaguely statistical to figure out a sensible minimum degree
#        increment - the difference seems to be fairly minimal once you go beyond 1/8th of a degree
#
# Parameters:
#         input_data = A data frame or .pkl file with the images requiring rotation stored in the 'Image Array' column
#       degree_range = An integer value determining the max degrees either side of 0, 90 and -90 that will
#                      be trialled during autorotation. For very lopsided images consider expanding beyond
#                      10, but this is not usually necessary
#   degree_increment = The number of degrees the image is rotated by in each attempt. It is advisable to keep
#                      this value as 1/(2^n) to avoid rounding errors:
#                         1/8 : 0.125
#                        1/16 : 0.0625
#                        1/32 : 0.03125
#                        1/64 : 0.015625
#                       1/128 : 0.0078125
#                       1/256 : 0.00390625
#                       1/512 : 0.001953125
#                      1/1024 : 0.0009765625
# ignore_horizontals = If True, only vertical lines are checked for straightness
#         output_pkl = If True, the output data frame is also saved as a .pkl file (this requires nomen).
#                      The data frame is returned by the function either way.
#              nomen = If nomen is given, it is used to name the output .pkl file; without it no .pkl
#                      file is written, because the input carries no name to base the filename on

def new_autorotate_all(input_data, degree_range=10,degree_increment=1/8, ignore_horizontals=False, output_pkl=False, nomen=None):
    """Rotate every pot image so that its straightest line is vertical.

    For each image a list of trial angles is rendered and the angle whose
    rotated image has the largest single column sum (the straightest vertical
    line) is chosen. Ties are broken by comparing the sum of the 2, 3, ... n
    largest columns; after 20 passes the middle remaining candidate is taken.
    When horizontal lines are included and the chosen angle lies nearer to
    +/-90 than to 0, a quarter turn is removed so the pot ends up upright
    rather than on its side.

    Parameters:
        input_data: A data frame with the images in its 'Image Array' column,
            or the path of a pickled data frame.
        degree_range: Maximum number of degrees trialled either side of 0
            (and of 90 and -90 unless ``ignore_horizontals`` is set).
        degree_increment: Step in degrees between trial angles; must be > 0.
        ignore_horizontals: If truthy, only angles around 0 are trialled.
        output_pkl: If truthy and ``nomen`` is given, the result is also
            pickled to 'rotated_<nomen>.pkl'.
        nomen: Name used for the pickle file.

    Returns:
        The data frame with 'Image Array' replaced by the rotated, cropped
        images. A data frame passed in is modified in place and returned.

    Raises:
        TypeError: If ``input_data`` is neither a data frame nor a path.
        ValueError: If ``degree_increment`` is not positive.
    """

    if isinstance(input_data, (str, os.PathLike)):
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        data_frame = pd.read_pickle(input_data)
    elif isinstance(input_data, pd.DataFrame):
        data_frame = input_data
    else:
        raise TypeError("input_data must be a DataFrame or the path of a pickled DataFrame")
    if degree_increment <= 0:
        raise ValueError("degree_increment must be greater than 0")

    index_list = list(data_frame.index.values)
    rotated_pot_list = []

    bot_degree_list = np.arange(-90, -90+degree_range+degree_increment, degree_increment).tolist()
    mid_degree_list = np.arange(-degree_range, degree_range+degree_increment, degree_increment).tolist()
    top_degree_list = np.arange(90-degree_range, 90+degree_increment, degree_increment).tolist()

    if ignore_horizontals:
        degree_list = mid_degree_list
    else:
        degree_list = bot_degree_list+mid_degree_list+top_degree_list

    for x in index_list:
        pot_array = data_frame.at[x, 'Image Array']

        # Every trial rotates the original image, never an already rotated one,
        # so resampling errors do not accumulate.
        value_list = [_rotate_pot(pot_array, angle).sum(axis=0).max() for angle in degree_list]
        best_value = max(value_list)
        short_degree_list = [angle for angle, value in zip(degree_list, value_list) if value == best_value]

        # Tie-break. The sorted column sums of each remaining candidate are computed
        # once up front; the passes below only compare growing prefixes of them.
        # The first pass (top 1 column) cannot separate candidates but is kept so the
        # reported iteration count and the 20-pass cut-off match earlier runs.
        counter = 0
        if len(short_degree_list) > 1:
            candidates = [(angle, sorted(_rotate_pot(pot_array, angle).sum(axis=0), reverse=True))
                          for angle in short_degree_list]
            while len(candidates) > 1:
                scores = [sum(columns[:counter+1]) for _, columns in candidates]
                best_score = max(scores)
                candidates = [candidate for candidate, score in zip(candidates, scores) if score == best_score]
                counter += 1

                # Guard against an endless loop between indistinguishable candidates.
                if counter > 19:
                    candidates = [candidates[len(candidates) // 2]]
            short_degree_list = [angle for angle, _ in candidates]

        desired_rotation = short_degree_list[0]

        # Angles beyond +/-45 can only come from the lists centred on +/-90, i.e. the
        # line that was straightened is a horizontal one. Using the midpoint instead
        # of a fixed 80 keeps this correct for any degree_range.
        based_on_centre_line = True
        if not ignore_horizontals:
            if desired_rotation > 45:
                desired_rotation -= 90
                based_on_centre_line = False
            elif desired_rotation < -45:
                desired_rotation += 90
                based_on_centre_line = False

        # A single rotation by the net angle avoids resampling the image twice.
        rotated_pot = _rotate_pot(pot_array, desired_rotation)
        rotated_pot = rotated_pot[np.ix_(rotated_pot.any(1),rotated_pot.any(0))]
        rotated_pot_list.append(rotated_pot)
        print ('Pot_%s rotated by %s degrees' % (x, -desired_rotation), '(Centre line = %s,' % (based_on_centre_line), '%s additional iterations)' % (counter))

    data_frame['Image Array'] = rotated_pot_list
    if output_pkl:
        if nomen is None:
            print ("NO .pkl PRODUCED - NOMEN REQUIRED FOR .pkl OUTPUT")
        else:
            data_frame.to_pickle('rotated_%s.pkl' % nomen)
    return data_frame


def _rotate_pot(pot_array, angle):
    # Rotate about the image centre with nearest-neighbour sampling (order=0),
    # growing the canvas to fit and keeping the original value range.
    return skimage.transform.rotate(pot_array, angle=angle, resize=True, center=None, preserve_range=True, order=0)

##############################
# mirror_and_trim_centre_all #
##############################
# A function to take fully rotated pot image arrays where the left hand side of the image is a cross-section,
# and the right hand side depicts exterior details, and output a complete cross-section based on the left hand
# side by mirroring it and erasing the central vertical line of the image. Outputs may still contain extraneous
# horizontal lines and details which will need to be dealt with later. It is probably best to manually identify
# pots prior to applying this function, as it erases much of the detail that helps with identification.

# Parameters:
#          input_data : A data frame (or the path of a .pkl file containing one) with the arrays requiring
#                       processing stored in the 'Image Array' column
# centre_slice_pixels : The width in pixels of the rightmost slice of the left hand side to be removed, processed
#                       and replaced to remove the centre line. For narrow top/bottomed vessels, this value may
#                       need to be smaller
#  centre_line_pixels : The number of white pixels each row of the centre slice needs to exceed in order for that
#                       row not to be turned to black. Images with thick or irregular central lines may need
#                       higher values. This value should not exceed the centre_slice_pixels value

def mirror_and_trim_centre_all(input_data, centre_slice_pixels=10, centre_line_pixels=4, output_pkl=False, nomen=None):
    """Build a full cross-section of each pot by mirroring its left-hand side.

    For every image the brightest column within the central quarter is taken
    as the centre line. Everything up to and including that column is kept,
    rows of its rightmost ``centre_slice_pixels`` columns that hold little
    more than the centre line are blanked, and the result is joined to its
    own mirror image. If more than one connected foreground group remains,
    only the two most populous labels (normally background and pot body) are
    kept. Finally all-blank rows and columns are stripped.

    Parameters:
        input_data: A data frame with the images in its 'Image Array' column,
            or the path of a pickled data frame.
        centre_slice_pixels: Width in pixels of the strip next to the centre
            line in which the centre line is erased.
        centre_line_pixels: Rows of that strip whose sum is at most
            255 * centre_line_pixels are blanked.
        output_pkl: If truthy and ``nomen`` is given, the result is also
            pickled to 'mirrored_<nomen>.pkl'.
        nomen: Name used for the pickle file.

    Returns:
        The data frame with 'Image Array' replaced by the mirrored images.
        A data frame passed in is modified in place and returned; the image
        arrays it held are left untouched.

    Raises:
        TypeError: If ``input_data`` is neither a data frame nor a path.
    """

    if isinstance(input_data, (str, os.PathLike)):
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        data_frame = pd.read_pickle(input_data)
    elif isinstance(input_data, pd.DataFrame):
        data_frame = input_data
    else:
        raise TypeError("input_data must be a DataFrame or the path of a pickled DataFrame")

    # Diagonal neighbours count as connected when looking for stray fragments.
    full_connectivity = [[1, 1, 1],
                         [1, 1, 1],
                         [1, 1, 1]]

    mirrored_pot_list = []
    for i in list(data_frame.index.values):
        pot_array = np.asarray(data_frame.at[i, 'Image Array'])
        pot_name = 'Pot_%s' % i

        # Blank the outer 3/8 on each side so that only the central quarter competes
        # for "brightest column". Slicing from an explicit end index (instead of a
        # negative one) keeps this a no-op for images too narrow to have a margin.
        column_values = pot_array.sum(axis=0)
        width = len(column_values)
        centre_focus = int(width*3/8)
        column_values[:centre_focus] = 0
        column_values[width-centre_focus:] = 0
        centre_index = int(np.argmax(column_values))

        # Work on a copy so the array stored in the data frame is not altered.
        lhs_pot = pot_array[:, :centre_index+1].copy()

        # centre_slice is a view into lhs_pot, so blanking its rows edits lhs_pot
        # directly. The boolean mask also copes with "no rows to blank".
        centre_slice = lhs_pot[:, -centre_slice_pixels:]
        line_only_rows = centre_slice.sum(axis=1) <= 255*centre_line_pixels
        centre_slice[line_only_rows, :] = 0

        output_pot = np.concatenate((lhs_pot, np.flip(lhs_pot, 1)), 1)

        # Remove rogue pixel groups left behind by an irregular centre line.
        labelled_array, _ = scipy.ndimage.label(output_pot, structure=full_connectivity)
        unique_labels, pix_per_label = np.unique(labelled_array, return_counts=True)
        if len(unique_labels) == 2:
            print ("%s: No corrections" % pot_name)
        elif len(unique_labels) < 2:
            print ("%s: This shouldn't happen - something's gone wrong!" % pot_name)
        else:
            # Stable ordering by descending size; equal sizes keep the lower label first.
            largest_first = np.argsort(-pix_per_label, kind='stable')
            kept_labels = unique_labels[largest_first[:2]]
            keep_mask = np.isin(labelled_array, kept_labels[kept_labels > 0])
            output_pot = np.where(keep_mask, 255, 0).astype(labelled_array.dtype)
            print ("%s: %s corrections" % (pot_name, len(unique_labels)-2))

        output_pot = output_pot[np.ix_(output_pot.any(1),output_pot.any(0))]
        mirrored_pot_list.append(output_pot)

    data_frame['Image Array'] = mirrored_pot_list
    if output_pkl:
        if nomen is None:
            print ("NO .pkl PRODUCED - NOMEN REQUIRED FOR .pkl OUTPUT")
        else:
            data_frame.to_pickle('mirrored_%s.pkl' % nomen)
    return data_frame


############################
# internal_gap_removal_all #
############################
# A function to remove any gaps within the cross-section walls of the vessel image. Probably best to apply
# this function after horizontal lines have been erased, as otherwise it may interpret the space between
# unwanted horizontal lines and the vessel walls as internal space to be deleted. For vessels with handles
# or internal subdivisions, ensure that the min_pixel_group_size value is sufficiently small so as not to
# misinterpret the spaces between handles and vessel walls as internal spaces to be erased.

# Parameters:
#           input_data : A data frame (or the path of a .pkl file containing one) with the arrays requiring
#                        processing stored in the 'Image Array' column
# min_pixel_group_size : The minimum size a group of pixels must be to avoid erasure. Pixel groups at least
#                        this large are assumed to be the interior volume of the vessel, the pixels surrounding
#                        the outer wall of the vessel, and any other desired 'blank spaces' (subdivisions of the
#                        interior volume, the space between handle and outside wall etc.).



#PUT IN A THING TO EXEMPT POTS THAT WE DON'T WANT MESSED WITH





def internal_gap_removal_all(input_data, min_pixel_group_size = 50, exempt_pots = None, output_pkl=False, nomen=None):
    """Fill small enclosed gaps inside the walls of each pot image.

    Each image is clamped to 8-bit greyscale, surrounded by a 10 pixel blank
    border and inverted, so that every non-pot region becomes a labelled
    group of pixels. The border ensures everything outside the pot forms one
    group. Groups smaller than ``min_pixel_group_size`` are treated as gaps
    within the vessel wall and become part of the pot; larger groups are
    kept as blank space. The border is then stripped again.

    Parameters:
        input_data: A data frame with the images in its 'Image Array' column,
            or the path of a pickled data frame.
        min_pixel_group_size: Blank regions with at least this many pixels
            are preserved; smaller ones are filled in.
        exempt_pots: Optional collection of data frame index values whose
            images are passed through unchanged.
        output_pkl: If truthy and ``nomen`` is given, the result is also
            pickled to 'intgapped_<nomen>.pkl'.
        nomen: Name used for the pickle file.

    Returns:
        The data frame with 'Image Array' replaced by the processed images
        (uint8 arrays of 0/255, except for exempt pots). A data frame passed
        in is modified in place and returned.

    Raises:
        TypeError: If ``input_data`` is neither a data frame nor a path.
    """

    if isinstance(input_data, (str, os.PathLike)):
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        data_frame = pd.read_pickle(input_data)
    elif isinstance(input_data, pd.DataFrame):
        data_frame = input_data
    else:
        raise TypeError("input_data must be a DataFrame or the path of a pickled DataFrame")

    exempt = set(exempt_pots) if exempt_pots is not None else set()

    int_gap_removed_list = []
    for i in list(data_frame.index.values):
        pot_array = data_frame.at[i, 'Image Array']
        pot_name = 'Pot_%s' % i

        if i in exempt:
            int_gap_removed_list.append(pot_array)
            print ("%s: exempt, left unchanged" % pot_name)
            continue

        # After inversion the pot body is 0 (background for labelling) and every
        # blank region - exterior, interior and wall gaps - is non-zero.
        bordered = np.pad(_to_uint8_greyscale(pot_array), 10, mode='constant', constant_values=0)
        blank_space = 255 - bordered

        # Default connectivity: only horizontal/vertical neighbours join a group.
        labelled_array, num_features = scipy.ndimage.label(blank_space)

        # group_sizes[k - 1] is the pixel count of label k; label 0 (the body) is skipped.
        group_sizes = np.bincount(labelled_array.ravel(), minlength=num_features+1)[1:]
        good_labels = np.flatnonzero(group_sizes >= min_pixel_group_size) + 1
        erased_sizes = group_sizes[group_sizes < min_pixel_group_size]
        number_of_erased_labels = len(erased_sizes)

        # Everything that is not a preserved blank region is pot: the body itself
        # plus the small gaps being filled.
        is_pot = ~np.isin(labelled_array, good_labels)
        output_array = np.where(is_pot, 255, 0).astype(np.uint8)
        output_array = output_array[np.ix_(output_array.any(1),output_array.any(0))]
        int_gap_removed_list.append(output_array)

        # If the largest deletion is unexpectedly big, a wanted space (e.g. between
        # handle and wall) was probably filled; lower min_pixel_group_size.
        print ("%s: %s labels removed" %(pot_name, number_of_erased_labels))
        if number_of_erased_labels > 0:
            print ('Largest deletion: %s pixels' % erased_sizes.max())

    data_frame['Image Array'] = int_gap_removed_list
    if output_pkl:
        if nomen is None:
            print ("NO .pkl PRODUCED - NOMEN REQUIRED FOR .pkl OUTPUT")
        else:
            data_frame.to_pickle('intgapped_%s.pkl' % nomen)
    return data_frame


def _to_uint8_greyscale(image_array):
    # Clamp a boolean or numeric image array to 0-255 and store it as uint8.
    image_array = np.asarray(image_array)
    if image_array.dtype == bool:
        return image_array.astype(np.uint8) * 255
    return np.clip(image_array, 0, 255).astype(np.uint8)


##################
# area_labelling #
##################
# A function to label fully processed image_arrays so that their measurements and shape characteristics can be
# determined. The output labels are as follows:
# 0 : The body of the vessel
# 1 : The interior of the vessel
# 2 : The exterior of the vessel
# 3 : If present in the image, any additional spaces that cannot be certainly identified as interior or exterior.
#     These areas are typically enclosed areas between handles and vessel exterior walls, or errors resulting
#     from the survival of unidentified interior lines or unidentified spaces within the vessel walls
# This array is saved to the pickle file under a new 'Labelled Array' column. The function also produces a version
# of this array with the values multiplied by 85 and saves it to the pickle file under a new 'Labelled Display Array'
# column. The arrays in this column can be converted into human-readable greyscale .pngs to check for labelling errors.
# The colours and pixel values of the display .pngs are as follows:
#     0 (body) : Black (0)
# 1 (interior) : Dark Grey (85)
# 2 (exterior) : Light Grey (170)
#    3 (other) : White (255)

# Parameters:
# input_data : A data frame with the arrays to be labelled stored in the 'Image Array' column
# hatched_interior: A True/False statement. If True, it assumes the scanned vessel does not have a filled-in interior
#                   (i.e. it is outline only or has a hatched interior) and then assumes that any interior gaps found
#                   need to be filled in. !REQUIRES CAREFUL PREPROCESSING!
# est_int_height: The estimated height of the interior. Usually 1/3 is fine (i.e. a third of the way down from the
#                 top), but for some vessel types like lids, a lower value may be needed

def area_labelling_all(input_data, hatched_interior=False, est_int_height=(1/3), output_pkl=False, nomen=None):
    """Label every 'Image Array' in a data frame as body / interior / exterior / other.

    Output labels in each 'Labelled Array':
        0 : body of the vessel
        1 : main interior of the vessel
        2 : exterior (any non-body region touching the edge of the array)
        3 : any other enclosed, non-body region

    Parameters:
        input_data       : a pandas DataFrame, or the path to a .pkl file holding
                           one, with the arrays to label in the 'Image Array' column.
        hatched_interior : if truthy, enclosed "other" regions (label 3) are
                           treated as body (label 0) instead.
        est_int_height   : fraction of the array height (measured from the top
                           row) at which the centre column is probed to find the
                           interior. Clamped to the valid row range.
        output_pkl       : if True and nomen is given, the result is saved as
                           'labelled_<nomen>.pkl'.
        nomen            : name used for the output .pkl file.

    Returns:
        A new DataFrame with the 'Image Array' column removed and a
        'Labelled Array' column added. The input frame is left untouched.

    Raises:
        TypeError: if input_data is neither a str nor a DataFrame.
    """

    if isinstance(input_data, str):
        data_frame = pd.read_pickle(input_data)
    elif isinstance(input_data, pd.DataFrame):
        data_frame = input_data
    else:
        raise TypeError("input_data must be a DataFrame or the path to a .pkl file, not %s"
                        % type(input_data).__name__)

    # A 3x3 all-True structure makes diagonally touching pixels part of the
    # same labelled group.
    structure = np.ones((3, 3), dtype=bool)

    labelled_pot_list = []
    for i in data_frame.index.values:
        pot_name = 'Pot_%s' % i

        # Converts the array to an 8bit greyscale image and inverts it so that the
        # labelling picks up groups of non vessel body pixels, then converts it
        # back to an array.

        pot_image = Image.fromarray(data_frame.at[i, 'Image Array']).convert('L')
        pot_array = np.array(ImageOps.invert(pot_image))

        # Labels the unique non-zero pixel groups with unique positive integers
        # (vessel body pixels stay 0).

        labelled_array, _ = scipy.ndimage.label(pot_array, structure=structure)
        (row_number, col_number) = labelled_array.shape
        max_row_index = row_number - 1
        max_col_index = col_number - 1

        # Probes the centre column at est_int_height of the way down the array.
        # If that pixel is vessel body, pixels in higher rows of the same column
        # are tested until a non-body pixel is found. The search stops at the top
        # row rather than wrapping round to the bottom of the array.

        centre_col = int(max_col_index / 2)
        start_row = min(max(int(max_row_index * est_int_height), 0), max_row_index)
        centre_column = labelled_array[:start_row + 1, centre_col]
        non_body_rows = np.flatnonzero(centre_column)

        output_array = labelled_array.copy()
        if non_body_rows.size:
            # The last entry is the non-body pixel nearest to (or at) start_row.
            interior_label = centre_column[non_body_rows[-1]]
            output_array[labelled_array == interior_label] = -1
        else:
            print('WARNING: no interior found for %s' % pot_name)

        # Any remaining positive label with a pixel on the outside edge of the
        # array is exterior (-2). Remaining positive labels are neither the main
        # interior nor the exterior (e.g. the gap between a wall and a handle)
        # and become -3.

        edge_labels = np.unique(np.concatenate((output_array[0, :],
                                                output_array[-1, :],
                                                output_array[:, 0],
                                                output_array[:, -1])))
        edge_labels = edge_labels[edge_labels > 0]
        output_array[np.isin(output_array, edge_labels)] = -2
        output_array[output_array > 0] = -3

        # For outline-only or crosshatched vessels any enclosed internal space is
        # assumed to be body.

        if hatched_interior:
            output_array[output_array == -3] = 0

        # Negating makes all four labels non-negative (0, 1, 2, 3).

        labelled_pot_list.append(-output_array)
        print ('%s labelled' % pot_name)

    # drop() returns a new frame, so adding the labelled column afterwards does
    # not modify the caller's data frame. The old image arrays are removed to
    # save space.

    data_frame = data_frame.drop('Image Array', axis=1)
    data_frame['Labelled Array'] = labelled_pot_list
    if output_pkl is True:
        if nomen is None:
            print ("NO .pkl PRODUCED - NOMEN REQUIRED FOR .pkl OUTPUT")
        else:
            data_frame.to_pickle('labelled_%s.pkl' % nomen)
    return data_frame


    
#########################
# output_display_arrays #
#########################

# A function to output display versions of the labelled arrays in a .pkl file as .pngs.
# Originally these arrays were saved on the .pkl, but to save space these are now only 
# produced when required.
#
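# Parameters:
# input_data : A data frame, or the path to a .pkl file containing one, with the labelled arrays stored in the 'Labelled Array' column
#      nomen : A string used to name the output folder, which is created as '(nomen)-Display_Arrays'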


def output_display_arrays(input_data, nomen=None):
    """Write every 'Labelled Array' in a data frame to a greyscale .png.

    Parameters:
        input_data : a pandas DataFrame, or the path to a .pkl file holding one,
                     with the arrays stored in the 'Labelled Array' column.
        nomen      : name used for the output folder '<nomen>-Display_Arrays'.
                     If omitted and input_data is a path, the file name without
                     its extension is used.

    Raises:
        TypeError:  if input_data is neither a str nor a DataFrame.
        ValueError: if input_data is a DataFrame and no nomen is given.
    """

    if isinstance(input_data, str):
        data_frame = pd.read_pickle(input_data)
        if nomen is None:
            nomen = Path(input_data).stem
    elif isinstance(input_data, pd.DataFrame):
        data_frame = input_data
        if nomen is None:
            raise ValueError("nomen is required to name the output folder when input_data is a DataFrame")
    else:
        raise TypeError("input_data must be a DataFrame or the path to a .pkl file, not %s"
                        % type(input_data).__name__)

    # Files are written by path into the folder rather than by changing the
    # working directory, so a failure part-way through cannot leave the process
    # in the wrong directory. An existing folder is reused.

    folder_name = nomen + '-Display_Arrays'
    os.makedirs(folder_name, exist_ok=True)

    # Multiplies each array by 85 to make the labels more visually distinct
    # (0, 85, 170, 255), then outputs it as a .png. If the dataframe contains a
    # Designation column the filenames are derived from it, with the index added
    # to avoid clashes between pots given the same designation; otherwise they
    # are derived from the index values alone.

    has_designations = 'Designation' in data_frame.columns
    for i in data_frame.index.values:
        display_array = data_frame.at[i, 'Labelled Array'] * 85
        output_image = Image.fromarray(display_array).convert('L')
        if has_designations:
            pot_name = data_frame.at[i, 'Designation']
            file_name = '%s.png' % (str(pot_name) + "_at_index_" + str(i))
        else:
            file_name = 'pot_at_index_%s.png' % str(i).zfill(5)
        output_image.save(os.path.join(folder_name, file_name))
    
    

        
    
    
    
    
def the_lot_2(input_dir_list, get_measures=False, horiz_remove=False):
    """Run the full scan-to-labelled-png pipeline on each file in input_dir_list.

    For every file the page is scanned, autorotated, mirrored, has its internal
    gaps removed and is labelled (saved as 'labelled_<stem>.pkl'), after which
    display .pngs are written. The file name without its extension is used as
    the nomen for every step.

    Parameters:
        input_dir_list : iterable of paths to the scanned page images.
        get_measures   : currently unused; kept so existing calls keep working.
        horiz_remove   : horizontal line removal is not implemented. Passing
                         True raises NotImplementedError before any work is done.

    Raises:
        NotImplementedError: if horiz_remove is True.
    """

    # Previously this branch only failed after the whole pipeline had run for
    # the first file, because no labelled data frame was ever produced for the
    # png output step. Fail fast instead.
    if horiz_remove is True:
        raise NotImplementedError("JUST DON'T DO IT - horizontal line removal is not implemented, "
                                  "use horiz_remove=False")

    for filename in input_dir_list:
        nomen = Path(filename).stem
        print (nomen)
        print ("SCANNING IMAGE")
        scan_output_df = pot_scan_to_pkl(input_page=filename, threshold=None, diagonal_pixel_groups=True, viable_pot_size=6000, output_pkl=False, nomen=nomen)
        print ("AUTOROTATING")
        rotate_output_df = new_autorotate_all(input_data=scan_output_df, degree_range=2, degree_increment=1/8, ignore_horizontals=False, output_pkl=False, nomen=nomen)
        print ("MIRRORING")
        mirrored_output_df = mirror_and_trim_centre_all(input_data=rotate_output_df, centre_slice_pixels=10, centre_line_pixels=6, output_pkl=False, nomen=nomen)
        print ("GAP REMOVAL")
        intgapped_output_df = internal_gap_removal_all(input_data=mirrored_output_df, output_pkl=False, nomen=nomen)
        print ("LABELLING")
        labelled_output_df = area_labelling_all(input_data=intgapped_output_df, hatched_interior=True, est_int_height=1/3, output_pkl=True, nomen=nomen)
        print ("PRODUCING PNG OUTPUT")
        output_display_arrays(input_data=labelled_output_df, nomen=nomen)
        print()

        
        
        
        
        
def add_designations(input_pickle, designation_list=None, concat_pickle=None, ignore_index=True):
    """Add a 'Designation' column to a pickled data frame and output display .pngs.

    The designated frame is always saved as 'Designated_<name>.pkl'. If
    concat_pickle is given, the designated frame is also appended to the frame
    stored there and the result saved as 'Concat_<concat name>.pkl'; the display
    .pngs are then produced from the concatenated frame, otherwise from the
    designated frame.

    Parameters:
        input_pickle     : path to the .pkl file to add designations to.
                           <name> is input_pickle[9:-4], which presumably strips
                           a 'labelled_' prefix and the '.pkl' suffix - verify
                           against the file names actually passed in.
        designation_list : one designation per row of the data frame (an empty
                           list if omitted).
        concat_pickle    : optional path to a .pkl file to concatenate onto.
                           Its last four characters are assumed to be '.pkl'.
        ignore_index     : passed to pandas.concat.
    """

    data_frame = pd.read_pickle(input_pickle)
    data_frame['Designation'] = [] if designation_list is None else designation_list

    print (data_frame)

    designated_name = input_pickle[9:-4]
    concat_frame = None
    concat_name = None

    if concat_pickle is not None:
        concat_name = 'Concat_%s' % concat_pickle[:-4]
        existing_frame = pd.read_pickle(concat_pickle)
        concat_frame = pd.concat([existing_frame, data_frame], ignore_index=ignore_index)
        print (concat_frame)
        concat_frame.to_pickle('%s.pkl' % concat_name)

    # The designated data frame is saved whether or not it was concatenated

    data_frame.to_pickle('Designated_%s.pkl' % designated_name)

    # nomen is always passed explicitly: output_display_arrays needs it to name
    # the output folder.

    if concat_frame is not None:
        output_display_arrays(concat_frame, nomen=concat_name)
    else:
        output_display_arrays(data_frame, nomen=str(designated_name))
        
        

################
# get_measures #
################
# A function to derive a series of measures from a pickle file containing a labelled array

# Parameters:
# input_pickle : A data frame containing labelled arrays from which measures can be derived stored in the 'Labelled Array' column
#  output_name : A string to save the final output as. If undefined, the pickle is saved as 'FINAL_(input_pickle).pkl'

def _crop_to_content(mask):
    """Return mask restricted to the rows and columns holding a non-zero pixel."""
    # NOTE(review): boolean selection also removes any all-zero row or column
    # lying between occupied ones, not just the outer margins - confirm that is
    # acceptable for vessels drawn in disconnected parts.
    return mask[np.ix_(mask.any(1), mask.any(0))]


def _exposed_edge_count(mask):
    """Count pixel edges where a non-zero pixel borders a zero pixel or the array edge."""
    # Padding with a one-pixel border of zeros lets pixels on the outer edge of
    # the array be compared with their (empty) outside neighbours.
    filled = np.pad(np.asarray(mask) != 0, pad_width=1, mode='constant')
    core = filled[1:-1, 1:-1]
    neighbours = (filled[2:, 1:-1], filled[:-2, 1:-1], filled[1:-1, 2:], filled[1:-1, :-2])
    return int(sum(np.count_nonzero(core & ~neighbour) for neighbour in neighbours))


def get_measures(input_pickle, output_name=None):
    """Derive size and shape measures from the labelled arrays in a pickle file.

    Three masks are built from each 'Labelled Array': the body (label 0), the
    interior (label 1) and the silhouette (labels 0 and 1 together). Heights,
    widths, areas, vertical centroids and perimeters (all in pixels) are taken
    from them, and ratios, rectangularity (area / bounding box area) and
    circularity (4*pi*area / perimeter^2) are derived from those.

    Parameters:
        input_pickle : path to a .pkl file holding a data frame with the arrays
                       stored in the 'Labelled Array' column.
        output_name  : name to save the output under ('.pkl' is appended if
                       missing). If omitted, the output is saved next to the
                       input file as 'FINAL_<input file name>'.

    Returns:
        The data frame with the measure columns added.
    """

    data_frame = pd.read_pickle(input_pickle)
    index_list = list(data_frame.index.values)
    total = len(index_list)

    print ("PKL FILE UNPACKED")
    print ()

    # Measured values, keyed by output column name in output column order.

    measured = {name: [] for name in (
        'Body Height', 'Body Width', 'Body Area', 'Body Centroid', 'Body Perimeter',
        'Interior Height', 'Interior Width', 'Interior Area',
        'Interior Centroid Relating To Body Height',
        'Interior Centroid Relating To Interior Height',
        'Interior Perimeter',
        'Silhouette Centroid', 'Silhouette Area', 'Silhouette Perimeter')}

    # Progress is reported by position so that it works for any index labels.

    for position, i in enumerate(index_list, start=1):
        labelled_array = np.asarray(data_frame.at[i, 'Labelled Array'])

        # Body: cropped to the occupied rows and columns, so its height and width
        # are the shape of the cropped array. Because the pots have been mirrored
        # around the centre, only the vertical centroid is recorded.

        body_array = _crop_to_content(np.where(labelled_array == 0, 1, 0))
        (body_height, body_width) = body_array.shape
        (body_v_centroid, _) = scipy.ndimage.center_of_mass(body_array)

        measured['Body Height'].append(body_height)
        measured['Body Width'].append(body_width)
        measured['Body Area'].append(np.count_nonzero(body_array))
        measured['Body Centroid'].append(body_v_centroid)
        measured['Body Perimeter'].append(_exposed_edge_count(body_array))

        # Interior: the vertical centroid is taken twice, once before cropping
        # (row position within the whole labelled array) and once after cropping
        # (row position within the interior alone).
        # NOTE(review): the uncropped centroid is later divided by the cropped
        # body height - confirm the labelled arrays carry no empty rows above
        # the vessel.

        interior_array = np.where(labelled_array == 1, 1, 0)
        (interior_v_centroid_r_bod, _) = scipy.ndimage.center_of_mass(interior_array)
        trimmed_interior_array = _crop_to_content(interior_array)
        (interior_height, interior_width) = trimmed_interior_array.shape
        (interior_v_centroid_r_int, _) = scipy.ndimage.center_of_mass(trimmed_interior_array)

        measured['Interior Height'].append(interior_height)
        measured['Interior Width'].append(interior_width)
        measured['Interior Area'].append(np.count_nonzero(trimmed_interior_array))
        measured['Interior Centroid Relating To Body Height'].append(interior_v_centroid_r_bod)
        measured['Interior Centroid Relating To Interior Height'].append(interior_v_centroid_r_int)
        measured['Interior Perimeter'].append(_exposed_edge_count(trimmed_interior_array))

        # Silhouette: body and interior together. Height and width are not
        # recorded separately; the body height and width are used instead.

        sil_array = _crop_to_content(np.where((labelled_array == 0) | (labelled_array == 1), 1, 0))
        (sil_v_centroid, _) = scipy.ndimage.center_of_mass(sil_array)

        measured['Silhouette Centroid'].append(sil_v_centroid)
        measured['Silhouette Area'].append(np.count_nonzero(sil_array))
        measured['Silhouette Perimeter'].append(_exposed_edge_count(sil_array))

        print("Initial Measuring: %s of %s" % (position, total))

    for column_name, values in measured.items():
        data_frame[column_name] = values

    print ()
    print ("ALL INITAL MEASURES TAKEN")
    print ()

    # The derived measures are calculated column-wise from the values above.

    body_height = data_frame['Body Height']
    body_width = data_frame['Body Width']
    body_area = data_frame['Body Area']
    body_perimeter = data_frame['Body Perimeter']
    interior_height = data_frame['Interior Height']
    interior_width = data_frame['Interior Width']
    interior_area = data_frame['Interior Area']
    interior_perimeter = data_frame['Interior Perimeter']
    sil_area = data_frame['Silhouette Area']
    sil_perimeter = data_frame['Silhouette Perimeter']

    data_frame['Body Width/Height'] = body_width / body_height
    data_frame['Body Centroid/Height'] = data_frame['Body Centroid'] / body_height
    data_frame['Body Rectangularity'] = body_area / (body_height * body_width)
    data_frame['Body Circularity'] = (4 * math.pi * body_area) / (body_perimeter * body_perimeter)

    data_frame['Interior Width/Height'] = interior_width / interior_height
    data_frame['Interior Centroid/Interior Height'] = (
        data_frame['Interior Centroid Relating To Interior Height'] / interior_height)
    data_frame['Interior Centroid/Body Height'] = (
        data_frame['Interior Centroid Relating To Body Height'] / body_height)
    data_frame['Interior Rectangularity'] = interior_area / (interior_height * interior_width)
    data_frame['Interior Circularity'] = (4 * math.pi * interior_area) / (interior_perimeter * interior_perimeter)

    data_frame['Silhouette Centroid/Height'] = data_frame['Silhouette Centroid'] / body_height
    data_frame['Silhouette Rectangularity'] = sil_area / (body_height * body_width)
    data_frame['Silhouette Circularity'] = (4 * math.pi * sil_area) / (sil_perimeter * sil_perimeter)

    print ("ALL CALCULATED MEASURES DERIVED")

    # The intermediate mask arrays are never stored on the data frame; any
    # columns of those names already present in the input are removed so that
    # the output columns stay the same as before.

    data_frame = data_frame.drop(columns=['Body Array', 'Interior Array', 'Silhouette Array'], errors='ignore')
    print (data_frame)

    if output_name is None:
        # The prefix goes on the file name only, so inputs given with a
        # directory are saved alongside the input file.
        source_path = Path(input_pickle)
        output_path = str(source_path.with_name('FINAL_%s' % source_path.name))
    else:
        output_path = str(output_name)
        if not output_path.endswith('.pkl'):
            output_path += '.pkl'
    data_frame.to_pickle(output_path)
    return data_frame
    

