## This is a practice range for cleaning the data when Mom inputs a .csv file
- End goal:
    - Mom inputs a .csv file using drag and drop on streamlit dashboard
    - Sorts the plants into the categories
    - Instantly creates a plant list based on the QTY and ID columns
    - Populates each ID with pictures/captions and creates a word document for easy formatting
    - exports on streamlit to .docx or pdf or wutever

In [1]:
import numpy as np
import pandas as pd
import os
from PIL import Image

In [2]:
pd.set_option('display.max_rows', 100)

In [3]:
os.getcwd()

'c:\\Users\\dbawa\\ZZ__School_Related\\z_other\\Code\\Personal_Projects\\FlowerPower\\flower_sandbox'

In [4]:
df = pd.read_csv("../inputs/user_input/BARTLEY STEWART- Plant list JAN2023 copy for TEST.csv")

In [5]:
# Set the column names
df.columns = df.iloc[7]

In [6]:
# grab the name of the project for automatic naming
nameNFMT = df.loc[1, "Common Name"]
projectName = "The "+nameNFMT+" Project"

# Remove the first 5 rows while preserving column names
df = df.iloc[9:, :]
df = df.reset_index(drop=True)

In [7]:
# Drop all NaN only rows & columns
df = df.dropna(how='all')
df = df.dropna(axis=1, how='all')
df = df.reset_index(drop=True)

In [8]:
# iterate through the Qty row to get the plant group names and their index values
plant_groups_wIDX = {}
for index, value in df['Qty'].items():
    if not value.isnumeric():
        if value.startswith("ADDED after"):
            addedAfterIndex = index
            break
        else:
            plant_groups_wIDX[value] = index

In [9]:
# create a dictionary to hold the list of IDs for each group
plant_groups_wID = {key: [] for key in plant_groups_wIDX.keys()}

# converting dict keys into a list for index access
keys_list = list(plant_groups_wIDX.keys())

for i in range(len(keys_list)):
    # if not at the last element in keys_list
    if i != len(keys_list) - 1:
        # take IDs from the index of the current key till the index of the next key
        plant_groups_wID[keys_list[i]] = df['ID'].iloc[plant_groups_wIDX[keys_list[i]] + 1 : plant_groups_wIDX[keys_list[i+1]]].tolist()
    else:
        # for the last key, take IDs till the end of the dataframe
        plant_groups_wID[keys_list[i]] = df['ID'].iloc[plant_groups_wIDX[keys_list[i]] + 1 : addedAfterIndex].tolist()

In [10]:
df

7,NaN,Qty,ID,Botanical Name,Common Name,Spacing,Scheduled Size,Remarks
0,,Deciduous Trees,,,,,,
1,,Conifers,,,,,,
2,,2,CH OB G,Chamaecyparis obtusa 'Gracilis',Slender Hinoki False Cypress,,7 - 8',
3,,3,TH PL GG,Thuja plicata 'Green Giant',Green Giant Arborvitae,,8 - 10’ B&B,
4,,Shrubs,,,,,,
5,,1,CO AL IH,Cornus alba 'Ivory Halo',Ivory Halo Dogwood,,#5 pot,
6,*,2,FO MA MA,Fothergilla major ‘Mt Airy’,Mt. Airy Fothergilla,,#5 pot,
7,*,1,HA VA,Hamamelis Virginiana,Common Withhazel,,3-4' B&B,
8,,6,HY AR IB,Hydrangea arborescens 'Incrediball Blush',Incrediball® Blush Smooth Hydrangea,,#7 pot,
9,,6,HY PA LI,Hydrangea paniculata 'Limelight',Limelight Hardy Hydrangea,,#7 pot,


In [11]:
plant_groups_wIDX

{'Deciduous Trees': 0,
 'Conifers': 1,
 'Shrubs': 4,
 'Perennials': 21,
 'FERNS': 36,
 'Ornamental Grasses': 38}

In [12]:
plant_groups_wID

{'Deciduous Trees': [],
 'Conifers': ['CH OB G', 'TH PL GG'],
 'Shrubs': ['CO AL IH',
  'FO MA MA',
  'HA VA',
  'HY AR IB',
  'HY PA LI',
  'HY QU RS',
  'IL GL C',
  'IL GL C',
  'PI MF',
  'PI MF',
  'PI BB',
  'RH PW',
  'RH P',
  'RH MA RO',
  'VI PL S',
  'VI TR W'],
 'Perennials': ['Act B',
  'Dia M',
  'Ast VD',
  'Bru JF',
  'Dic e',
  'Ger R',
  'Hel B',
  'Hos G',
  'Hos BA',
  'Iri I',
  'Nep PB',
  'Pae IB',
  'Sed D',
  'Tia RT'],
 'FERNS': ['Dry B'],
 'Ornamental Grasses': ['Cal KF', 'Pen LB-1']}

In [13]:
# ensuring all the plant images are jpeg

image_dir = '../inputs/plant_images'
# Loop over the flower images
for filename in os.listdir(image_dir):
    # Check for .jfif file
    if filename.endswith('.jfif'):
        # Open the image
        img = Image.open(os.path.join(image_dir, filename))
        # Save the image as a .jpg file
        base_filename = os.path.splitext(filename)[0]  # Get filename without extension
        img.save(os.path.join(image_dir, base_filename + '.jpg'), 'JPEG') # convert to jpeg
        os.remove(os.path.join(image_dir, filename)) # delete .jfif after


In [14]:
# grab the file path of each image based on the lists in plant_groups_wID dict

# combine all lists into one for quick test
test_list_all = []
for i in keys_list:
    for item in plant_groups_wID[i]:
        test_list_all.append(item)

In [15]:
# check if any of of the file paths in the plant_images folder matches each item in test_list_all
has_match = []
matching_path = []
for ID in test_list_all:
    for filename in os.listdir(image_dir):
        if ID in filename:
            has_match.append(ID)
            matching_path.append(filename)


In [16]:
unique_matches = list(set(has_match))
len(unique_matches)

28

In [17]:
len(test_list_all)

35

In [18]:
def find_missing_values(list1, list2):
    missing_values = [value for value in list1 if value not in list2]
    return missing_values

In [19]:
# len(missing_values) != len(test_list_all) - len(unique_matches); this is b/c...
# unique_matches is missing one 'IL GL C' because it was a duplicate that had a file path match
# when list(set(has_match)) happens, it takes away all duplicates
missing_values = find_missing_values(test_list_all, unique_matches)
print(missing_values) 

['HA VA', 'HY PA LI', 'PI MF', 'PI MF', 'Dia M', 'Pen LB-1']


In [20]:
# TODO - ask mom about missing/misnamed ID values
#      - associate each item in test_list_all with a picture
#      - OR associate each using 4 different lists to keep goups seperate
#      - create docx file with each picutre/caption
#      - streamlit time for bruhx

In [21]:
has_match

['CH OB G',
 'CH OB G',
 'CH OB G',
 'TH PL GG',
 'TH PL GG',
 'CO AL IH',
 'FO MA MA',
 'HY AR IB',
 'HY AR IB',
 'HY AR IB',
 'HY QU RS',
 'HY QU RS',
 'IL GL C',
 'IL GL C',
 'IL GL C',
 'IL GL C',
 'PI BB',
 'RH PW',
 'RH P',
 'RH P',
 'RH P',
 'RH P',
 'RH P',
 'RH P',
 'RH P',
 'RH P',
 'RH MA RO',
 'RH MA RO',
 'VI PL S',
 'VI PL S',
 'VI PL S',
 'VI TR W',
 'VI TR W',
 'Act B',
 'Act B',
 'Ast VD',
 'Bru JF',
 'Dic e',
 'Ger R',
 'Hel B',
 'Hos G',
 'Hos G',
 'Hos G',
 'Hos G',
 'Hos BA',
 'Iri I',
 'Nep PB',
 'Pae IB',
 'Sed D',
 'Sed D',
 'Tia RT',
 'Tia RT',
 'Dry B',
 'Dry B',
 'Cal KF',
 'Cal KF']

In [22]:
unique_matches

['TH PL GG',
 'CO AL IH',
 'IL GL C',
 'Pae IB',
 'FO MA MA',
 'Bru JF',
 'VI TR W',
 'Ger R',
 'Hos G',
 'RH P',
 'Ast VD',
 'Hos BA',
 'HY QU RS',
 'RH PW',
 'Sed D',
 'PI BB',
 'Dry B',
 'RH MA RO',
 'VI PL S',
 'Act B',
 'Nep PB',
 'Dic e',
 'HY AR IB',
 'Cal KF',
 'Iri I',
 'CH OB G',
 'Tia RT',
 'Hel B']

In [23]:
test_list_all.sort()
test_list_all

['Act B',
 'Ast VD',
 'Bru JF',
 'CH OB G',
 'CO AL IH',
 'Cal KF',
 'Dia M',
 'Dic e',
 'Dry B',
 'FO MA MA',
 'Ger R',
 'HA VA',
 'HY AR IB',
 'HY PA LI',
 'HY QU RS',
 'Hel B',
 'Hos BA',
 'Hos G',
 'IL GL C',
 'IL GL C',
 'Iri I',
 'Nep PB',
 'PI BB',
 'PI MF',
 'PI MF',
 'Pae IB',
 'Pen LB-1',
 'RH MA RO',
 'RH P',
 'RH PW',
 'Sed D',
 'TH PL GG',
 'Tia RT',
 'VI PL S',
 'VI TR W']

In [25]:
# create a function that takes a list of plant IDs and returns a dictionary of the plant ID and its image path
def get_image_paths(list_of_plant_IDs):
    image_dir = '../inputs/plant_images'
    plant_IDs_wIMG = {}
    for ID in list_of_plant_IDs:
        for filename in os.listdir(image_dir):
            if ID in filename:
                plant_IDs_wIMG[ID] = os.path.join(image_dir, filename)
    return plant_IDs_wIMG

In [27]:
test_all_wPicture = get_image_paths(test_list_all)

In [28]:
len(test_all_wPicture)

28

In [29]:
test_all_wPicture

{'Act B': '../inputs/plant_images\\Act BN.jpg',
 'Ast VD': '../inputs/plant_images\\Ast VD.jpg',
 'Bru JF': '../inputs/plant_images\\Bru JF.jpg',
 'CH OB G': '../inputs/plant_images\\CH OB GF.jpg',
 'CO AL IH': '../inputs/plant_images\\CO AL IH.jpg',
 'Cal KF': '../inputs/plant_images\\Cal KF.jpg',
 'Dic e': '../inputs/plant_images\\Dic e.jpg',
 'Dry B': '../inputs/plant_images\\Dry B.jpg',
 'FO MA MA': '../inputs/plant_images\\FO MA MA.jpg',
 'Ger R': '../inputs/plant_images\\Ger R wall.jpg',
 'HY AR IB': '../inputs/plant_images\\HY AR IB.jpg',
 'HY QU RS': '../inputs/plant_images\\HY QU RS Ruby Slippers.jpeg',
 'Hel B': '../inputs/plant_images\\Hel B.jpg',
 'Hos BA': '../inputs/plant_images\\Hos BA.jpg',
 'Hos G': '../inputs/plant_images\\Hos GT.jpg',
 'IL GL C': '../inputs/plant_images\\IL GL C.png',
 'Iri I': '../inputs/plant_images\\Iri I.jpg',
 'Nep PB': '../inputs/plant_images\\Nep PB.jpg',
 'PI BB': '../inputs/plant_images\\PI BB.jpg',
 'Pae IB': '../inputs/plant_images\\Pae IB

In [44]:
from PIL import Image, ImageDraw, ImageFont

def create_collage(dictionary, collage_width, image_width):
    images = []
    max_height = 0

    for key in dictionary:
        img = Image.open(dictionary[key])
        wpercent = (image_width / float(img.size[0]))
        hsize = int((float(img.size[1]) * float(wpercent)))
        img = img.resize((image_width, hsize), Image.ANTIALIAS)
        
        draw = ImageDraw.Draw(img)
        text = key
        # Use a truetype font for better quality text
        font = ImageFont.truetype("arial.ttf", 15)
        draw.text((0, 0), text, font=font)  

        images.append(img)
        if img.height > max_height:
            max_height = img.height

    total_height = max_height * (len(images) // (collage_width // image_width) + 1)
    collage = Image.new('RGB', (collage_width, total_height))

    x_offset = 0
    y_offset = 0
    for img in images:
        collage.paste(img, (x_offset, y_offset))
        x_offset += img.width
        if x_offset >= collage_width:
            x_offset = 0
            y_offset += max_height

    collage.show()

In [45]:
create_collage(test_all_wPicture, 1000, 400)

  img = img.resize((image_width, hsize), Image.ANTIALIAS)


In [34]:
from docx import Document
from docx.shared import Inches

def create_docx(plant_images):
    # Create a new Word document
    document = Document()

    # Loop through the plant images dictionary
    for plant_id, image_path in plant_images.items():
        # Add a new paragraph to the document
        paragraph = document.add_paragraph()

        # Add the plant ID as a bolded caption
        caption = paragraph.add_run(f"{plant_id}: ")
        caption.bold = True

        # Add the image to the paragraph with the appropriate size
        image = document.add_picture(image_path, width=Inches(6))
        # Add a new line after the image
        paragraph.add_run("\n")

    # Save the document to a file
    document.save("test_output/plant_images.docx")

In [35]:
create_docx(test_all_wPicture)

error: bad char in struct format