# PART 1
### DataFrame

In [191]:
import pandas as pd
import os
from PIL import Image
import numpy as np

In [192]:
# Resolve the image folder relative to the working directory instead of hardcoding a
# user-specific absolute path (and without shadowing the builtin `dir`).
image_dir = os.path.abspath('images')

# List the folder once and derive the full paths from that same listing, so `images`
# (bare file names) and `image_files` (full paths) always have matching length and order.
images = os.listdir(image_dir)
image_files = [os.path.join(image_dir, f) for f in images]

#### Let's take a brief look at the file name structure.

In [193]:
# Show every file name so the naming pattern can be inspected.
list(images)

['stock-photo-adult-and-child-hands-holding-red-heart-heart-health-donation-happy-volunteer-charity-csr-1487364161.jpg',
 'stock-photo-closeup-view-of-woman-with-drawn-flowers-on-green-chalkboard-space-for-text-teacher-s-day-1508523470.jpg',
 'stock-photo-suicide-prevention-and-childhood-cancer-awareness-yellow-ribbon-for-supporting-people-living-and-1482160559.jpg',
 'stock-photo-smart-city-and-abstract-dot-point-connect-with-gradient-line-and-aesthetic-intricate-wave-line-1499306735.jpg',
 'stock-photo-young-joyful-casual-family-of-two-kids-and-couple-sitting-on-sofa-and-watching-funny-video-or-1499295359.jpg',
 'stock-photo-sky-background-on-sunset-nature-abstract-composition-1484678003.jpg',
 'stock-photo-architect-team-having-a-discussion-1380294050.jpg',
 'stock-photo-october-breast-cancer-awareness-month-woman-in-pink-t-shirt-with-hand-holding-pink-ribbon-for-1505793707.jpg',
 'stock-photo-happy-family-mother-father-children-son-and-daughter-on-nature-on-sunset-1438256546.jpg',


#### Let's try extracting the information on an example first.

In [194]:
# Use the first file name as a worked example for the parsing steps below.
ex = images[0]

In [195]:
# Split on at most the first two dashes and keep the two leading tokens.
prefix_tokens = ex.split("-", 2)[:2]
f1 = prefix_tokens[0]
f2 = prefix_tokens[1]
print(f1, f2, sep='\n')

stock
photo


In [196]:
# Re-join the two leading tokens with the dash that separated them.
filename = '-'.join([f1, f2])
print(filename)

stock-photo


In [197]:
# Everything after the second dash: the description tokens followed by "<id>.<extension>".
desc_ex = ex.split('-', 2)[2]
print(desc_ex)

adult-and-child-hands-holding-red-heart-heart-health-donation-happy-volunteer-charity-csr-1487364161.jpg


In [198]:
# Drop the last dash-separated token (the "<id>.<extension>" part) to keep only the words.
desc_ex_list = desc_ex.split("-")[:-1]
print(desc_ex_list)

['adult', 'and', 'child', 'hands', 'holding', 'red', 'heart', 'heart', 'health', 'donation', 'happy', 'volunteer', 'charity', 'csr']


In [199]:
# Turn the list of words back into a single space-separated description.
description_ex = ' '.join(desc_ex_list)
print(description_ex)

adult and child hands holding red heart heart health donation happy volunteer charity csr


In [200]:
# The id is the text after the last dash, up to (not including) the first dot.
image_id_ex = desc_ex.rsplit('-', 1)[-1].partition('.')[0]
print(image_id_ex)

1487364161


In [201]:
# Print the three extracted parts, then all of them combined for a visual sanity check.
print('File Name:', filename)
print('Description:', description_ex)
print('Image id:', image_id_ex)
print(f'Full string: {filename}-{description_ex}-{image_id_ex}')

File Name: stock-photo
Description: adult and child hands holding red heart heart health donation happy volunteer charity csr
Image id: 1487364161
Full string: stock-photo-adult and child hands holding red heart heart health donation happy volunteer charity csr-1487364161


#### Seems like I got what I needed on the example string. Let's iterate over every picture we have.
##### While extracting the filename, description and image id, we don't need to iterate over actual pictures, just their names.
##### When we get to extracting information about actual pictures (like width, height, average color etc.), we need to iterate over actual files.

In [202]:
# Parse every file name into its three components:
#   <first two tokens>-<description tokens>-<id>.<extension>
filenames, descriptions, image_ids = [], [], []

for image in images:
    parts = image.split("-", 2)

    # Prefix: the two leading dash-separated tokens, re-joined with a dash.
    filename = '-'.join((parts[0], parts[1]))
    filenames.append(filename)

    # Description: the remainder without its trailing "<id>.<extension>" token.
    description = ' '.join(parts[2].split('-')[:-1])
    descriptions.append(description)

    # Id: the text after the last dash, up to (not including) the first dot.
    image_id = image.rsplit('-', 1)[-1].partition('.')[0]
    image_ids.append(image_id)

In [203]:
image_folder = 'images'
widths = []
heights = []

for image_path in image_files:
    # Image.open keeps the underlying file open until the image is closed; the context
    # manager releases the handle as soon as the size has been read.
    with Image.open(image_path) as im:
        w, h = im.size

    widths.append(w)
    heights.append(h)

#### Let's create a DataFrame and store the values we have so far.

In [204]:
# Assemble the per-image values gathered so far into one table, one row per image.
df = pd.DataFrame(dict(
    file_name=filenames,
    description=descriptions,
    image_id=image_ids,
    width=widths,
    height=heights,
))

In [205]:
# Preview the first rows of the table built from the file names and image sizes.
df.head()

Unnamed: 0,file_name,description,image_id,width,height
0,stock-photo,adult and child hands holding red heart heart ...,1487364161,1500,1101
1,stock-photo,closeup view of woman with drawn flowers on gr...,1508523470,1500,1101
2,stock-photo,suicide prevention and childhood cancer awaren...,1482160559,1500,1101
3,stock-photo,smart city and abstract dot point connect with...,1499306735,1500,724
4,stock-photo,young joyful casual family of two kids and cou...,1499295359,1500,1101


### Now it gets trickier. We need to work on image files and pixels.

#### I need to get 'average color'. Let's google that.
"The average colour is the sum of all pixels divided by the number of pixels."
The tricky part is that we're operating on RGB values, meaning that each pixel has its own red, green and blue value. So we need to sum each channel separately and divide each sum by the number of pixels.

In [206]:
def get_average_color(image_path):
    """Return the average colour of an image as an ``(r, g, b)`` tuple of ints.

    The image is converted to RGB before averaging, so greyscale, palette and RGBA
    files are handled too (unpacking their raw pixels into three channels would fail).
    Each channel average is the channel sum floor-divided by the number of pixels.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to read.

    Returns
    -------
    tuple of int
        ``(avg_r, avg_g, avg_b)``.

    Raises
    ------
    ZeroDivisionError
        If the image contains no pixels.
    """
    # The context manager closes the file handle once the pixel data has been copied.
    with Image.open(image_path) as im:
        channels = np.asarray(im.convert('RGB')).reshape(-1, 3)

    total_pixels = channels.shape[0]

    # Accumulate in int64 (8-bit sums would overflow) and convert to Python ints so the
    # result is a plain tuple of ints, exactly as before.
    total_r, total_g, total_b = (int(t) for t in channels.sum(axis=0, dtype=np.int64))

    return (total_r // total_pixels, total_g // total_pixels, total_b // total_pixels)

In [207]:
avg_colors = []

# Fill the list with one (r, g, b) average per image, in `image_files` order.
avg_colors.extend(get_average_color(path) for path in image_files)

In [208]:
# Inspect the computed (r, g, b) averages.
print(avg_colors)

[(156, 164, 158), (35, 72, 56), (207, 200, 189), (52, 63, 83), (184, 165, 150), (127, 165, 187), (105, 96, 89), (191, 137, 156), (173, 153, 121), (79, 97, 104), (161, 170, 188), (47, 64, 81), (60, 61, 61), (150, 159, 172), (90, 102, 109), (82, 89, 102), (32, 44, 53), (217, 170, 172), (208, 156, 33), (65, 68, 116)]


#### Looks like it worked.
I've decided to transform the RGB values to a hex. Just because it's a single value - in my opinion easier to google.

In [209]:
def rgb_to_hex(rgb):
    """Format an ``(r, g, b)`` triple of integers as a ``#rrggbb`` hex string.

    Each channel is written as two lowercase hexadecimal digits, zero-padded.
    """
    red, green, blue = rgb
    return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

In [210]:
# Convert every average colour to its hex representation.
hexes = list(map(rgb_to_hex, avg_colors))

In [211]:
# Inspect the hex strings produced from the average colours.
print(hexes)

['#9ca49e', '#234838', '#cfc8bd', '#343f53', '#b8a596', '#7fa5bb', '#696059', '#bf899c', '#ad9979', '#4f6168', '#a1aabc', '#2f4051', '#3c3d3d', '#969fac', '#5a666d', '#525966', '#202c35', '#d9aaac', '#d09c21', '#414474']


### Next 3 items are:
Median brightness after conversion to shades of gray

Horizontal coordinate of the brightest pixel after converting to grayscale

Vertical coordinate of brightest pixel after converting to grayscale

What is brightness of a grey picture?

Grayscale images are represented with only one value per pixel instead of three, as was the case in RGB. The brightness scale is in the range [0, 255]: 0 means black, 255 means white, and everything in between is a shade of gray. So we need to find the median pixel value.

In [212]:
def get_brightness(image_path):
    """Return the median pixel value of an image after conversion to greyscale.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to read.

    Returns
    -------
    numpy.float64
        Median of all greyscale pixel values.
    """
    # The context manager closes the file handle once the pixel data has been copied.
    with Image.open(image_path) as im:
        pixels = np.asarray(im.convert('L'))

    # np.median works on unsorted, multi-dimensional input, so no explicit
    # flatten-and-sort pass is needed beforehand.
    median_brightness = np.median(pixels)

    return median_brightness

In [213]:
# One median greyscale brightness per image, in `image_files` order.
greyscale_brightness = [get_brightness(path) for path in image_files]

In [214]:
# Attach the colour and brightness values as new columns; the lists are aligned with the
# table rows by position.
df['avg_color'] = hexes
df['median_greyscale_brightness'] = greyscale_brightness

In [215]:
# Preview the table with the two new columns.
df.head()

Unnamed: 0,file_name,description,image_id,width,height,avg_color,median_greyscale_brightness
0,stock-photo,adult and child hands holding red heart heart ...,1487364161,1500,1101,#9ca49e,194.0
1,stock-photo,closeup view of woman with drawn flowers on gr...,1508523470,1500,1101,#234838,52.0
2,stock-photo,suicide prevention and childhood cancer awaren...,1482160559,1500,1101,#cfc8bd,236.0
3,stock-photo,smart city and abstract dot point connect with...,1499306735,1500,724,#343f53,48.0
4,stock-photo,young joyful casual family of two kids and cou...,1499295359,1500,1101,#b8a596,186.0


#### The last item is to find the coordinates of the brightest pixel.
We will just iterate over width and height of the image to get the brightest pixel.

In [216]:
def get_brightest_pixel(image_path):
    """Return the ``(x, y)`` coordinates of the brightest pixel after greyscale conversion.

    When several pixels share the maximum value, the one with the smallest ``x`` is
    returned, and among those the one with the smallest ``y`` (the same result as
    scanning column by column, top to bottom, and keeping the first maximum).

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to read.

    Returns
    -------
    tuple of int or None
        ``(x, y)`` of the brightest pixel, or ``None`` if the image has no pixels.
    """
    # The context manager closes the file handle once the pixel data has been copied.
    with Image.open(image_path) as im:
        grey = np.asarray(im.convert('L'))

    if grey.size == 0:
        return None

    # The pixel array is indexed [y, x]. Transposing makes it [x, y], so argmax (which
    # reports the first maximum in row-major order) scans x in the outer loop and y in
    # the inner loop. This replaces one getpixel call per pixel with a vectorised search.
    by_column = grey.T
    x, y = np.unravel_index(np.argmax(by_column), by_column.shape)

    return (int(x), int(y))

In [217]:
# One (x, y) coordinate pair per image, in `image_files` order.
brightest_pixels = [get_brightest_pixel(path) for path in image_files]

In [218]:
# Unzip the list of (x, y) pairs into one tuple of x values and one tuple of y values.
horizontal_brightest_pixel, vertical_brightest_pixel = zip(*brightest_pixels)

In [219]:
# Attach the brightest-pixel coordinates as two new columns, aligned by position.
df['horizontal_brightest_px'] = horizontal_brightest_pixel
df['vertical_brightest_px'] = vertical_brightest_pixel

In [220]:
# Preview the finished table with all requested columns.
df.head()

Unnamed: 0,file_name,description,image_id,width,height,avg_color,median_greyscale_brightness,horizontal_brightest_px,vertical_brightest_px
0,stock-photo,adult and child hands holding red heart heart ...,1487364161,1500,1101,#9ca49e,194.0,54,1068
1,stock-photo,closeup view of woman with drawn flowers on gr...,1508523470,1500,1101,#234838,52.0,54,1068
2,stock-photo,suicide prevention and childhood cancer awaren...,1482160559,1500,1101,#cfc8bd,236.0,0,4
3,stock-photo,smart city and abstract dot point connect with...,1499306735,1500,724,#343f53,48.0,0,593
4,stock-photo,young joyful casual family of two kids and cou...,1499295359,1500,1101,#b8a596,186.0,14,170


# PART 2
### Aggregation of images

The second task is to sort the photos into subfolders. You need to sort the images by median brightness, divide them into bins of count 4, and save the images into subfolders with the naming scheme - [bin number]-images, numbering the bins from 1.

In [221]:
import shutil

#### Sorting the images by median brightness

In [222]:
# Order the rows from darkest to brightest median and renumber them from 0.
df_sorted = (
    df
    .sort_values(by='median_greyscale_brightness')
    .reset_index(drop=True)
)

In [223]:
# Display the table sorted by median greyscale brightness.
df_sorted

Unnamed: 0,file_name,description,image_id,width,height,avg_color,median_greyscale_brightness,horizontal_brightest_px,vertical_brightest_px
0,stock-photo,top view of beautiful young woman sleeping coz...,1427337869,1500,945,#202c35,35.0,54,912
1,stock-photo,nice attractive stylish cheerful cheery positi...,1436812790,1500,1101,#414474,46.0,54,1068
2,stock-photo,view of moon limb with earth rising on the hor...,1454730908,1500,1101,#3c3d3d,46.0,54,1068
3,stock-photo,zombie rising out of a graveyard cemetery in s...,1492357433,1500,1101,#2f4051,46.0,54,1068
4,stock-photo,smart city and abstract dot point connect with...,1499306735,1500,724,#343f53,48.0,0,593
5,stock-photo,closeup view of woman with drawn flowers on gr...,1508523470,1500,1101,#234838,52.0,54,1068
6,stock-photo,hurricane dorian in the carribean sea on its w...,1492317566,1500,1101,#525966,61.0,0,778
7,stock-photo,nature shots for the pros,1506854450,1500,1101,#5a666d,77.0,54,1068
8,stock-photo,architect team having a discussion,1380294050,1500,945,#696059,77.0,54,912
9,stock-photo,skeleton zombie hands rising out of a cemetery...,1495866737,1500,790,#4f6168,86.0,54,757


#### Creating 5 directories, one for each bin

In [224]:
# `bin_number` instead of `bin`, so the builtin bin() is not shadowed for the rest of
# the notebook.
for bin_number in range(1, 6):
    path = f'{bin_number}-images'
    # exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError when the
    # folder is already there from a previous run.
    os.makedirs(path, exist_ok=True)
    print(bin_number, 'created')

1 created
2 created
3 created
4 created
5 created


#### Now we need to think about the logic to move the correct files to each dir.
I'm thinking that accessing the image_id, and then comparing it to the image_path string is gonna work.

We need 4 images per bin. Let's try that without moving the files first.

In [225]:
# Label-based slicing with .loc includes the end label, so this selects rows 0, 1, 2 and 3.
df_sorted['image_id'].loc[:3]

0    1427337869
1    1436812790
2    1454730908
3    1492357433
Name: image_id, dtype: object

In [226]:
# Dry run: print the path of every file whose path contains one of the first four ids.
first_bin_ids = df_sorted['image_id'].loc[:3]

for image_path in image_files:
    for wanted_id in first_bin_ids:
        if wanted_id in image_path:
            print(image_path)

/Users/erykw/kaggle/amazon/images/stock-photo-zombie-rising-out-of-a-graveyard-cemetery-in-spooky-dark-night-full-moon-holiday-event-halloween-1492357433.jpg
/Users/erykw/kaggle/amazon/images/stock-photo-view-of-moon-limb-with-earth-rising-on-the-horizon-footprints-as-an-evidence-of-people-being-there-1454730908.jpg
/Users/erykw/kaggle/amazon/images/stock-photo-top-view-of-beautiful-young-woman-sleeping-cozily-on-a-bed-in-his-bedroom-at-night-blue-nightly-1427337869.jpg
/Users/erykw/kaggle/amazon/images/stock-photo-nice-attractive-stylish-cheerful-cheery-positive-carefree-guys-ladies-having-fun-hanging-out-best-1436812790.jpg


#### Okay, this will do.
Now, let's build the full paths of the destination directories.

In [227]:
# Print the destination folder for each of the five bins (bins are numbered from 1).
for i in range(5):
    print(f'{os.getcwd()}/{i+1}-images')

/Users/erykw/kaggle/amazon/1-images
/Users/erykw/kaggle/amazon/2-images
/Users/erykw/kaggle/amazon/3-images
/Users/erykw/kaggle/amazon/4-images
/Users/erykw/kaggle/amazon/5-images


#### Last step is to come up with how to move the files. Let's try using the .loc method of the dataframe.

In [228]:
BIN_SIZE = 4

# Map each image id to its file path once, parsing the id from the file name the same way
# the `image_id` column was built. An exact lookup avoids rescanning every file for every
# bin and cannot match an id that merely occurs somewhere else in a path.
path_by_id = {
    os.path.basename(image_path).split('-')[-1].split('.')[0]: image_path
    for image_path in image_files
}

sorted_ids = df_sorted['image_id'].tolist()

# Walk through the brightness-sorted ids in chunks of BIN_SIZE; the number of bins
# follows from the number of images instead of being hardcoded.
for start_index in range(0, len(sorted_ids), BIN_SIZE):
    bin_dir = f'{os.getcwd()}/{start_index // BIN_SIZE + 1}-images'
    # Make sure the target is a directory: copying to a missing path would create a
    # regular file with that name instead.
    os.makedirs(bin_dir, exist_ok=True)

    for wanted_id in sorted_ids[start_index:start_index + BIN_SIZE]:
        shutil.copy2(path_by_id[wanted_id], bin_dir)

#### Looks like it worked! I compared the files visually with the df_sorted and they're in correct bins.

#### !!! I could have just manually written 5 separate statements using `.loc[0:3]`, `.loc[4:7]`, `.loc[8:11]`, `.loc[12:15]` and `.loc[16:19]` (`.loc` slices include both endpoints), but if there were more images to move, then it would've been a really tedious job.

In [229]:
%%sh
# Print the working directory as a tree to check the contents of each bin folder.
tree

[01;34m.[0m
├── [01;34m1-images[0m
│   ├── [01;35mstock-photo-nice-attractive-stylish-cheerful-cheery-positive-carefree-guys-ladies-having-fun-hanging-out-best-1436812790.jpg[0m
│   ├── [01;35mstock-photo-top-view-of-beautiful-young-woman-sleeping-cozily-on-a-bed-in-his-bedroom-at-night-blue-nightly-1427337869.jpg[0m
│   ├── [01;35mstock-photo-view-of-moon-limb-with-earth-rising-on-the-horizon-footprints-as-an-evidence-of-people-being-there-1454730908.jpg[0m
│   └── [01;35mstock-photo-zombie-rising-out-of-a-graveyard-cemetery-in-spooky-dark-night-full-moon-holiday-event-halloween-1492357433.jpg[0m
├── [01;34m2-images[0m
│   ├── [01;35mstock-photo-closeup-view-of-woman-with-drawn-flowers-on-green-chalkboard-space-for-text-teacher-s-day-1508523470.jpg[0m
│   ├── [01;35mstock-photo-hurricane-dorian-in-the-carribean-sea-on-its-way-to-us-mainland-in-august-elements-of-this-1492317566.jpg[0m
│   ├── [01;35mstock-photo-nature-shots-for-the-pros-1506854450.jpg[0m
│   └── [