# Project 

FAIN Thony  
LONCHAMBON Alexis  

## Install

### Install Commands

In [77]:
# !pip install tqdm
# !pip install time

# %pip install --force-reinstall -v "ipywidgets == 7.7.2"
# %pip install --force-reinstall -v "jupyterlab_widgets == 1.1.1"

### Imports

In [100]:
# ALL IMPORTS FOR CODE 

import os
import sys
import numpy as np
import pandas as pd
import time
import json
import math
import requests
import shutil
import PIL.Image
from types import SimpleNamespace
from PIL.ExifTags import TAGS
from IPython.display import display
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
from pandas import json_normalize
from IPython.display import Image, HTML
from SPARQLWrapper import SPARQLWrapper, JSON
import matplotlib.pyplot as plot
from sklearn.cluster import KMeans, MiniBatchKMeans

### Settings

In [115]:
## CLUSTERING

# Numbers of color clusters for classification
NUM_CLUSTERS = 3


## DATA

#Database names
DB_NAME = "db.json"
IMG_DB_NAME = "db_images.json"

#Image paths
IMG_FOLDER = "img"
PLT_FOLDER = "plt"


### Global Methods

In [124]:
# ALL GLOBAL FUNCITONS

def path_to_image_html(path):
    '''Transforms an url to an image balise for displaying'''
    return '<img width="500" src="'+ path + '"/>'

def format_exif(data):
    '''Formats exifs to HTML display'''
    out = ""
    for tag, value in data.items():
        if tag in TAGS:
            out+=f"{TAGS[tag]}: {value}<br>"
    return out

def format_exif_json(data):
    '''Formats exifs to a dict for JSON parsing'''

    if(not data) :
        return None
    
    #This creates a object flexible enough to add attributes dynamically
    out = SimpleNamespace()

    for tag, value in data.items():
        if tag in TAGS:
            tagS = TAGS[tag]

            #Some tags are ignored because they contain lots of useless bytes values that are a pain to format and cannot be used in our project anyway
            if tagS in ["MakerNote", "UserComment", "InterColorProfile"]:
                continue

            #Some string values contain empty
            if isinstance(value, str):
                value = value.rstrip('\x00').rstrip('\u0000')

            # add attribute to our object
            setattr(out,tagS, value)
    #We need to return it as a dict for JSON parsing
    return out.__dict__

def get_colors(path):
    '''Returns a plot with the colors'''
    if not os.path.exists(PLT_FOLDER):
        # Create a new directory because it does not exist
        os.makedirs(PLT_FOLDER)
    if not os.path.exists(f"{PLT_FOLDER}/{IMG_FOLDER}"):
        # Create a new directory because it does not exist
        os.makedirs(f"{PLT_FOLDER}/{IMG_FOLDER}")

    #Open image
    imgfile = PIL.Image.open(path).convert('RGBA')

    #We want a certain number of dominant colors
    numClusters = NUM_CLUSTERS

    try:
        plot.clf()

        # Resize to speed up image handling
        imgfile = imgfile.resize((512,512), PIL.Image.Resampling.LANCZOS)

        # Convert to 2D array
        imgfile = np.array(imgfile)
        w, h, d = tuple(imgfile.shape)
        image_array = np.reshape(imgfile, (w * h, d))

        # numarray = np.array(imgfile.getdata(), np.uint8)

        #Clustering with MiniBatchKmeans
        clusters = MiniBatchKMeans(n_clusters=numClusters, random_state=0, n_init=2, n_jobs=4)
        clusters.fit(image_array)
        npbins = np.arange(0, numClusters+1)
        histogram = np.histogram(clusters.labels_, bins=npbins)
        labels = np.unique(clusters.labels_)
        barlist = plot.bar(labels, histogram[0])
        for i in range(numClusters):
            barlist[i].set_color(
                "#%02x%02x%02x"
                % (
                    math.ceil(clusters.cluster_centers_[i][0]),
                    math.ceil(clusters.cluster_centers_[i][1]),
                    math.ceil(clusters.cluster_centers_[i][2]),
                )
            )
        plot.savefig(f"{PLT_FOLDER}/{path}")
        return clusters
    except Exception as inst:
        # print(f"RIP for {path} : {inst}") 
        return None
        
            
def download_image(url):
    '''Downloads the image from an url to the img path'''


    filepath = os.path.join(IMG_FOLDER, os.path.basename(url))

    #creates the directory to avoid a crash (I love python...)
    if not os.path.exists(IMG_FOLDER):
        # Create a new directory because it does not exist
        os.makedirs(IMG_FOLDER)
        # print("The new directory is created!")
    headers = {"User-Agent": "Mozilla/5.0"}

    #Ignore the download if the file exists
    if os.path.isfile(filepath) :
        return filepath

    #Download code
    request = requests.get(url, allow_redirects=True, headers=headers, stream=True)
    if request.status_code == 200:
        with open(filepath, "wb") as image:
            request.raw.decode_content = True
            shutil.copyfileobj(request.raw, image)
    return filepath

## Dataset Initialisation

### Getting the images and setting up the database

In [118]:

endpoint_url = "https://query.wikidata.org/sparql"
imgmax = 1000

# Get cities
query = """SELECT DISTINCT ?planeLabel ?entry ?image {
  ?plane wdt:P31 wd:Q15056993;
               wdt:P729 ?entry;
               wdt:P729 ?retirement;
               wdt:P18 ?image.
      
  SERVICE wikibase:label { bd:serviceParam wikibase:language "fr". }
} LIMIT 1000"""

#get the results from the query from wikidata
def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (
        sys.version_info[0],
        sys.version_info[1],
    )
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


#array for dataframe
array = []

#array for JSON formatting
db = []
results = get_results(endpoint_url, query)
res = results["results"]["bindings"]
i = 0

#Parsing all results 
for result in tqdm(res):
    i+=1

    #Weird formats are ignored.
    filename, file_extension = os.path.splitext(os.path.basename(result["image"]["value"]))
    if file_extension not in [".png", ".jpg"] :
        continue

    #Download and get image exif data
    path = download_image(result["image"]["value"])
    img = PIL.Image.open(path)
    exif_data = img._getexif()
    
    #Parse data for JSON DB
    db.append(
        {
            "name" : result["planeLabel"]["value"],
            "img" : path,
            "width" : img.width,
            "height" : img.height,
            "orientation" : ("Paysage" if img.width > img.height else "Portrait"),
            "tags" : format_exif_json(exif_data)
        }
    )

    #Parse data for dataframe display
    array.append(
        (
            result["planeLabel"]["value"],
            result["entry"]["value"],
            path,
            img.width,
            img.height,
            ("Paysage" if img.width > img.height else "Portrait"),
            exif_data
        )
    )


dataframe = pd.DataFrame(array, columns=["planeLabel", "entry", "image", "width", "height", "orientation", "data"])
dataframe = dataframe.astype(
    dtype={"planeLabel": "<U200", "entry" : "<U200", "image": "<U200", "width": "int64", "height": "int64", "orientation" : "<U200"}
)
# srt = dataframe.sort_values("data")


# Serializing json
json_object = json.dumps(db, indent=4, default=lambda o: f"{o}")
 
# Writing to db.json
with open(DB_NAME, "w") as outfile:
    outfile.write(json_object)

#HTML Display
pd.set_option('display.max_colwidth', 100)

# HTML(srt.to_html(escape=False ,formatters=dict(image=path_to_image_html)))

100%|██████████| 552/552 [00:00<00:00, 3819.79it/s]


### Filtering data

In [119]:
filter1 = dataframe["data"] != "None"
filtered = dataframe.where(filter1).dropna()
# HTML(filtered.to_html(escape=False ,formatters=dict(image=path_to_image_html)))
# filtered
mapped = filtered
# bars = get_colors(mapped["image"])
# mapped["bars"] = mapped['image'].apply(lambda x: get_colors(x))
# mapped["data"] = mapped['data'].apply(lambda x: format_exif(x))
# mapped
# HTML(mapped.to_html(escape=False ,formatters=dict(image=path_to_image_html)))
mapped


Unnamed: 0,planeLabel,entry,image,width,height,orientation,data
0,G.50 Freccia,1938-01-01T00:00:00Z,img/Fiat%20G50.jpg,600,336,Paysage,"{296: 3, 34665: 156, 305: 'Adobe Photoshop 7.0', 274: 1, 306: '2004:04:22 10:16:00', 282: 27.0, ..."
5,RC-135,1965-09-01T00:00:00Z,img/Boeing%20RC-135V%20Rivet%20Joint%2064-14842.jpg,1360,800,Paysage,"{296: 2, 34665: 1080, 270: 'The hog-nosed RC-135 reconnaissance aircraft, Rivet Joint, with its ..."
6,McDonnell Douglas F-15 Eagle,1976-01-09T00:00:00Z,img/McDonnell%20Douglas%20F-15C%20%28SN%2079-015%29%20in%20flight%20060905-F-1234S-018.jpg,1431,1800,Portrait,"{296: 2, 34665: 164, 305: 'Adobe Photoshop CS2 Windows', 274: 1, 306: '2007:05:16 10:53:06', 282..."
8,Piaggio P.180 Avanti,1990-01-29T00:00:00Z,img/Piaggio%20P-180%20Avanti%20Rennes%202010.jpg,2224,1483,Paysage,"{296: 2, 34665: 156, 271: 'NIKON CORPORATION', 272: 'NIKON D200', 305: 'GIMP 2.6.11', 282: 72.0,..."
9,A350 XWB,2015-01-15T00:00:00Z,img/Virgin%20Atlantic%20Airbus%20A350-1041XWB%20G-VLUX%20%28Red%20Velvet%29%20approaching%20JFK%...,2800,2100,Paysage,"{34853: {0: b'\x02\x03\x00\x00'}, 296: 2, 34665: 2414, 271: 'NIKON CORPORATION', 272: 'NIKON D35..."
...,...,...,...,...,...,...,...
481,Lavochkin La-9,1947-01-01T00:00:00Z,img/LavochkinLa-9.jpg,1560,1152,Paysage,"{296: 2, 34665: 186, 271: 'Canon', 272: 'Canon PowerShot A610', 274: 1, 306: '2006:04:15 15:36:5..."
484,E-Jet,2004-03-17T00:00:00Z,img/EI-RDB%20Embraer%20175%20Alitalia%20BCN.jpg,4722,3148,Paysage,"{256: 4722, 257: 3148, 258: (8, 8, 8), 262: 2, 296: 2, 34665: 296, 271: 'Canon', 272: 'Canon EOS..."
485,Winjeel,1955-01-01T00:00:00Z,img/CAC%20Winjeel.jpg,2048,1365,Paysage,"{296: 3, 34665: 244, 271: 'Canon', 272: 'Canon EOS 350D DIGITAL', 305: 'Paint Shop Pro Photo 11...."
486,Bell H-13 Sioux,1946-01-01T00:00:00Z,img/Bell%2047-OH-13%20inflight%20bw.jpg,1419,982,Paysage,"{34665: 2142, 274: 1, 306: '2014:08:24 15:42:14', 59932: b'\x1c\xea\x00\x00\x00\x08\x00\x00\x00\..."


## Dominant Color Annotation

### Load the parsed data

In [120]:
# Opening JSON file
f = open(DB_NAME)
  
# returns JSON object as 
# a dictionary
data = json.load(f)

df = pd.DataFrame(data)

df

Unnamed: 0,name,img,width,height,orientation,tags
0,G.50 Freccia,img/Fiat%20G50.jpg,600,336,Paysage,"{'ResolutionUnit': 3, 'ExifOffset': 156, 'Software': 'Adobe Photoshop 7.0', 'Orientation': 1, 'D..."
1,MiG-29,img/Russian%20Air%20Force%20Mikoyan-Gurevich%20MiG-29S%20Naumenko-1.jpg,1250,833,Paysage,
2,Beriev Be-200 Altair,img/MChS%20Beriev%20Be-200%20waterbomber.jpg,1024,673,Paysage,
3,Sud Aviation Caravelle,img/HB-ICZ%20Swissair%20Caravelle.jpg,3300,2200,Paysage,
4,Airbus A330 MRTT,img/Royal%20Air%20Force%20Airbus%20A330MRTT%20Bidini.jpg,1200,809,Paysage,
...,...,...,...,...,...,...
495,Short C-23 Sherpa,img/Short%20C-23A%20Sherpa%20%28330-200%29%2C%20USA%20-%20Air%20Force%20AN1538079.jpg,1024,678,Paysage,
496,Tomtit,img/Hawker%20Tomtit%20%E2%80%98K1786%E2%80%99%20%28G-AFTA%29%20%2844441841084%29.jpg,3773,2515,Paysage,"{'ImageWidth': 6016, 'ImageLength': 4000, 'BitsPerSample': [8, 8, 8], 'GPSInfo': {'0': 'b'\x02\x..."
497,Vildebeest,img/Vickers%20Vildebeest%20in%20flight.jpg,352,317,Paysage,
498,Savoia-Marchetti S.55,img/Aeroflot%20Savoia-Marchetti%20S.55P.jpg,1072,396,Paysage,


### Dominant Color annotation 

In [125]:
for entry in tqdm(data):
    path = entry["img"]
    clusters = get_colors(path)
    if clusters:
        i = 0
        colorlist = []

        for color in clusters.cluster_centers_:
            c = {}
            c["R"] = int(color[0])
            c["G"] = int(color[1])
            c["B"] = int(color[2])

            colorlist.append(c)
        entry["colors"] = colorlist

    


    # Serializing json
    json2 = json.dumps(data, indent=4, default=lambda o: f"{o}")
    
    # Writing to db_colors.json
    with open(IMG_DB_NAME, "w") as outfile:
        outfile.write(json2)

    #HTML Display settings
    pd.set_option('display.max_colwidth', 100)


 77%|███████▋  | 386/500 [00:26<00:07, 14.34it/s]


OSError: image file is truncated (12 bytes not processed)

<Figure size 640x480 with 0 Axes>

### Open Color-Tagged database

In [93]:
# Opening JSON file
f = open(IMG_DB_NAME)
  
# returns JSON object as 
# a dictionary
data = json.load(f)

df = pd.DataFrame(data)

df

Unnamed: 0,name,img,width,height,orientation,tags,colors
0,Mirage F1,img/Ecuadorian%20Air%20Force%20Dassault%20Mirage%20F1E.jpg,2800,1810,Paysage,,"[{'R': 212.6725781275991, 'G': 218.40189906054343, 'B': 227.7863004187112}, {'R': 46.40000066578..."
1,ATR 72,img/ATR%20ATR-72-202%2C%20LOT%20-%20Polish%20Airlines%20-%20Polskie%20Linie%20Lotnicze%20%28Euro...,1200,800,Paysage,,"[{'R': 130.87064254976002, 'G': 138.88626090010334, 'B': 143.6222122500597}, {'R': 73.4833412694..."
2,Boeing Vertol CH-47 Chinook,img/Boeing%20Vertol%20CH-47%20Chinook%203-view%20line%20drawing.png,574,385,Paysage,,
3,Il-2 Sturmovik,img/Il2%20sturmovik.jpg,650,234,Paysage,,
4,Mil Mi-1,img/Mi-1m%20museum.jpg,640,468,Paysage,,"[{'R': 154.30666708332754, 'G': 89.11665442727366, 'B': 69.90636083811303}, {'R': 211.5842953131..."
...,...,...,...,...,...,...,...
495,Short C-23 Sherpa,img/Short%20C-23A%20Sherpa%20%28330-200%29%2C%20USA%20-%20Air%20Force%20AN1538079.jpg,1024,678,Paysage,,
496,Tomtit,img/Hawker%20Tomtit%20%E2%80%98K1786%E2%80%99%20%28G-AFTA%29%20%2844441841084%29.jpg,3773,2515,Paysage,"{'ImageWidth': 6016, 'ImageLength': 4000, 'BitsPerSample': [8, 8, 8], 'GPSInfo': {'0': 'b'\x02\x...",
497,Vildebeest,img/Vickers%20Vildebeest%20in%20flight.jpg,352,317,Paysage,,
498,Savoia-Marchetti S.55,img/Aeroflot%20Savoia-Marchetti%20S.55P.jpg,1072,396,Paysage,,
