In [1]:
# Data Collection

!pip install kaggle --upgrade
!pip3 install Pillow

import kaggle
import os
import json
import random
from datetime import datetime
from PIL import Image
from PIL.ExifTags import TAGS

def get_colors(image_file, numcolors=3, resize=150, output=None):
    """Return the dominant colours of an image, most frequent first.

    The image is shrunk to a thumbnail, reduced to an adaptive palette of at
    most ``numcolors`` entries, and the palette entries are ranked by the
    number of pixels that use them.

    Args:
        image_file: Path (or file object) accepted by ``PIL.Image.open``.
        numcolors: Maximum number of dominant colours to return.
        resize: Upper bound, in pixels, for both sides of the thumbnail that
            is analysed (smaller is faster).
        output: Unused. Kept only so that callers passing it keep working.

    Returns:
        A list of colour tuples taken from the palette (three components
        each), ordered from most to least frequent. The list is shorter than
        ``numcolors`` when the image contains fewer distinct colours.
    """
    # Copy the pixel data and release the file handle immediately.
    with Image.open(image_file) as source:
        img = source.copy()

    # The adaptive palette conversion is documented for RGB input; modes such
    # as RGBA, CMYK or P are normalised first so they do not fail.
    if img.mode not in ("RGB", "L"):
        img = img.convert("RGB")

    # Resize image to speed up processing
    img.thumbnail((resize, resize))

    # Reduce to palette
    paletted = img.convert('P', palette=Image.ADAPTIVE, colors=numcolors)

    # Find dominant colors: getcolors() yields (pixel_count, palette_index).
    palette = paletted.getpalette()
    color_counts = sorted(paletted.getcolors(), reverse=True)

    colors = []
    # Slice instead of indexing range(numcolors): an image with fewer distinct
    # colours than requested would otherwise raise IndexError.
    for _pixel_count, palette_index in color_counts[:numcolors]:
        start = palette_index * 3
        colors.append(tuple(palette[start:start + 3]))
    return colors



In [None]:
## You have to collect and download a set of images. You have the following tasks to program, automating the process as much as possible:

### 1. Create a folder called images.

# Pure-Python equivalent of `mkdir -p`: portable across operating systems and
# a no-op when the folders already exist.
os.makedirs('./images', exist_ok=True)
os.makedirs('./metadata', exist_ok=True)

### 2. Download open-licensed images to the folder images (minimum 100 images).

# Authenticate with the Kaggle API, then download and unzip the dataset into ./images.
kaggle.api.authenticate()
kaggle.api.dataset_download_files('nielspace/pexels-mountain-images', path='./images', unzip=True)
    
### 3. Save metadata of every image like image size, image format (.jpeg, .png, etc.), image orientation (landscape, portrait, square, etc.), creation date, camera model, etc. in one or more JSON files. You can make use of the Exif information present in the image files.

directory = './images/Mountain'
metadata = {}
orientation = ""

# Sorted so the order of the entries in the JSON file is deterministic; later
# cells split the entries into subsets by position.
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)
    if not os.path.isfile(f):
        continue

    _, file_extension = os.path.splitext(f)
    # NOTE: this is the file's last-modification time on disk, stored under
    # the "creation_date" key.
    time_creation = os.path.getmtime(f)

    # Only the dimensions are needed here; close the file right away.
    with Image.open(f) as img:
        size = img.size

    width, height = size
    if width > height:
        orientation = "landscape"
    elif width == height:
        orientation = "square"
    else:
        orientation = "portrait"

    metadata[filename] = {
        "file_extension": file_extension,
        # getmtime() returns seconds since the epoch, so it is passed to
        # fromtimestamp() as-is (dividing by 1000 put every date in Jan 1970).
        "creation_date": datetime.fromtimestamp(time_creation).strftime("%m/%d/%Y, %H:%M:%S"),
        "size": size,
        "orientation": orientation,
        "colors": get_colors(f)
    }

with open("./metadata/metadata.json", "w") as outfile:
    json.dump(metadata, outfile, indent=4)

        

In [None]:
# Labeling and Annotation

directory = './images/Mountain'
data_path = "./metadata/metadata.json"

# Load the metadata written by the data-collection step. open() raises
# FileNotFoundError when the file is missing, instead of leaving `json_data`
# undefined (or stale from an earlier run) for the loop below.
with open(data_path) as target:
    json_data = json.load(target)

# Attach a random "like" score (0-100) and a fixed hashtag to every image.
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if not os.path.isfile(f):
        continue
    if filename not in json_data:
        # File has no metadata entry: nothing to annotate.
        continue
    json_data[filename]["tags"] = {
        "like": random.randint(0, 100),
        "hashtag": "mountain",
    }

# Write the annotated metadata back to the same file it was read from.
with open(data_path, "w") as outfile:
    json.dump(json_data, outfile, indent=4)

In [None]:
from sklearn import tree
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier


# Data Analyses

numb_users = 1
directory = './images/Mountain'
data_path = "./metadata/metadata.json"
rand = 0

# Metadata for each file. open() raises FileNotFoundError when the file is
# missing, instead of leaving `json_data` undefined for the code below.
with open(data_path) as target:
    json_data = json.load(target)

# Creating data for each user: three alternative "liked images" lists plus the
# tags the user is interested in.
json_data_users = {}
for i in range(numb_users):
    images_per_users_rand = []
    images_per_users_orientation = []
    images_per_users_colors = []
    tags_per_users = []

    # Pick one of four tag combinations; every branch appends a list of
    # individual tag names so the stored shape is the same for all users.
    rand = random.randint(1, 4)
    if rand == 1:
        tags_per_users.append(["like", "colors"])
    elif rand == 2:
        tags_per_users.append(["colors"])
    elif rand == 3:
        tags_per_users.append(["hashtag", "colors"])
    else:
        tags_per_users.append(["like", "hashtag", "colors"])

    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if not os.path.isfile(f):
            continue
        if filename not in json_data:
            # No metadata for this file: it cannot be classified.
            continue
        image_info = json_data[filename]

        ## First solution to get images a user liked: one chance in five
        rand = random.randint(1, 5)
        if rand == 1:
            images_per_users_rand.append(filename)
        ## Second solution to get images a user liked: landscape pictures
        if image_info["orientation"] == "landscape":
            images_per_users_orientation.append(filename)
        ## Third solution to get images a user liked: first component of the
        ## most dominant colour is at least 155
        if image_info["colors"][0][0] >= 155:
            images_per_users_colors.append(filename)

    json_data_users[i] = {
        "images_rand": images_per_users_rand,
        "images_orientation": images_per_users_orientation,
        "images_colors": images_per_users_colors,
        "tags": tags_per_users
    }
    

# Creating dataframes to predict what the user might like
data = []
result_rand = []
result_orientation = []
result_colors = []
# Training set: the first third of the metadata entries, in file order.
json_data_learning = dict(list(json_data.items())[:len(json_data)//3])

# Sets give constant-time membership tests instead of scanning the lists once
# per training image.
liked_rand = set(json_data_users[0]["images_rand"])
liked_orientation = set(json_data_users[0]["images_orientation"])
liked_colors = set(json_data_users[0]["images_colors"])

for i in json_data_learning:
    # One label per "liking" strategy for this image.
    result_rand.append('Favorite' if i in liked_rand else 'NotFavorite')
    result_orientation.append('Favorite' if i in liked_orientation else 'NotFavorite')
    result_colors.append('Favorite' if i in liked_colors else 'NotFavorite')

    # Feature vector: most dominant colour, orientation, width, height.
    image_info = json_data_learning[i]
    dominant_color = image_info["colors"][0]
    data.append([dominant_color[0],
                 dominant_color[1],
                 dominant_color[2],
                 image_info["orientation"],
                 image_info["size"][0],
                 image_info["size"][1]
                ])

dataframe = pd.DataFrame(data, columns=['Red', 'Green', 'Blue', 'Orientation', 'Width', 'Height'])
resultframe_rand = pd.DataFrame(result_rand, columns=['Liked'])
resultframe_orientation = pd.DataFrame(result_orientation, columns=['Liked'])
resultframe_colors = pd.DataFrame(result_colors, columns=['Liked'])

#generating numerical labels
# The orientation encoder is fitted on every orientation value that can occur
# (the three known ones plus anything present in the metadata), not only on
# the training third; otherwise transform() raises ValueError when a later
# image has an orientation the training rows did not contain.
le1 = LabelEncoder()
le1.fit(sorted({"landscape", "portrait", "square"}
               | {image_info["orientation"] for image_info in json_data.values()}))
dataframe['Orientation'] = le1.transform(dataframe['Orientation'])

le2 = LabelEncoder()
resultframe_rand['Liked'] = le2.fit_transform(resultframe_rand['Liked'])

le3 = LabelEncoder()
resultframe_orientation['Liked'] = le3.fit_transform(resultframe_orientation['Liked'])

le4 = LabelEncoder()
resultframe_colors['Liked'] = le4.fit_transform(resultframe_colors['Liked'])

# DTC (decision trees); seeded like the random forests below so that re-running
# the cell yields the same trees. Targets are passed as 1-D arrays throughout.
dtc_rand = tree.DecisionTreeClassifier(random_state=0)
dtc_rand = dtc_rand.fit(dataframe, resultframe_rand.values.ravel())

dtc_orientation = tree.DecisionTreeClassifier(random_state=0)
dtc_orientation = dtc_orientation.fit(dataframe, resultframe_orientation.values.ravel())

dtc_colors = tree.DecisionTreeClassifier(random_state=0)
dtc_colors = dtc_colors.fit(dataframe, resultframe_colors.values.ravel())

# RFC (random forests)
rfc_rand = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0)
rfc_rand = rfc_rand.fit(dataframe, resultframe_rand.values.ravel())

rfc_orientation = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0)
rfc_orientation = rfc_orientation.fit(dataframe, resultframe_orientation.values.ravel())

rfc_colors = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0)
rfc_colors = rfc_colors.fit(dataframe, resultframe_colors.values.ravel())

# Predictions are made for the last third of the images (the held-out set).
def get_prediction(method, method_object, label_encoder, json_data_predicting, image_name=None):
    """Predict the encoded "liked" label of one image with a fitted classifier.

    Args:
        method: Kind of classifier, "dtc" or "rfc". Both are queried the same
            way; the value is only validated.
        method_object: Fitted estimator exposing ``predict``.
        label_encoder: Encoder whose ``transform`` turns the orientation
            string into the numeric value used at training time.
        json_data_predicting: Mapping of image name to its metadata
            ("colors", "orientation" and "size" are read).
        image_name: Key of the image to predict. When omitted, the
            notebook-level variable ``j`` is used, which is how the function
            originally located the image; pass the name explicitly instead.

    Returns:
        Whatever ``method_object.predict`` returns for the single feature row.

    Raises:
        ValueError: If ``method`` is neither "dtc" nor "rfc".
    """
    if method not in ("dtc", "rfc"):
        raise ValueError("unknown method: " + repr(method))
    if image_name is None:
        # Legacy call style: fall back to the notebook's loop variable.
        image_name = j

    image_info = json_data_predicting[image_name]
    dominant_color = image_info["colors"][0]
    feature_row = [
        dominant_color[0],
        dominant_color[1],
        dominant_color[2],
        label_encoder.transform([image_info["orientation"]])[0],
        image_info["size"][0],
        image_info["size"][1]
    ]

    # Estimators fitted on a DataFrame remember its column names; predicting
    # with the same names avoids scikit-learn's feature-name mismatch warning.
    feature_names = getattr(method_object, "feature_names_in_", None)
    if feature_names is not None:
        return method_object.predict(pd.DataFrame([feature_row], columns=feature_names))
    return method_object.predict([feature_row])

# Held-out set: the last third of the metadata entries, in file order.
json_data_predicting = dict(list(json_data.items())[len(json_data)*2//3:])

# Fitted classifiers per method and per "liking" strategy, and the encoder
# that decodes each strategy's predicted label back to its text form.
classifiers_by_method = {
    "dtc": {"rand": dtc_rand, "orientation": dtc_orientation, "colors": dtc_colors},
    "rfc": {"rand": rfc_rand, "orientation": rfc_orientation, "colors": rfc_colors},
}
label_decoders = {"rand": le2, "orientation": le3, "colors": le4}

json_result = {}
for j in json_data_predicting:
    json_result[j] = {
        method: {
            strategy: label_decoders[strategy].inverse_transform(
                get_prediction(method, classifier, le1, json_data_predicting, j)
            )[0]
            for strategy, classifier in strategy_classifiers.items()
        }
        for method, strategy_classifiers in classifiers_by_method.items()
    }
    
def get_array_for_df(image_name, json_data_users, array_res, json_result, type_test):
    """Append one result row for ``image_name`` to ``array_res``.

    The row is ``[image_name, actual, dtc_prediction, rfc_prediction]`` where
    ``actual`` is "Favorite" when the image appears in user 0's
    ``"images_" + type_test`` list and "NotFavorite" otherwise.

    Args:
        image_name: Name of the image the row describes.
        json_data_users: Mapping of user index to that user's liked-image
            lists; only user ``0`` is consulted.
        array_res: List of rows, extended in place.
        json_result: Mapping ``image -> {"dtc": {...}, "rfc": {...}}`` holding
            the predictions, or ``None`` when the image was not predicted (the
            two prediction cells are then "N/A").
        type_test: Strategy name ("rand", "orientation" or "colors").

    Returns:
        ``array_res`` (the same list object) with the new row appended.
    """
    if image_name in json_data_users[0]["images_" + type_test]:
        actual = "Favorite"
    else:
        actual = "NotFavorite"

    if json_result is not None:
        # Look the predictions up by the image passed in, not by a loop
        # variable of the calling cell.
        predicted_dtc = json_result[image_name]["dtc"][type_test]
        predicted_rfc = json_result[image_name]["rfc"][type_test]
    else:
        predicted_dtc = "N/A"
        predicted_rfc = "N/A"

    array_res.append([image_name, actual, predicted_dtc, predicted_rfc])
    return array_res

def get_correct_count(ind, array_res, correct_count_dtc, correct_count_rfc):
    """Update the running counts of correct predictions with one result row.

    Args:
        ind: Index of the row of ``array_res`` to examine.
        array_res: Rows of ``[image_name, actual, dtc_prediction,
            rfc_prediction]``.
        correct_count_dtc: Correct decision-tree predictions counted so far.
        correct_count_rfc: Correct random-forest predictions counted so far.

    Returns:
        The tuple ``(correct_count_dtc, correct_count_rfc)`` after this row.
    """
    _name, actual, predicted_dtc, predicted_rfc = array_res[ind][:4]
    # The two classifiers are scored independently: a correct decision-tree
    # prediction must not prevent the random forest from being credited.
    if actual == predicted_dtc:
        correct_count_dtc += 1
    if actual == predicted_rfc:
        correct_count_rfc += 1
    return (correct_count_dtc, correct_count_rfc)
            
# One result table per "liking" strategy; each row is
# [image name, actual label, DTC prediction, RFC prediction].
array_rand = []
array_orientation = []
array_colors = []
w = 0
correct_rand_dtc = 0
correct_rand_rfc = 0
correct_orientation_dtc = 0
correct_orientation_rfc = 0
correct_colors_dtc = 0
correct_colors_rfc = 0

for k in json_data:
    # Images outside the held-out set have no prediction: pass None so their
    # prediction cells are filled with "N/A".
    predictions = json_result if k in json_result else None
    array_rand = get_array_for_df(k, json_data_users, array_rand, predictions, 'rand')
    array_orientation = get_array_for_df(k, json_data_users, array_orientation, predictions, 'orientation')
    array_colors = get_array_for_df(k, json_data_users, array_colors, predictions, 'colors')

    # `w` is the index of the row that was just appended to each table.
    (correct_rand_dtc, correct_rand_rfc) = get_correct_count(w, array_rand, correct_rand_dtc, correct_rand_rfc)
    (correct_orientation_dtc, correct_orientation_rfc) = get_correct_count(w, array_orientation, correct_orientation_dtc, correct_orientation_rfc)
    (correct_colors_dtc, correct_colors_rfc) = get_correct_count(w, array_colors, correct_colors_dtc, correct_colors_rfc)
    w += 1

# Accuracy is measured over the images that actually received a prediction.
# That count is not always len(json_data)//3 (e.g. 34 of 100 images), and it
# is guarded against zero so a tiny dataset does not raise ZeroDivisionError.
predicted_count = len(json_result)
accuracy_denominator = predicted_count if predicted_count else 1

percentage_correct_rand_dtc = correct_rand_dtc/accuracy_denominator
percentage_correct_rand_rfc = correct_rand_rfc/accuracy_denominator
print("Percentage rand dtc: " + str(percentage_correct_rand_dtc*100) + "%")
print("Percentage rand rfc: " + str(percentage_correct_rand_rfc*100) + "%")
percentage_correct_orientation_dtc = correct_orientation_dtc/accuracy_denominator
percentage_correct_orientation_rfc = correct_orientation_rfc/accuracy_denominator
print("Percentage orientation dtc : " + str(percentage_correct_orientation_dtc*100) + "%")
print("Percentage orientation rfc: " + str(percentage_correct_orientation_rfc*100) + "%")
percentage_correct_colors_dtc = correct_colors_dtc/accuracy_denominator
percentage_correct_colors_rfc = correct_colors_rfc/accuracy_denominator
print("Percentage colors dtc: " + str(percentage_correct_colors_dtc*100) + "%")
print("Percentage colors rfc: " + str(percentage_correct_colors_rfc*100) + "%")

dataframeRes_rand = pd.DataFrame(array_rand, columns=['Image Name', 'User Like Rand', 'Program Rand DTC', 'Program Rand RFC'])
dataframeRes_orientation = pd.DataFrame(array_orientation, columns=['Image Name', 'UserLikeOrient', 'ProgramOrient DTC', 'ProgramOrient RFC'])
dataframeRes_colors = pd.DataFrame(array_colors, columns=['Image Name', 'User Like Colors',  'Program Colors DTC', 'Program Colors RFC'])

# Relative weight of each feature column in the orientation random forest.
print(rfc_orientation.feature_importances_)

# TODO: one image => two pieces of information: the label and a vector holding the image's features
# TODO: label is 0 (does not like) or 1 (likes)
# TODO: use scikit-learn's train_test_split, which splits the dataset automatically
# TODO: use scikit-learn's accuracy_score to compute, more reliably, the accuracy calculated by hand above