# Image clustering

The goal of this notebook is to group women's dresses into large clusters by style with the help of the crowd.

We are going to use the [FEIDEGGER dataset](https://github.com/zalandoresearch/feidegger). The dataset consists of 8732 high-resolution images, each depicting a dress against a white background.


To get acquainted with Toloka tools for free, you can use the promo code **TOLOKAKIT1** for $20 on your [profile page](https://toloka.yandex.com/requester/profile?utm_source=github&utm_medium=site&utm_campaign=tolokakit) after registration.

Prepare the environment and import everything we'll need.

In [None]:
!pip3 install toloka-kit==0.1.22
!pip3 install crowd-kit==0.0.9
!pip3 install pandas
!pip3 install ipyplot

import collections
import datetime
import getpass
import json
import logging
import os
import random
import sys
import time

from decimal import Decimal
from time import sleep
from typing import Dict, List, Union

import ipyplot
import numpy as np
import pandas as pd

import toloka.client as toloka
import toloka.client.project.template_builder as tb
from crowdkit.aggregation import DawidSkene

# Route INFO-and-above log records to stdout so they appear in the cell output,
# formatted as "[LEVEL] logger-name: message".
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='[%(levelname)s] %(name)s: %(message)s')

Create a toloka-client instance. All API calls will go through it. Read more about the OAuth token in our [Learn the basics example](https://github.com/Toloka/toloka-kit/tree/main/examples/0.getting_started/0.learn_the_basics) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Toloka/toloka-kit/blob/main/examples/0.getting_started/0.learn_the_basics/learn_the_basics.ipynb)

In [None]:
# Read the OAuth token with getpass so it is not echoed into (and saved with) the cell output.
toloka_client = toloka.TolokaClient(getpass.getpass("Enter your token:"), 'PRODUCTION')  # Or switch to 'SANDBOX'
# Log the requester info returned for this token.
logging.info(toloka_client.get_requester())

## Creating new project

Create the project from its JSON config.

In [None]:
def create_project_from_file(
    filepath: str,
    private_comment: str = 'mr-fedulow@ is working on clustering example',
) -> toloka.Project:
    """Create a Toloka project from a JSON config file.

    Args:
        filepath: Path to a JSON file with the project description.
        private_comment: Value stored under the ``private_comment`` key of the
            loaded config before the project is created. The default is the
            comment that used to be hard-coded here.

    Returns:
        The result of ``toloka_client.create_project`` for the loaded config.
    """
    # JSON configs are UTF-8; do not depend on the platform's default encoding.
    with open(filepath, encoding='utf-8') as project_file:
        project_dict = json.load(project_file)
    project_dict['private_comment'] = private_comment
    return toloka_client.create_project(project_dict)

project = create_project_from_file('training_exam_configs/project.json')



Create the training from its JSON config.

In [None]:
def create_training_from_file(filepath: str, project_id: str) -> toloka.Training:
    """Create a training in the given project from a JSON config file.

    Args:
        filepath: Path to a JSON file with the training description.
        project_id: ID written into the training's ``project_id`` before creation.

    Returns:
        The result of ``toloka_client.create_training`` for the loaded config.
    """
    # JSON configs are UTF-8; do not depend on the platform's default encoding.
    with open(filepath, encoding='utf-8') as training_file:
        training_dict = json.load(training_file)
    training = toloka.structure(training_dict, toloka.Training)
    training.project_id = project_id
    return toloka_client.create_training(training)

training = create_training_from_file('training_exam_configs/training.json', project.id)

In [None]:
# NOTE: `training = pool` used to live in this cell. On a fresh kernel it raises
# NameError (`pool` is only created in the exam section further down), and when run
# out of order it replaced the training handle with the exam pool. The `training`
# object created above is used as is.

Upload tasks to training

In [None]:
def create_training_tasks_from_directory(path: str, training_id: str) -> List[toloka.Task]:
    """Build one task per ``*.json`` file located directly inside ``path``.

    Args:
        path: Directory to scan; sub-directories and non-JSON files are skipped.
        training_id: ID assigned to every task's ``pool_id``.

    Returns:
        The list of built tasks, ordered by file name. Nothing is uploaded here.
    """
    tasks = []

    # os.listdir() returns entries in arbitrary order; sort for a reproducible task order.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        if not (os.path.isfile(filepath) and filepath.endswith('.json')):
            continue

        # JSON configs are UTF-8; do not depend on the platform's default encoding.
        with open(filepath, encoding='utf-8') as task_file:
            task_dict = json.load(task_file)

        task = toloka.structure(task_dict, toloka.Task)
        task.pool_id = training_id
        tasks.append(task)

    return tasks

tasks = create_training_tasks_from_directory('training_exam_configs/training_tasks/', training.id)
# Upload the tasks; `open_pool=True` was left disabled by the author.
tasks = toloka_client.create_tasks(tasks) #, open_pool=True)

Create an exam

In [None]:
def create_pool_from_file(filepath: str, project_id: str, training_id: str) -> toloka.Pool:
    """Create the exam pool from a JSON config file.

    Args:
        filepath: Path to a JSON file with the pool description. The config must
            already contain ``quality_control.training_requirement``, because
            only its ``training_pool_id`` is filled in here.
        project_id: ID written into the pool's ``project_id``.
        training_id: ID written into the training requirement of the pool.

    Returns:
        The result of ``toloka_client.create_pool`` for the prepared pool.
    """
    # JSON configs are UTF-8; do not depend on the platform's default encoding.
    with open(filepath, encoding='utf-8') as pool_file:
        pool_dict = json.load(pool_file)
    pool = toloka.structure(pool_dict, toloka.Pool)
    pool.project_id = project_id
    pool.quality_control.training_requirement.training_pool_id = training_id
    # The pool expires a week from now.
    # NOTE(review): this is naive local time; confirm whether the API expects UTC.
    pool.will_expire = datetime.datetime.now() + datetime.timedelta(days=7)
    return toloka_client.create_pool(pool)

pool = create_pool_from_file('training_exam_configs/exam_pool.json', project.id, training.id)

In [1]:
def create_task_suites_from_directory(path: str, pool_id: str) -> List[toloka.TaskSuite]:
    """Build one task suite per ``*.json`` file located directly inside ``path``.

    Args:
        path: Directory to scan; sub-directories and non-JSON files are skipped.
        pool_id: ID assigned to every task suite's ``pool_id``.

    Returns:
        The list of built task suites, ordered by file name. Nothing is uploaded here.
    """
    task_suites = []

    # os.listdir() returns entries in arbitrary order; sort for a reproducible order.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        if not (os.path.isfile(filepath) and filepath.endswith('.json')):
            continue

        # JSON configs are UTF-8; do not depend on the platform's default encoding.
        with open(filepath, encoding='utf-8') as task_suite_file:
            task_suite_dict = json.load(task_suite_file)

        # The files describe task suites, so they must be structured as TaskSuite
        # (not Task) to match the return type and the create_task_suites() call below.
        task_suite = toloka.structure(task_suite_dict, toloka.TaskSuite)
        task_suite.pool_id = pool_id
        task_suites.append(task_suite)

    return task_suites

task_suites = create_task_suites_from_directory('training_exam_configs/exam_tasks/', pool.id)
# Upload the task suites; `open_pool=True` was left disabled by the author.
task_suites = toloka_client.create_task_suites(task_suites) #, open_pool=True)

NameError: name 'List' is not defined

Create a new skill

In [None]:
# Create the skill that the grading cells below read and update per performer.
# NOTE(review): the name says "Shoes" although this notebook clusters dresses — confirm.
skill = toloka_client.create_skill(
    name='Shoes clustering20220105',
)

Run a quick check: wait until at least three exam assignments are submitted, then close the training and the exam pool.

In [None]:
# Poll until at least three exam assignments are waiting for review.
while True:
    assignments_raw = toloka_client.get_assignments_df(pool_id=pool.id, status=['SUBMITTED'])
    if len(assignments_raw) >= 3:
        break
    # Pause between polls instead of hitting the API in a tight loop.
    sleep(60)
toloka_client.close_training(training.id)
toloka_client.close_pool(pool.id)

In [None]:
# Preview the first rows only instead of dumping the whole frame into the output.
assignments_raw.head()

In [None]:
def get_assignments_list(assignments: pd.DataFrame) -> list:
    """Turn each row of the assignments frame into a dict with parsed answers and clusters.

    Columns read from every row: ``OUTPUT:result`` and ``GOLDEN:result`` (JSON lists
    of objects with a ``group`` key), ``INPUT:images`` (comma-separated JSON values,
    wrapped in brackets before parsing), ``ASSIGNMENT:assignment_id`` and
    ``ASSIGNMENT:worker_id``.

    Returns:
        One dict per row with the keys ``images`` (frozenset), ``outputs`` and
        ``golden`` (lists of group labels), ``assignment_id``, ``worker_id``,
        ``clusters`` and ``golden_clusters`` (frozensets of frozensets of images
        that share a label).
    """
    def to_clusters(labels, images):
        # Images are paired with labels by position; images sharing a label form one cluster.
        members_by_label = collections.defaultdict(list)
        for label, image in zip(labels, images):
            members_by_label[label].append(image)
        return frozenset(frozenset(members) for members in members_by_label.values())

    parsed = []
    for _, record in assignments.iterrows():
        worker_answers = json.loads(record['OUTPUT:result'])
        reference_answers = json.loads(record['GOLDEN:result'])
        image_list = json.loads('[' + record['INPUT:images'] + ']')

        worker_labels = [answer['group'] for answer in worker_answers]
        reference_labels = [answer['group'] for answer in reference_answers]

        parsed.append({
            'images': frozenset(image_list),
            'outputs': worker_labels,
            'golden': reference_labels,
            'assignment_id': record['ASSIGNMENT:assignment_id'],
            'worker_id': record['ASSIGNMENT:worker_id'],
            'clusters': to_clusters(worker_labels, image_list),
            'golden_clusters': to_clusters(reference_labels, image_list),
        })
    return parsed


In [None]:
# Target number of performers holding the skill; the grading loop stops once this many have it.
total_users = 300

In [None]:
# Fetch and display every performer record that currently holds the exam skill.
uskill = toloka_client.get_user_skills(
    skill_id=skill.id,
)
[user_skill for user_skill in uskill]

In [None]:
from time import sleep

In [None]:
# Grade submitted exam assignments once a minute until enough performers hold the
# skill or the exam pool is closed. Every graded attempt is appended to exam_log.csv.
users_with_skill = len(list(toloka_client.get_user_skills(skill_id=skill.id)))
n_exams = 3  # each attempt adds score / n_exams to the skill, so three perfect attempts reach 100
if not os.path.exists('exam_log.csv'):
    with open('exam_log.csv', 'w') as logfile:
        logfile.write('user_id,skill_id,skill_value,assignment_id,correct_clusters,ground_truth_clusters,datetime\n')

# `pool` is a local snapshot whose status never changes by itself, so the pool is
# re-read from the API on every pass to learn whether it is still open.
while users_with_skill < total_users and toloka_client.get_pool(pool.id).is_open():
    assignments_raw = toloka_client.get_assignments_df(pool_id=pool.id, status=['SUBMITTED'])[[
        'INPUT:images', 'OUTPUT:result', 'GOLDEN:result', 'ASSIGNMENT:link', 'ASSIGNMENT:assignment_id', 'ASSIGNMENT:worker_id', 'ASSIGNMENT:status', 'ACCEPT:verdict'
    ]]

    assignments_list = get_assignments_list(assignments_raw)

    for assignment in assignments_list:
        # A cluster counts as correct only if it matches a golden cluster exactly.
        correct_clusters = len(assignment['clusters'] & assignment['golden_clusters'])
        ground_truth_clusters = len(assignment['golden_clusters'])
        is_exact_match = assignment['clusters'] == assignment['golden_clusters']

        print(is_exact_match, correct_clusters, ground_truth_clusters, end=', ')

        # Pick exactly one verdict and one score (0..100) per assignment.
        if is_exact_match:
            review, comment, outcome = toloka_client.accept_assignment, 'Excellent', 'accepted'
            skill_value = Decimal(100)
        elif correct_clusters == 0:
            review, comment, outcome = toloka_client.reject_assignment, 'Not correct', 'rejected'
            skill_value = Decimal(0)
        elif correct_clusters != ground_truth_clusters:
            review, comment, outcome = toloka_client.reject_assignment, 'Partly correct', 'rejected, partly'
            # Share of golden clusters reproduced, as a percentage; pure Decimal math
            # avoids the long binary tail that Decimal(float) would carry.
            skill_value = (Decimal(100) * correct_clusters / ground_truth_clusters).quantize(Decimal('0.01'))
        else:
            # All golden clusters found but the sets differ: no verdict, zero score.
            review, comment, outcome = None, None, 'no verdict'
            skill_value = Decimal(0)

        # Read the current skill BEFORE giving a verdict: if the lookup fails, the
        # assignment stays SUBMITTED and is graded again on the next pass instead of
        # the real skill being overwritten by a value computed from zero.
        try:
            user_skill = next(
                iter(toloka_client.get_user_skills(user_id=assignment['worker_id'], skill_id=skill.id)),
                None,
            )
        except Exception as e:
            logging.warning('Could not read the skill of %s, will retry: %s', assignment['worker_id'], e)
            print('skipped')
            continue
        # No record yet (or an empty value) means the performer starts from zero.
        current_value = Decimal(user_skill.exact_value or 0) if user_skill is not None else Decimal(0)

        if review is not None:
            try:
                review(assignment['assignment_id'], comment)
            except Exception as e:
                # Without a verdict the assignment stays SUBMITTED; updating the skill
                # now would count this attempt again on every following pass.
                logging.warning('Could not review %s, will retry: %s', assignment['assignment_id'], e)
                print('skipped')
                continue
        print(outcome)

        # The skill grows by score / n_exams per attempt and is capped at 100.
        new_value = min(current_value + skill_value / n_exams, Decimal(100)).quantize(Decimal('0.01'))
        toloka_client.set_user_skill(
            skill_id=skill.id, user_id=assignment['worker_id'], value=new_value
        )

        with open('exam_log.csv', 'a') as logfile:
            logfile.write(f'{assignment["worker_id"]},{skill.id},{skill_value},{assignment["assignment_id"]},{correct_clusters},{ground_truth_clusters},{datetime.datetime.now()}\n')
    sleep(60)
    users_with_skill = len(list(toloka_client.get_user_skills(skill_id=skill.id)))
toloka_client.close_training(training.id)
toloka_client.close_pool(pool.id)

In [None]:
# One more grading pass over whatever is still SUBMITTED in the exam pool.
# The pool and skill come from the objects created in this notebook; the ids that
# used to be hard-coded here ('30849552', '36042') only exist in the author's account.
assignments_raw = toloka_client.get_assignments_df(pool_id=pool.id, status=['SUBMITTED'])[[
    'INPUT:images', 'OUTPUT:result', 'GOLDEN:result', 'ASSIGNMENT:link', 'ASSIGNMENT:assignment_id', 'ASSIGNMENT:worker_id', 'ASSIGNMENT:status', 'ACCEPT:verdict'
]]
n_exams = 3  # each attempt adds score / n_exams to the skill, so three perfect attempts reach 100

assignments_list = get_assignments_list(assignments_raw)

for assignment in assignments_list:
    # A cluster counts as correct only if it matches a golden cluster exactly.
    correct_clusters = len(assignment['clusters'] & assignment['golden_clusters'])
    ground_truth_clusters = len(assignment['golden_clusters'])
    is_exact_match = assignment['clusters'] == assignment['golden_clusters']

    print(is_exact_match, correct_clusters, ground_truth_clusters, end=', ')

    # Pick exactly one verdict and one score (0..100) per assignment.
    if is_exact_match:
        review, comment, outcome = toloka_client.accept_assignment, 'Excellent', 'accepted'
        skill_value = Decimal(100)
    elif correct_clusters == 0:
        review, comment, outcome = toloka_client.reject_assignment, 'Not correct', 'rejected'
        skill_value = Decimal(0)
    elif correct_clusters != ground_truth_clusters:
        review, comment, outcome = toloka_client.reject_assignment, 'Partly correct', 'rejected, partly'
        # Share of golden clusters reproduced, as a percentage; pure Decimal math
        # avoids the long binary tail that Decimal(float) would carry.
        skill_value = (Decimal(100) * correct_clusters / ground_truth_clusters).quantize(Decimal('0.01'))
    else:
        # All golden clusters found but the sets differ: no verdict, zero score.
        review, comment, outcome = None, None, 'no verdict'
        skill_value = Decimal(0)

    # Read the current skill BEFORE giving a verdict: if the lookup fails, the
    # assignment stays SUBMITTED and can be graded by re-running this cell, instead
    # of the real skill being overwritten by a value computed from zero.
    try:
        user_skill = next(
            iter(toloka_client.get_user_skills(user_id=assignment['worker_id'], skill_id=skill.id)),
            None,
        )
    except Exception as e:
        logging.warning('Could not read the skill of %s, skipping: %s', assignment['worker_id'], e)
        print('skipped')
        continue
    # No record yet (or an empty value) means the performer starts from zero.
    current_value = Decimal(user_skill.exact_value or 0) if user_skill is not None else Decimal(0)

    if review is not None:
        try:
            review(assignment['assignment_id'], comment)
        except Exception as e:
            # Without a verdict the assignment stays SUBMITTED; updating the skill
            # now would count this attempt again on the next run.
            logging.warning('Could not review %s, skipping: %s', assignment['assignment_id'], e)
            print('skipped')
            continue
    print(outcome)

    # The skill grows by score / n_exams per attempt and is capped at 100.
    new_value = min(current_value + skill_value / n_exams, Decimal(100)).quantize(Decimal('0.01'))
    toloka_client.set_user_skill(
        skill_id=skill.id, user_id=assignment['worker_id'], value=new_value
    )

    with open('exam_log.csv', 'a') as logfile:
        logfile.write(f'{assignment["worker_id"]},{skill.id},{skill_value},{assignment["assignment_id"]},{correct_clusters},{ground_truth_clusters},{datetime.datetime.now()}\n')
