# Datasets and goldensets analysis
You can use it only to analyse classification tasks, such as cat vs dog classification or yes/no checking projects.

Prepare environment and import all we'll need.

In [1]:
%%capture
!pip install toloka-kit==0.1.13
!pip install crowd-kit==0.0.5
!pip install pandas
!pip install numpy
!pip install plotly

import datetime
import sys
import time
import logging

import pandas as pd
import numpy as np
import plotly.express as px

import toloka.client as toloka
import crowdkit

logging.basicConfig(
    format='[%(levelname)s] %(name)s: %(message)s',
    level=logging.INFO,
    stream=sys.stdout,
)

Сreate toloka-client instance. All api calls will go through it. More about OAuth token in our [Learn the basics example](https://github.com/Toloka/toloka-kit/tree/main/examples/0.getting_started/0.learn_the_basics) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Toloka/toloka-kit/blob/main/examples/0.getting_started/0.learn_the_basics/learn_the_basics.ipynb)

In [61]:
toloka_client = toloka.TolokaClient(input("Enter your token:"), 'PRODUCTION')  # Or switch to 'SANDBOX'
print(toloka_client.get_requester())

Requester(_unexpected={}, id='b39ea2ce2474c437ed0ee0d4aeec630b', balance=Decimal('6572.8630'), public_name={'EN': 'Ya.Apollyon', 'FR': 'Ya.Apollyon', 'ID': 'Ya.Apollyon', 'RU': 'Я.Аполлион', 'TR': 'Ya.Apollyon'}, company=Requester.Company(_unexpected={}, id='1', superintendent_id='56caaaeeea84b3b3765420ef45a08262'))


## Load answers
from Toloka

In [65]:
pool_id = '26949990'
assert pool_id != ''

label_field_name = 'result'
assert label_field_name != ''

answers = []
golden_set = {}
input_names = []
for assignment in toloka_client.get_assignments(pool_id=pool_id, status='ACCEPTED'):
    for task, solution in zip(assignment.tasks, assignment.solutions):
        answers.append([task.id, solution.output_values[label_field_name], assignment.user_id, *task.input_values.values()])
        input_names = task.input_values.keys()
        if task.known_solutions:
            golden_set[task.id] = task.known_solutions[0].output_values[label_field_name]

# Prepare dataframe
input_names = [f'INPUT:{val}' for val in input_names]
incoming_df = pd.DataFrame(answers, columns=['task', 'label', 'performer', *input_names])
answers_df = incoming_df[['task', 'label', 'performer']]
golden_ser = pd.Series(data=golden_set)

print(f'answers_df - {type(golden_ser)}')
display(answers_df)

print(f'golden_ser - {type(golden_ser)} - {len(golden_ser)}')
display(golden_ser)

answers_df - <class 'pandas.core.series.Series'>


Unnamed: 0,task,label,performer
0,00019b3966--611abff88d794f3da6a6e7e6,cat,205503686aa269c85b5a226a3ca87329
1,00019b3966--611abff38d794f3da6a6e732,dog,205503686aa269c85b5a226a3ca87329
2,00019b3966--611abff38d794f3da6a6e738,dog,205503686aa269c85b5a226a3ca87329
3,00019b3966--611abff18d794f3da6a6e6ca,dog,205503686aa269c85b5a226a3ca87329
4,00019b3966--611abffb8d794f3da6a6e89a,cat,205503686aa269c85b5a226a3ca87329
...,...,...,...
1210,00019b3966--611abff48d794f3da6a6e748,dog,1fd7baed0b2d446e7f886051d1c905f0
1211,00019b3966--611abff88d794f3da6a6e7fb,cat,1fd7baed0b2d446e7f886051d1c905f0
1212,00019b3966--611abff98d794f3da6a6e82c,dog,1fd7baed0b2d446e7f886051d1c905f0
1213,00019b3966--611abffd8d794f3da6a6e8ad,cat,1fd7baed0b2d446e7f886051d1c905f0


golden_ser - <class 'pandas.core.series.Series'> - 15


00019b3966--611abff18d794f3da6a6e6ca    dog
00019b3966--611abff18d794f3da6a6e6d0    cat
00019b3966--611abff18d794f3da6a6e6d8    dog
00019b3966--611abff18d794f3da6a6e6d2    dog
00019b3966--611abff18d794f3da6a6e6e4    dog
00019b3966--611abff18d794f3da6a6e6da    dog
00019b3966--611abff18d794f3da6a6e6d4    cat
00019b3966--611abff18d794f3da6a6e6d6    cat
00019b3966--611abff18d794f3da6a6e6dc    dog
00019b3966--611abff18d794f3da6a6e6ce    cat
00019b3966--611abff18d794f3da6a6e6e6    dog
00019b3966--611abff18d794f3da6a6e6e2    dog
00019b3966--611abff18d794f3da6a6e6e0    dog
00019b3966--611abff18d794f3da6a6e6cc    dog
00019b3966--611abff18d794f3da6a6e6de    cat
dtype: object

Or from tsv-file

In [None]:
label_field_name = 'label'
assert label_field_name != ''

incoming_df = pd.read_csv('assignments_from_pool.tsv', sep='\t')
incoming_df = incoming_df.rename(columns={
    'ASSIGNMENT:task_id': 'task',
    f'OUTPUT:{label_field_name}': 'label',
    f'GOLDEN:{label_field_name}': 'truth_label',
    'ASSIGNMENT:worker_id': 'performer',
})
answers_df = incoming_df[['task', 'label', 'truth_label', 'performer']]

golden_ser = answers_df[answers_df['truth_label'].notnull()]
golden_ser = golden_ser[['task', 'truth_label']]
golden_ser = golden_ser.set_index('task')
golden_ser = golden_ser.squeeze()
golden_ser = golden_ser[~golden_ser.index.duplicated(keep='first')]

print(f'answers_df - {type(golden_ser)}')
display(answers_df)

print(f'golden_ser - {type(golden_ser)} - {len(golden_ser)}')
display(golden_ser)

---
# Prepare some datasets

In [66]:
from crowdkit.aggregation import GoldMajorityVote

aggregator = GoldMajorityVote()

# standart way to aggregate labels if you have golden-set
result_labels = aggregator.fit_predict(answers_df, golden_ser)

performers_skills = aggregator.skills_
performers_skills.name = 'performer_skill'
probas = aggregator.probas_  # label probability distributions

Now we already could investigate results. First we look at the raw data. Then show this datas on graphs.

Prepare several metrics. First metric for all dataset: golden-set and all other tasks

In [68]:
from crowdkit.metrics.data._classification import consistency, uncertainty

golden_answers = answers_df[answers_df['task'].isin(golden_ser.index)]
general_answers = answers_df[~answers_df['task'].isin(golden_ser.index)]

print(f'uncertainty for golden-set: {uncertainty(golden_answers, performers_skills)}')
print(f'consistency for golden-set: {consistency(golden_answers, performers_skills)}')

print(f'\nuncertainty for other tasks: {uncertainty(general_answers, performers_skills)}')
print(f'consistency for other tasks: {consistency(general_answers, performers_skills)}')


divide by zero encountered in log


divide by zero encountered in log



uncertainty for golden-set: 0.0
consistency for golden-set: 1.0

uncertainty for other tasks: 0.06830749981356121
consistency for other tasks: 0.9378238341968912


In [69]:
from crowdkit.metrics.data._classification import consistency, uncertainty

print('uncertainty')
tasks_uncertainty = uncertainty(answers_df, performers_skills, by_task=True)
display(tasks_uncertainty)

print('consistency')
tasks_consistency = consistency(answers_df, performers_skills, by_task=True)
display(tasks_consistency)

uncertainty



divide by zero encountered in log



task
00019b3966--611abff18d794f3da6a6e6ca   -0.000000
00019b3966--611abff18d794f3da6a6e6cc   -0.000000
00019b3966--611abff18d794f3da6a6e6ce   -0.000000
00019b3966--611abff18d794f3da6a6e6d0   -0.000000
00019b3966--611abff18d794f3da6a6e6d2   -0.000000
                                          ...   
00019b3966--611ac432f4540f4a31ca427f   -0.000000
00019b3966--611ac432f4540f4a31ca4282    1.098612
00019b3966--611ac432f4540f4a31ca4284    1.098612
00019b3966--611ac432f4540f4a31ca4287    1.098612
00019b3966--611ac432f4540f4a31ca428b    1.098612
Length: 208, dtype: float64

consistency


task
00019b3966--611abff18d794f3da6a6e6ca    1.0
00019b3966--611abff18d794f3da6a6e6cc    1.0
00019b3966--611abff18d794f3da6a6e6ce    1.0
00019b3966--611abff18d794f3da6a6e6d0    1.0
00019b3966--611abff18d794f3da6a6e6d2    1.0
                                       ... 
00019b3966--611ac432f4540f4a31ca427f    1.0
00019b3966--611ac432f4540f4a31ca4282    0.0
00019b3966--611ac432f4540f4a31ca4284    0.0
00019b3966--611ac432f4540f4a31ca4287    0.0
00019b3966--611ac432f4540f4a31ca428b    0.0
Length: 208, dtype: float64

---
# Investigate results

Let's split results into two: golden and general tasks.

In [70]:
golden_tasks_uncertainty = tasks_uncertainty[tasks_uncertainty.index.isin(golden_ser.index)]
general_tasks_uncertainty = tasks_uncertainty[~tasks_uncertainty.index.isin(golden_ser.index)]

golden_tasks_consistency = tasks_consistency[tasks_consistency.index.isin(golden_ser.index)]
general_tasks_consistency = tasks_consistency[~tasks_consistency.index.isin(golden_ser.index)]

def show_plots(uncertainty, consistency, name=''):
    fig = px.histogram(uncertainty, labels={'value':'uncertainty', 'count':'task count'}, title=f'{name} tasks uncertainty')
    fig.show()
    fig = px.histogram(consistency, labels={'value':'consistency', 'count':'task count'}, title=f'{name} tasks consistency')
    fig.show()

Show histogram for general tasks

In [71]:
show_plots(general_tasks_uncertainty, general_tasks_consistency, 'General')

Show histogram for golden-set tasks

In [72]:
show_plots(golden_tasks_uncertainty, golden_tasks_consistency, 'Golden')

Prepare to preview suspicious tasks.

In [74]:
def get_suspicious_tasks(tasks_uncertainty, tasks_consistency, uncertainty_more = 1, consistency_less = 0.5):
    suspicious_tasks = set(tasks_uncertainty[tasks_uncertainty > uncertainty_more].index.values.tolist())
    suspicious_tasks = suspicious_tasks | set(tasks_consistency[tasks_consistency < consistency_less].index.values.tolist())
    return suspicious_tasks

suspicious_golden_tasks = get_suspicious_tasks(golden_tasks_uncertainty, golden_tasks_consistency)
print(suspicious_golden_tasks)

{'00019b3966--611ac432f4540f4a31ca4282', '00019b3966--611ac432f4540f4a31ca427c', '00019b3966--611abff28d794f3da6a6e71a', '00019b3966--611abff28d794f3da6a6e706', '00019b3966--611abffa8d794f3da6a6e86b', '00019b3966--611abff58d794f3da6a6e791', '00019b3966--611ac432f4540f4a31ca4284', '00019b3966--611abff28d794f3da6a6e71c', '00019b3966--611abffa8d794f3da6a6e84e', '00019b3966--611ac432f4540f4a31ca4287', '00019b3966--611ac432f4540f4a31ca428b', '00019b3966--611abff38d794f3da6a6e722'}


Lets look at suspicious tasks in web-interface

In [76]:
from IPython.display import Markdown
pool = toloka_client.get_pool(pool_id)
links = '\n   - '.join(
    f'[{task_id}](https://toloka.yandex.com/requester/project/{pool.project_id}/pool/{pool_id}/tasks/edit?taskId={task_id})'
    for task_id in suspicious_golden_tasks
)
display(Markdown(f'# Suspicious tasks\n\n   - {links}'))

# Suspicious tasks

   - [00019b3966--611ac432f4540f4a31ca4282](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611ac432f4540f4a31ca4282)
   - [00019b3966--611ac432f4540f4a31ca427c](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611ac432f4540f4a31ca427c)
   - [00019b3966--611abff28d794f3da6a6e71a](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611abff28d794f3da6a6e71a)
   - [00019b3966--611abff28d794f3da6a6e706](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611abff28d794f3da6a6e706)
   - [00019b3966--611abffa8d794f3da6a6e86b](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611abffa8d794f3da6a6e86b)
   - [00019b3966--611abff58d794f3da6a6e791](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611abff58d794f3da6a6e791)
   - [00019b3966--611ac432f4540f4a31ca4284](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611ac432f4540f4a31ca4284)
   - [00019b3966--611abff28d794f3da6a6e71c](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611abff28d794f3da6a6e71c)
   - [00019b3966--611abffa8d794f3da6a6e84e](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611abffa8d794f3da6a6e84e)
   - [00019b3966--611ac432f4540f4a31ca4287](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611ac432f4540f4a31ca4287)
   - [00019b3966--611ac432f4540f4a31ca428b](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611ac432f4540f4a31ca428b)
   - [00019b3966--611abff38d794f3da6a6e722](https://toloka.yandex.com/requester/project/58004/pool/26949990/tasks/edit?taskId=00019b3966--611abff38d794f3da6a6e722)

Or filter our raw answers dataset and leav only suspicious rows. So you have all inputs and can find this tasks some where else in you incoming data.

In [77]:
column_names = [value for value in incoming_df.columns.values.tolist() if value.startswith('INPUT') or value == 'task']
suspicious_df = incoming_df[column_names].groupby(by=['task']).first().reset_index()
suspicious_df = suspicious_df[suspicious_df['task'].isin(suspicious_golden_tasks)]
display(suspicious_df)

Unnamed: 0,task,INPUT:image
28,00019b3966--611abff28d794f3da6a6e706,https://tlk.s3.yandex.net/dataset/cats_vs_dogs...
37,00019b3966--611abff28d794f3da6a6e71a,https://tlk.s3.yandex.net/dataset/cats_vs_dogs...
38,00019b3966--611abff28d794f3da6a6e71c,https://tlk.s3.yandex.net/dataset/cats_vs_dogs...
41,00019b3966--611abff38d794f3da6a6e722,https://tlk.s3.yandex.net/dataset/cats_vs_dogs...
93,00019b3966--611abff58d794f3da6a6e791,https://tlk.s3.yandex.net/dataset/cats_vs_dogs...
163,00019b3966--611abffa8d794f3da6a6e84e,https://tlk.s3.yandex.net/dataset/cats_vs_dogs...
173,00019b3966--611abffa8d794f3da6a6e86b,https://tlk.s3.yandex.net/dataset/cats_vs_dogs...
202,00019b3966--611ac432f4540f4a31ca427c,https://storage.yandexcloud.net/ztul-datasets/...
204,00019b3966--611ac432f4540f4a31ca4282,https://storage.yandexcloud.net/ztul-datasets/...
205,00019b3966--611ac432f4540f4a31ca4284,https://storage.yandexcloud.net/ztul-datasets/...
