# Task 2 solution

### Reading data

In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import os

In [2]:
# Load the tab-separated assessments file and lower-case every column name
df = pd.read_csv('file2.csv', sep='\t').rename(columns=str.lower)
print(df.shape)
df.head()

(250000, 5)


Unnamed: 0,login,uid,docid,jud,cjud
0,assessor158,158,0,0,0
1,assessor238,238,0,0,0
2,assessor488,488,0,0,0
3,assessor136,136,0,0,0
4,assessor300,300,0,0,0


In [3]:
# Show the dtype pandas inferred for each column
inferred_dtypes = df.dtypes
print(inferred_dtypes)

login    object
uid       int64
docid     int64
jud       int64
cjud      int64
dtype: object


In [4]:
# Keep only the first occurrence of every fully identical row
df = df.loc[~df.duplicated()]
print(df.shape)
df.head()

(249999, 5)


Unnamed: 0,login,uid,docid,jud,cjud
0,assessor158,158,0,0,0
1,assessor238,238,0,0,0
2,assessor488,488,0,0,0
3,assessor136,136,0,0,0
4,assessor300,300,0,0,0


In [5]:
# Count the null entries in each column (all zeros means nothing is missing)
df.isnull().sum()

login    0
uid      0
docid    0
jud      0
cjud     0
dtype: int64

### Generating metrics for the final dataset

#### The final dataset will consist of 4 columns:
<ol>
<li>uid - assessor’s id </li>
<li>docid_num - total number of documents assessed by the assessor (stored in the <code>docid</code> column of the result)</li>
<li>correct_num - the total number of assessor’s correct grades</li>
<li>efficiency - the share of the assessor’s grades that are correct, in percent (correct_num * 100 / docid_num)</li>
</ol>  

#### docid_num - total number of documents under assessment

In [6]:
# Number of assessed rows per assessor, kept as a one-column frame indexed by uid
result = df.groupby('uid')[['docid']].count()
print(result.shape)
result.head()

(600, 1)


Unnamed: 0_level_0,docid
uid,Unnamed: 1_level_1
0,401
1,412
2,379
3,426
4,418


#### correct_num - the total number of assessor’s correct grades

In [7]:
# Keep the rows where the assessor's grade (jud) equals the reference grade (cjud)
correct = df.loc[df['jud'].eq(df['cjud'])]
print(correct.shape)
correct.head()

(210321, 5)


Unnamed: 0,login,uid,docid,jud,cjud
0,assessor158,158,0,0,0
1,assessor238,238,0,0,0
2,assessor488,488,0,0,0
3,assessor136,136,0,0,0
4,assessor300,300,0,0,0


In [8]:
# Per-assessor count of correct grades, as a Series named 'correct_num'
correct_num = correct.groupby('uid')['docid'].count().rename('correct_num')
print(correct_num.shape)
correct_num.head()

(600,)


uid
0    336
1    330
2    303
3    196
4    346
Name: correct_num, dtype: int64

In [9]:
# Adding the 'correct_num' indicator to the final dataset.
# A left join keeps every assessor present in `result`. An inner join would
# silently drop assessors with no correct grades, because such a uid never
# appears in `correct_num` (it is built only from rows where jud == cjud).
result = (
    pd.merge(result, correct_num, on='uid', how='left')
    # An assessor missing from `correct_num` has zero correct grades, not NaN
    .fillna({'correct_num': 0})
    # The NaN introduced by the join upcasts the column to float; restore integers
    .astype({'correct_num': 'int64'})
)
print(result.shape)
result.head()

(600, 2)


Unnamed: 0_level_0,docid,correct_num
uid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,401,336
1,412,330
2,379,303
3,426,196
4,418,346


#### efficiency - the efficiency of the assessors’ work in percentage

In [10]:
# Share of correct grades per assessor, expressed in percent
efficiency = result['correct_num'].mul(100).div(result['docid']).rename('efficiency')

# Attach the new column, order assessors from least to most efficient,
# and turn the uid index back into a regular column
result = (
    result
    .merge(efficiency, on='uid', how='inner')
    .sort_values(by='efficiency')
    .reset_index()
)
print(result.shape)
result.head()

(600, 4)


Unnamed: 0,uid,docid,correct_num,efficiency
0,56,411,175,42.579075
1,3,426,196,46.00939
2,118,391,186,47.570332
3,390,412,198,48.058252
4,234,99,48,48.484848
