In [1]:
import pandas as pd
import pingouin as pg
import csv
import itertools
from tqdm.notebook import tqdm  
from sklearn.metrics import cohen_kappa_score

# Inter-Rater Reliability - FLIP
This notebook calculates the irr for the FLIP dataset. We calculate the irr for following combinations:
- between raters for each grade
- overall grade agreement between two raters (qwk)

Results and input can be found in this spreadsheet:
https://docs.google.com/spreadsheets/d/1w2CCEtUIq0cKmVFrkO-RLoTYKl8HQ1GyTaiUo04aZQo

## 1. Setup paths & options

In [2]:
dataset_path = '/home/simon/Downloads/flip_irr/rawdata.csv'
output_dir = '/home/simon/Downloads/flip_irr/' # don't forget the trailing slash
raters = ['Aleksandra', 'Chloe', 'Eleni', 'Julia', 'Klara', 'Pia', 'Tania', 'Taylor']

## 2. Prepare the dataset

For the intra-class-correlation (icc) metric, we need the data to have following format: each row has to be one rating. 

In [3]:
grades = ['Overall', 'Content', 'Organization', 'Vocab', 'Grammar', 'Mechanics']
data = []
with open(dataset_path) as csvfile:
    csvreader = csv.DictReader(csvfile)
    for line in csvreader:
        if len(line['Graded_By_1st']) < 2:
            continue
        data.append({
            'ID': int(line['ID']),
            'Overall': float(line['Overall_Grade_1st']),
            'Content': float(line['Content_Grade_1st']),
            'Organization': float(line['Organiz_Grade_1st']),
            'Vocab': float(line['Vocab_Grade_1st']),
            'Grammar': float(line['Grammar_Grade_1st']),
            'Mechanics': float(line['Mechanics_Grade_1st']),
            'Rater': line['Graded_By_1st']
        })
        data.append({
            'ID': int(line['ID']),
            'Overall': float(line['Overall_Grade_2nd']),
            'Content': float(line['Content_Grade_2nd']),
            'Organization': float(line['Organiz_Grade_2nd']),
            'Vocab': float(line['Vocab_Grade_2nd']),
            'Grammar': float(line['Grammar_Grade_2nd']),
            'Mechanics': float(line['Mechanics_Grade_2nd']),
            'Rater': line['Graded_By_2nd']
        })
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,ID,Overall,Content,Organization,Vocab,Grammar,Mechanics,Rater
0,1,13.0,2.0,2.0,3.0,3.0,3.0,Eleni
1,1,12.0,2.0,2.0,3.0,2.0,3.0,Tania
2,2,11.0,3.0,2.0,2.0,2.0,2.0,Eleni
3,2,10.0,3.0,2.0,2.0,1.0,2.0,Tania
4,4,14.0,4.0,2.0,3.0,3.0,2.0,Eleni
...,...,...,...,...,...,...,...,...
799,820,15.0,4.0,3.0,3.0,2.0,3.0,Julia
800,822,13.0,4.0,3.0,2.0,2.0,2.0,Eleni
801,822,10.0,3.0,3.0,2.0,1.0,1.0,Pia
802,823,19.0,4.0,4.0,3.0,4.0,4.0,Pia


## 3. Calculate the irr for each pair of raters and each grade

For each pair of raters and each grade variable, the ICC3k and QWK metric is calculated.

**ICC3k**: intra-class-correlation
1. A fixed set of k raters rate each target
2. There is no generalization to a larger population of raters.
3. Mean differences between raters is removed, the metric is sensitive to interactions though.
4. A rater is seen as fixed effect (not random effect)

**QWK**: Cohen's weighted kappa (quadratic weights)

In [4]:
results = []
for rater_tuple in tqdm(list(itertools.combinations(raters,2))):
    for grade in grades:
        # get all grades by our two selected raters (rater_tuple[0] & rater_tuple[1]) for ICC
        filtered_df = df[(df['Rater'] == rater_tuple[0]) | (df['Rater'] == rater_tuple[1])]
        filtered_df = filtered_df[filtered_df.duplicated(['ID'], keep=False)]
        
        # get a list of grades for each rater (QWK)
        filtered_r1 = filtered_df[(filtered_df['Rater'] == rater_tuple[0])][grade].to_numpy()
        filtered_r2 = filtered_df[(filtered_df['Rater'] == rater_tuple[1])][grade].to_numpy()

        icc = pg.intraclass_corr(data=filtered_df, targets='ID', raters='Rater', ratings=grade)
        qwk = cohen_kappa_score(filtered_r1, filtered_r2, weights='quadratic')
        results.append({
            'grader1': rater_tuple[0],
            'grader2': rater_tuple[1],
            'grade': grade,
            'ICC2k': icc.iloc[4]['ICC'],
            'ICC3k': icc.iloc[5]['ICC'],
            'qwk': qwk,
            '# of observations': len(filtered_r1)
        })
resultdf = pd.DataFrame.from_dict(results).sort_values('qwk', ascending=False)
resultdf.to_csv(output_dir + 'irr_between_raters.csv')
resultdf

HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

  fval = msbetween / mserror
  f1k = msb / msw
  f2k = f3k = msb / mse
  l1 = (f1l - 1) / (f1l + (k - 1))
  u1 = (f1u - 1) / (f1u + (k - 1))
  l3 = (f3l - 1) / (f3l + (k - 1))
  u3 = (f3u - 1) / (f3u + (k - 1))
  fj = msj / mse
  fj = msj / mse
  v = vn / vd
  icc1 = (msb - msw) / (msb + (k - 1) * msw)
  icc2 = (msb - mse) / (msb + (k - 1) * mse + k * (msj - mse) / n)
  icc3 = (msb - mse) / (msb + (k - 1) * mse)
  icc1k = (msb - msw) / msb
  icc2k = (msb - mse) / (msb + (msj - mse) / n)
  icc3k = (msb - mse) / msb
  f1k = msb / msw
  f2k = f3k = msb / mse
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)





Unnamed: 0,grader1,grader2,grade,ICC2k,ICC3k,qwk,# of observations
31,Aleksandra,Tania,Content,1.000,1.000,1.000000,3
166,Tania,Taylor,Grammar,1.000,1.000,1.000000,11
38,Aleksandra,Taylor,Organization,1.000,1.000,1.000000,4
19,Aleksandra,Klara,Content,1.000,1.000,1.000000,3
60,Chloe,Pia,Overall,0.934,0.932,0.867444,13
...,...,...,...,...,...,...,...
57,Chloe,Klara,Vocab,-4.012,-2.403,-0.521739,7
51,Chloe,Julia,Vocab,-8.000,-8.000,-0.551724,5
18,Aleksandra,Klara,Overall,11.979,-3.431,-0.571429,3
37,Aleksandra,Taylor,Content,,,,4


## 4. Calculate the irr as overall agreement between rater 1 and rater 2
First we need to transform the dataset so each row has two ratings. Then we calculate the QWK between rater 1 &2.


**QWK**: Cohen's weighted kappa (quadratic weights)

In [5]:
dfmerged = df.merge(df, on='ID')
dfmerged = dfmerged[(dfmerged['Rater_x'] != dfmerged['Rater_y'])]
dfm = dfmerged[dfmerged.duplicated('ID', keep='last')]

resultByGradeData = []
for grade in grades:
    x = dfmerged[[grade + '_x']].to_numpy()
    y = dfmerged[[grade + '_y']].to_numpy()
    qwk = cohen_kappa_score(x, y, weights='quadratic')
    resultByGradeData.append({
        'grade': grade,
        'qwk': qwk
    })
resultByGrade = pd.DataFrame.from_dict(resultByGradeData).sort_values('qwk', ascending=False)
resultByGrade.to_csv(output_dir + 'irr_overall.csv')
resultByGrade



Unnamed: 0,grade,qwk
0,Overall,0.73175
4,Grammar,0.60854
1,Content,0.603278
5,Mechanics,0.594096
2,Organization,0.528406
3,Vocab,0.468013
