# Fleiss' kappa between three coders

Calculates Fleiss' kappa between three coders (Felipe, Guilherme, and YouTube-8M) using two approaches: most frequent category and highest weight.

In [1]:
from collections import Counter

import pandas as pd

from utils import str2List, sumOfSquares

In [2]:
# Ads table; its 'categories' and 'videoId' columns are used further down.
ads = pd.read_csv(filepath_or_buffer='../processed_data/ads_str_column.csv')
# Lookup table whose 'label' and 'categories' columns feed the label -> category mapping.
cat = pd.read_csv(filepath_or_buffer='../processed_data/mapped_categories.csv')

In [3]:
# Run str2List (from utils) on every value of the raw 'categories' column and
# keep the result in a new 'labels' column.
# Series.apply works element-wise and has no axis parameter, so only the
# function is passed; a second positional argument would be bound to the
# deprecated `convert_dtype` option.
ads['labels'] = ads['categories'].apply(str2List)

In [4]:
# Lookup from each value of cat['label'] to the value of cat['categories'] on
# the same row; if a label repeats, the last row wins.
cat_dict = {
    label: category
    for label, category in zip(cat['label'], cat['categories'])
}

In [5]:
def map_cat(lst):
    """Translate a list of labels into their mapped categories.

    Each element of ``lst`` is looked up in the module-level ``cat_dict``.
    Labels without an entry are skipped, so the result may be shorter than
    the input (or empty). Order and repeated entries are preserved.

    Parameters
    ----------
    lst : iterable
        Labels to translate.

    Returns
    -------
    list
        The ``cat_dict`` values of the labels that have a mapping.
    """
    mapped = []
    for label in lst:
        try:
            mapped.append(cat_dict[label])
        except (KeyError, TypeError):
            # Unknown (or unhashable) label: skipping it is the intended
            # best-effort behaviour. Anything else -- e.g. a NameError because
            # the cell defining cat_dict has not been run -- must surface
            # instead of silently producing empty lists.
            continue
    return mapped

In [6]:
# Replace the raw 'categories' strings with the list of mapped categories
# derived from 'labels'. After this cell 'categories' holds lists, not strings.
# Series.apply works element-wise and has no axis parameter, so only the
# function is passed.
ads['categories'] = ads['labels'].apply(map_cat)

## Most Frequent approach

In [7]:
def most_frequent(lst):
    """Return the element that occurs most often in ``lst``.

    Ties are resolved in favour of the element that appears first in
    ``lst``, so the result does not depend on hash/set ordering and is
    reproducible between kernel runs.

    Parameters
    ----------
    lst : list
        Hashable items (here: the mapped categories of one ad).

    Returns
    -------
    object or None
        The most common element, or ``None`` when ``lst`` is empty (e.g. no
        label of the ad had a category mapping).
    """
    if len(lst) == 0:
        return None
    # Counter counts in a single pass; most_common keeps first-seen order
    # among elements with equal counts.
    return Counter(lst).most_common(1)[0][0]

In [8]:
# Collapse each ad's list of mapped categories into the single most frequent
# one. Series.apply works element-wise and has no axis parameter, so only the
# function is passed.
ads['category_youtube8m'] = ads['categories'].apply(most_frequent)

In [9]:
# Preview the first rows of the ads table, including the derived columns.
ads.head(n=5)

Unnamed: 0,videoId,categories,labels,category_youtube8m
0,-0iPHNTYfOI,"[Food & Drink, Autos & Vehicles, Pets & Animal...","[Food, Vehicle, Animal, Car, KFC, Chocolate, T...",Food & Drink
1,-4P8-sJqukY,"[Hobbies & Leisure, Autos & Vehicles, Hobbies ...","[Hot, air, balloon, Balloon, (aeronautics), Vi...",Arts & Entertainment
2,-5bLu-NETl0,"[Internet & Telecom, Autos & Vehicles, Compute...","[Website, Video, game, Microsoft, Windows, Veh...",Computers & Electronics
3,-7TzvnMnflE,"[Arts & Entertainment, Arts & Entertainment, A...","[Cartoon, Animation, Vehicle, Video, game, Car...",Arts & Entertainment
4,-8M5Ekbt2hA,"[Games, Autos & Vehicles, Arts & Entertainment...","[Video, game, Game, Rayman, Vehicle, Animation...",Arts & Entertainment


In [10]:
def _load_coding(path, category_col):
    """Read one coder's CSV and keep only the ad id and the category answer.

    Parameters
    ----------
    path : str
        Location of the coder's CSV file.
    category_col : str
        Name given to the 'what is the category of the product?' column.

    Returns
    -------
    pandas.DataFrame
        Columns ``ad_id`` and ``category_col``; rows with a missing value in
        either column are dropped.
    """
    answers = pd.read_csv(path)[['ad_id', 'what is the category of the product?']]
    return answers.rename(
        columns={'what is the category of the product?': category_col}
    ).dropna()


# Both coders' files go through the same pipeline; only path and column name differ.
felipe_coding = _load_coding("../data/felipe_coding.csv", 'category_felipe')
guilherme_coding = _load_coding("../data/guilherme_coding.csv", 'category_guilherme')

In [11]:
# Start the comparison table from an independent copy of the two ads columns
# needed here: the video id and the YouTube-8M category.
coding = ads.loc[:, ['videoId', 'category_youtube8m']].copy()

In [12]:
# Attach each human coder's category by matching the coder's 'ad_id' against
# the ad's 'videoId'.
coding = (
    coding
    .join(felipe_coding.set_index('ad_id'), on='videoId')
    .join(guilherme_coding.set_index('ad_id'), on='videoId')
)

In [13]:
# Keep only rows that have a value in every column, then remove rows that are
# exact repeats of an earlier row.
coding = coding.dropna().drop_duplicates()

In [14]:
# Preview the first rows of the three-coder comparison table.
coding.head(n=5)

Unnamed: 0,videoId,category_youtube8m,category_felipe,category_guilherme
98,1YBY1Hyo7ls,Computers & Electronics,Arts & Entertainment,Arts & Entertainment
108,1kMEXhOMznQ,Pets & Animals,Business & Industrial,Business & Industrial
116,1u2nS8jcc5g,Computers & Electronics,Home & Garden,Home & Garden
144,2iTNc8rsFo8,Shopping,Clothing,Clothing
156,34AJ7MJNqFg,Arts & Entertainment,Arts & Entertainment,Arts & Entertainment


In [15]:
def fleiss_kappa(df, rater_cols=('category_youtube8m', 'category_felipe', 'category_guilherme')):
    """Compute Fleiss' kappa for the categorical ratings stored in ``df``.

    Every row of ``df`` is one rated subject and every column named in
    ``rater_cols`` holds the category that one rater assigned to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all columns listed in ``rater_cols``. Other columns
        (e.g. ``videoId``) are ignored; each row counts as its own subject,
        so repeated ids cannot distort the per-row counts.
    rater_cols : sequence of str, optional
        Names of the rating columns. Defaults to the three coders compared
        in this notebook.

    Returns
    -------
    float
        ``(P_hat - P_hat_e) / (1 - P_hat_e)``, where ``P_hat`` is the mean
        per-subject agreement and ``P_hat_e`` the agreement expected by
        chance. NaN when ``df`` has no rows.

    Raises
    ------
    ValueError
        If fewer than two rating columns are given, or if any rating is
        missing (every subject needs the same number of ratings).

    Notes
    -----
    Relies on ``sumOfSquares`` imported from ``utils``; it is assumed to
    return the sum of the squared elements of its argument -- TODO confirm
    against ``utils``.
    """
    rater_cols = list(rater_cols)
    n = len(rater_cols)  # ratings per subject
    if n < 2:
        raise ValueError("Fleiss' kappa needs at least two rating columns")

    ratings = df[rater_cols]
    N = len(ratings)  # number of subjects
    if N == 0:
        return float('nan')
    if ratings.isna().to_numpy().any():
        raise ValueError("ratings contain missing values; drop or fill them first")

    # Subjects x categories table: how many raters chose each category.
    # Categories are taken in order of first appearance, which keeps the
    # column order (and therefore float summation order) reproducible.
    values = ratings.to_numpy()
    categories = pd.unique(values.ravel())
    kappa_table = pd.DataFrame(
        {category: (values == category).sum(axis=1) for category in categories}
    )

    # P_i: per-subject agreement, computed from the row-wise counts.
    P = 1/(n*(n-1)) * (kappa_table.apply(sumOfSquares, axis=1) - n)
    # p_j: share of all N*n ratings that fell into each category.
    p = kappa_table.sum() / (N * n)

    P_hat = (1/N) * P.sum()
    P_hat_e = sumOfSquares(p)
    kappa = (P_hat - P_hat_e)/(1 - P_hat_e)
    return kappa

In [16]:
# Agreement between the three coders using the most-frequent YouTube-8M category.
fleiss_kappa(df=coding)

0.35272317112471807

|    k        |interpretation            |
|   -------   |   -------                |
|    <0       | Poor agreement           |
|  0.01-0.20  | Slight agreement         |
| **0.21-0.40**  | **Fair agreement**           |
|  0.41-0.60  | Moderate agreement       |
|  0.61-0.80  | Substantial agreement    |
|  0.81-1.00  | Almost perfect agreement |

## Highest weight approach