# Student Agreement

The goal of this notebook is to measure the level of agreement on the novelty of objects among students.

In [1]:
import pandas as pd

coded_data = pd.read_excel("../data/coded_data.xlsx", sheet_name="Sheet1")
len(coded_data)

96

## Agreement

In [2]:
import numpy as np
from collections import Counter


# Collect and count all concetps participants mentioned for each object,
# and also pick the most commonly mentioned concept for each object.

common_concepts = []
all_concepts = []

for i in range(1, 31):
    counts = Counter(
        [
            s.lower().strip()
            for s in coded_data[f"o{i}c"].to_list()
            if s is not np.nan
            and "," not in s  # exclude response that provide multiple concepts
        ]
    )
    common_concepts.append(counts.most_common(1)[0])
    all_concepts.append(counts)

In [3]:
# Count how many participants think they've seen an object before (yes/no)

count_yes_no = [
    Counter([x for x in coded_data[f"seen_object_{i}"] if x is not np.nan])
    for i in range(1, 31)
]

In [6]:
agreement_df = pd.DataFrame(common_concepts, columns=["concept", "count"])

# Add more useful columns to the dataframe
agreement_df["object_id"] = range(1, 31)
agreement_df["ratio"] = agreement_df["count"] / len(coded_data)
agreement_df["all_concepts_counts"] = [dict(x) for x in all_concepts]
agreement_df["n_unique_concepts"] = [len(x) for x in all_concepts]

agreement_df["n_yes"] = [x["yes"] for x in count_yes_no]
agreement_df["n_no"] = [x["no"] for x in count_yes_no]

agreement_df["pct_yes"] = agreement_df["n_yes"] / len(coded_data)
agreement_df["pct_no"] = agreement_df["n_no"] / len(coded_data)

# Adjust the order of the columns
columns = [
    "object_id",
    "n_yes",
    "pct_yes",
    "n_no",
    "pct_no",
    "concept",
    "count",
    "ratio",
    "n_unique_concepts",
    "all_concepts_counts",
]
agreement_df = agreement_df[columns]

In [7]:
# Display selected columns
pd.options.display.float_format = "{:,.2%}".format
agreement_df[columns[:-1]]

Unnamed: 0,object_id,n_yes,pct_yes,n_no,pct_no,concept,count,ratio,n_unique_concepts
0,1,40,41.67%,39,40.62%,rainbow,16,16.67%,16
1,2,42,43.75%,37,38.54%,dumbbell,14,14.58%,25
2,3,38,39.58%,41,42.71%,hexagon,7,7.29%,35
3,4,28,29.17%,51,53.12%,block,6,6.25%,22
4,5,45,46.88%,34,35.42%,toy,10,10.42%,33
5,6,15,15.62%,64,66.67%,donut,7,7.29%,24
6,7,40,41.67%,39,40.62%,tree,12,12.50%,23
7,8,44,45.83%,35,36.46%,stop sign,5,5.21%,33
8,9,61,63.54%,18,18.75%,house,63,65.62%,8
9,10,43,44.79%,36,37.50%,fire hydrant,9,9.38%,39


# Analysis

In [7]:
# Subjective judgement
# Compute the mean, standard deviation and the range of column `pct_no`

agreement_df["pct_no"].describe()

count   3,000.00%
mean       44.34%
std        12.77%
min        18.75%
25%        36.72%
50%        43.75%
75%        52.08%
max        68.75%
Name: pct_no, dtype: float64

In [8]:
agreement_df["ratio"].describe()

count   3,000.00%
mean       15.14%
std        13.14%
min         2.08%
25%         6.25%
50%        10.42%
75%        22.66%
max        65.62%
Name: ratio, dtype: float64

In [9]:
from scipy.stats import spearmanr
import math


def correlation_analysis(x, y):
    """
    Spearman's rank correlation between x an y, get the rs value and p-value (two-sided)
    as well as the 95% confidence interval for the rs value
    """
    r, pvalue = spearmanr(x, y)
    print(f"r = {r * 100:.2f}%, pvalue = {pvalue:.6f}")

    stderr = 1.0 / math.sqrt(len(coded_data) - 3)
    delta = 1.96 * stderr
    lower = math.tanh(math.atanh(r) - delta)
    upper = math.tanh(math.atanh(r) + delta)

    print(f"lower {lower * 100:.2f}%, upper {upper * 100:.2f}%")

In [10]:
correlation_analysis(agreement_df["ratio"], agreement_df["pct_no"])

r = -56.87%, pvalue = 0.001043
lower -69.04%, upper -41.55%


In [11]:
# Overall novelty
# Among all valid students responses, how many reponses contains single concept
cnt_single_concepts = []
cnt_valid_responses = []

for i in range(1, 31):
    n_single_concepts = 0
    n_valid_responses = 0

    for s in coded_data[f"o{i}c"].to_list():
        if s is not np.nan:
            n_valid_responses += 1
            if "," not in s:
                n_single_concepts += 1

    cnt_single_concepts.append(n_single_concepts)
    cnt_valid_responses.append(n_valid_responses)

print("Total number of valid responses:", sum(cnt_valid_responses))
print("Total number of single concept responses:", sum(cnt_single_concepts))

percentage = sum(cnt_single_concepts) / sum(cnt_valid_responses) * 100
print(f"Percentage of single concept responses: {percentage:.2f}%")

Total number of valid responses: 2711
Total number of single concept responses: 1529
Percentage of single concept responses: 56.40%


In [12]:
single_ratios = [x / y for x, y in zip(cnt_single_concepts, cnt_valid_responses)]

In [13]:
correlation_analysis(single_ratios, agreement_df["pct_no"])

r = -73.22%, pvalue = 0.000004
lower -81.33%, upper -62.31%


In [14]:
# Append single concepts count, rate, valid responses count to the dataframe

agreement_df["n_single_concepts"] = cnt_single_concepts
agreement_df["single_ratio"] = single_ratios
agreement_df["n_valid_responses"] = cnt_valid_responses

In [17]:
agreement_df.to_csv("../report/agreement.csv", index=False)