In [1]:
import json
import krippendorff_alpha
import numpy as np
import os
import pandas as pd
with open("file_paths.json", "r") as fp:
    filepaths = json.load(fp)
    
def conversational_turns(sheet):
    """
    Function to count conversational turns in a 
    human-coded spreadsheet using the LENA Foundation
    definition, "(back and forth interactions between 
    a child and adult) are computed by counting the 
    number of times utterances spoken by the child or 
    adult were responded to by another person."
    
    Parameters
    ----------
    sheet: DataFrame
    
    Returns
    -------
    turn_count: int
    
    Example
    -------
    >>> import numpy as np
    >>> import pandas as pd
    >>> conversational_turns(
    ...     pd.DataFrame({
    ...         "Child": [
    ...             "spontaneous speech",
    ...             np.nan,
    ...             "no response",
    ...             np.nan,
    ...             "verbal response"
    ...         ],
    ...         "Parent": [
    ...             np.nan,
    ...             "neutral talk",
    ...             np.nan,
    ...             "open-ended question",
    ...             np.nan
    ...         ]
    ...     })
    ... )
    2
    """
    turn_count = 0
    half_turn = {
        "Child": False,
        "Parent": False,
        "Stranger ": False
    }
    stranger = sheet[
        "Stranger "
    ] if "Stranger " in sheet.columns else pd.Series(
        sheet["Child"].apply(
            lambda x: np.nan
        )
    )
    for i in range(
        len(
            sheet["Child"]
        )
    ):
        if sheet.loc[i, "Child"] in vocalizations["child"]:
            if half_turn["Parent"] or half_turn["Stranger "]:
                turn_count += 1
                half_turn["Parent"] = half_turn["Stranger "] = False
            half_turn["Child"] = True
        if sheet.loc[i, "Parent"] in vocalizations["adult"]:
            if half_turn["Child"] or half_turn["Stranger "]:
                turn_count += 1
                half_turn["Child"] = half_turn["Stranger "] = False
            half_turn["Parent"] = True
        if stranger[i] in vocalizations["adult"]:
            if half_turn["Child"] or half_turn["Parent"]:
                turn_count += 1
                half_turn["Child"] = half_turn["Stranger "] = False
            half_turn["Stranger "] = True
        if (
            sheet.loc[i, "Child"] not in vocalizations["child"]
        ) and (
            sheet.loc[i, "Parent"] not in vocalizations["adult"]
        ) and (
            stranger[i] not in vocalizations["adult"]
        ):
            half_turn["Child"] = half_turn["Parent"] = half_turn["Stranger "] = False
    return(turn_count)

def krippendorff_coder_format(
    human_ratings,
    LENA_ratings,
    measure,
    ursis
):
    """
    Function to take data from Multi-index DataFrames 
    and convert those data to the format
    [
        {unit1:value, unit2:value, ...},  # coder 1
        {unit1:value, unit3:value, ...},   # coder 2
        ...                            # more coders
    ]
    
    Parameters
    ----------
    human_ratings: DataFrame
        Multi-index DataFrame (indices = URSI, session,
        file, in that sequence)
    
    LENA_ratings: DataFrame
        Multi-index DataFrame (indices = URSI, session,
        in that sequence)

    measure: string
        column header to collect
        
    ursis: iterable
        iterable of values for first index
        
    Returns
    -------
    formatted: dictionary
        key: string
            ursi
        value: list
            Krippendorff-formatted codings
    """
    return(
        {
            ursi: [
                *[
                    {
                        "A": (
                            human_ratings.loc[
                                (ursi, "A1", f),
                                measure
                            ] + human_ratings.loc[
                                (ursi, "A2", f),
                                measure
                            ] + human_ratings.loc[
                                (ursi, "A3", f),
                                measure
                            ]
                        ),
                        "B": human_ratings.loc[
                            (ursi, "B", f),
                            measure
                        ],
                        "C": human_ratings.loc[
                            (ursi, "C", f),
                            measure
                        ]
                    } for f in list(
                        human_ratings.loc[
                            (ursi, "B"),
                        ].index
                    )
                ],
                *[
                    {
                        "A": LENA_ratings.loc[
                            (ursi, "A"),
                            measure
                        ],
                        "B": LENA_ratings.loc[
                            (ursi, "B"),
                            measure
                        ],
                        "C": LENA_ratings.loc[
                            (ursi, "C"),
                            measure
                        ]
                    }
                ]
            ] for ursi in ursis
        }
    )

def pearson_coder_format(
    human_ratings,
    LENA_ratings,
    measure,
    ursis
):
    """
    Function to take data from Multi-index DataFrames 
    and convert those data to the format
    x : array_like
        A 1-D array containing multiple variables
        and observations. Each row of x represents a
        variable, and each column a single observation
        of all those variables.
    y : array_like, optional
        An additional set of variables and observations.
        y has the same shape as x.

    Parameters
    ----------
    human_ratings: DataFrame
        Multi-index DataFrame (indices = URSI, session,
        file, in that sequence)
    
    LENA_ratings: DataFrame
        Multi-index DataFrame (indices = URSI, session,
        in that sequence)

    measure: string
        column header to collect
        
    ursis: iterable
        iterable of values for first index
        
    Returns
    -------
    formatted: dictionary
        key: string
            ursi
        value: list
            [x, y]
            x: array
            y: array
    """
    return(
        {
            ursi: [
                np.array([
                    np.mean(
                        [
                            (
                                human_ratings.loc[
                                    (ursi, "A1", f),
                                    measure
                                ] + human_ratings.loc[
                                    (ursi, "A2", f),
                                    measure
                                ] + human_ratings.loc[
                                    (ursi, "A3", f),
                                    measure
                                ]
                            ) for f in list(
                                human_ratings.loc[
                                    (ursi, "B"),
                                ].index
                            )
                        ]
                    ),
                    np.mean([
                        human_ratings.loc[
                            (ursi, "B", f),
                            measure
                        ] for f in list(
                            human_ratings.loc[
                                (ursi, "B"),
                            ].index)
                    ]),
                    np.mean([
                        human_ratings.loc[
                            (ursi, "C", f),
                            measure
                        ] for f in list(
                            human_ratings.loc[
                                (ursi, "C"),
                            ].index)
                    ])
                ]),
            np.array([
                    LENA_ratings.loc[
                        (ursi, "A"),
                        measure
                    ],
                    LENA_ratings.loc[
                        (ursi, "B"),
                        measure
                    ],
                    LENA_ratings.loc[
                        (ursi, "C"),
                        measure
                    ]
                ])
            ] for ursi in ursis
        }
    )

sessions = {
    "A1",
    "B",
    "A2",
    "C",
    "A3"
}

vocalizations = {
    "child": {
        "verbal response",
        "noise",
        "spontaneous speech"
    },
    "adult": {
        "open-ended question",
        "forced choice question",
        "yes/no question",
        "command",
        "neutral talk"
    }
}

In [2]:
LENA = pd.read_csv(
    os.path.join(
        filepaths["LENA"],
        "CPP_data.csv"
    )
)
LENA["URSI"] = LENA["URSI"].apply(
    lambda u: u.replace(
        "M04",
        "M004"
    )
)
ursis = set(LENA["URSI"].unique())
humans = {
    fp: {
        "ursi": fp[:9],
        **{sheetname: pd.read_excel(
            os.path.join(
                filepaths["humans"],
                fp
            ),
            sheet_name=sheetname
        ) for sheetname in sessions}
    } for fp in os.listdir(
        filepaths["humans"]
    ) if fp[:9] in ursis
}

In [3]:
human_ratings = pd.DataFrame({
    (ursi, session, f): {
        "Turn_Count": conversational_turns(
            humans[
                f
            ][
                session
            ]
        ),
        "Child_Voc_Count": len(
            [
                v for v in humans[
                    f
                ][
                    session
                ][
                    "Child"
                ] if v in vocalizations["child"]
            ]
        )
    } for ursi in ursis for session in sessions for f in humans if f.startswith(ursi) 
}).T.sort_index()
LENA_ratings = LENA.groupby(["URSI", "Session"]).sum()[["Turn_Count","Child_Voc_Count"]]

In [4]:
voc_counts_k = krippendorff_coder_format(
    human_ratings,
    LENA_ratings,
    "Child_Voc_Count",
    ursis
)
turns_k = krippendorff_coder_format(
    human_ratings,
    LENA_ratings,
    "Turn_Count",
    ursis
)
voc_counts_p = pearson_coder_format(
    human_ratings,
    LENA_ratings,
    "Child_Voc_Count",
    ursis
)
turns_p = pearson_coder_format(
    human_ratings,
    LENA_ratings,
    "Turn_Count",
    ursis
)
pd.DataFrame({
    ursi: {
        ("Child_Voc_Count", "Krippendorff's α"): krippendorff_alpha.krippendorff_alpha(
            data=voc_counts_k[ursi],
            metric=krippendorff_alpha.ratio_metric
        ),
        ("Child_Voc_Count", "Pearson's r"): np.corrcoef(
            *voc_counts_p[ursi]
        )[1,0],
        ("Turn_Count", "Krippendorff's α"): krippendorff_alpha.krippendorff_alpha(
            data=turns_k[ursi],
            metric=krippendorff_alpha.ratio_metric
        ),
        ("Turn_Count", "Pearson's r"): np.corrcoef(
            *turns_p[ursi]
        )[1,0]
    } for ursi in ursis
}).T

  return ((a-b)/(a+b))**2


Unnamed: 0_level_0,Child_Voc_Count,Child_Voc_Count,Turn_Count,Turn_Count
Unnamed: 0_level_1,Krippendorff's α,Pearson's r,Krippendorff's α,Pearson's r
M00412434,0.408218,0.980946,-0.129023,0.99996
M00473061,0.931355,0.963213,0.070464,0.998186
M00490836,0.482149,0.999417,0.301668,0.999632
M00402147,,0.981981,,-0.020615
M00424384,0.808156,0.99995,0.348178,0.999695
M00440011,0.987529,0.998082,0.331432,0.938586
M00470412,0.917191,0.999054,-0.009046,0.998357
M00426908,-0.191879,0.61649,-0.369303,0.706145
M00492101,0.653929,0.998996,0.422952,0.654069
M00409047,0.892232,0.98012,0.050676,0.972384
