In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Read every column with pandas' nullable "string" dtype instead of letting read_csv infer types.
corpus = pd.read_csv("./data/corpus.csv", dtype="string")
# Show the first rows as this cell's output.
corpus.head()

### Programming language distribution

In [None]:
def print_programming_language_info(corpus, language):
    """Summarise which programming languages the comments of one natural language come from.

    Prints the shape of the filtered frame and the number of distinct
    ``ProgrammingLanguageID`` values, then shows a bar chart of the absolute
    counts next to a pie chart of the relative shares.

    Args:
        corpus: DataFrame with at least the ``NaturalLanguageID`` and
            ``ProgrammingLanguageID`` columns.
        language: Value of ``NaturalLanguageID`` selecting the rows to analyse.
    """
    _, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(24, 7))

    subset = corpus[corpus.NaturalLanguageID == language]
    print(f"Dataframe {language} has shape {subset.shape}")

    # Absolute number of comments per programming language (most frequent first).
    per_language = subset["ProgrammingLanguageID"].value_counts()
    print(f"It contains comments for {per_language.size} programming languages.")

    bar_ax = per_language.plot(
        ax=bar_ax,
        kind="bar",
        title=f"{language} # of comments per programming language",
    )
    # Write each bar's count slightly above and to the right of its left edge.
    for bar in bar_ax.patches:
        height = bar.get_height()
        bar_ax.annotate(f"{height:g}", (bar.get_x() + 0.12, height + 5))

    # Relative share of each programming language, labelled with two decimals.
    shares = subset["ProgrammingLanguageID"].value_counts(normalize=True)
    shares.plot.pie(ax=pie_ax, autopct="{:.2f}%".format)

    plt.show()

In [None]:
# Programming-language breakdown for the rows whose NaturalLanguageID is "SR".
print_programming_language_info(corpus, "SR")

In [None]:
# Programming-language breakdown for the rows whose NaturalLanguageID is "EN".
print_programming_language_info(corpus, "EN")

### Length distribution

In [None]:
def print_comment_length(corpus, language):
    """Report and plot the character length of the comments of one natural language.

    Prints the mean and the maximum comment length, then shows a 100-bin
    histogram of the lengths in which every non-empty bin is labelled with its
    count.

    Args:
        corpus: DataFrame with at least the ``NaturalLanguageID`` and
            ``Comment`` columns.
        language: Value of ``NaturalLanguageID`` selecting the rows to analyse.
    """
    _, ax = plt.subplots(figsize=(36, 7))

    # Character count of every comment written in the requested language.
    lengths = corpus.loc[corpus.NaturalLanguageID == language, "Comment"].str.len()
    print(f"{language} Average comment length is {lengths.mean():.2f} characters.")
    print(f"{language} Maximal comment length is {lengths.max():.0f} characters.")

    ax = lengths.plot(
        kind="hist",
        ax=ax,
        bins=100,
        title=f"{language} Distribution of comment length",
    )
    # Label only the bins that actually contain comments.
    for bar in ax.patches:
        height = bar.get_height()
        if height > 0:
            ax.annotate(str(int(height)), (bar.get_x() * 1.003, height * 1.01))

    # Fixed tick grid: one tick every 100 characters from 0 to 3900.
    ax.set_xticks(list(range(0, 4000, 100)))
    plt.show()


In [None]:
# Comment length (in characters) for the rows whose NaturalLanguageID is "SR".
print_comment_length(corpus, "SR")

In [None]:
# Comment length (in characters) for the rows whose NaturalLanguageID is "EN".
print_comment_length(corpus, "EN")

### Number of words distribution

In [None]:
def print_comment_num_words(corpus, language):
    """Report and plot the number of words per comment for one natural language.

    A "word" is a whitespace-separated token (``str.split()`` with no
    arguments). Prints the mean and the maximum word count, then shows a
    100-bin histogram in which every non-empty bin is labelled with its count.

    Args:
        corpus: DataFrame with at least the ``NaturalLanguageID`` and
            ``Comment`` columns.
        language: Value of ``NaturalLanguageID`` selecting the rows to analyse.
    """
    _, ax = plt.subplots(figsize=(36, 7))

    df = corpus[corpus.NaturalLanguageID == language]
    # Compute the word counts once and reuse them. `.str.len()` (rather than
    # `.apply(len)`) leaves a missing comment as a missing count instead of
    # raising a TypeError.
    word_counts = df["Comment"].str.split().str.len()
    print(f"{language} Average comment number of words is {word_counts.mean():.0f} words.")
    print(f"{language} Maximal comment number of words is {word_counts.max():.0f} words.")

    # Word-count distribution. The title names the quantity that is actually
    # plotted; it previously repeated the character-length title.
    ax = word_counts.plot(
        kind="hist",
        ax=ax,
        bins=100,
        title=f"{language} Distribution of number of words per comment",
    )
    # Label only the bins that actually contain comments.
    for p in [p for p in ax.patches if p.get_height() > 0]:
        ax.annotate(str(int(p.get_height())), (p.get_x() * 1.003, p.get_height() * 1.01))
    # Fixed tick grid: one tick every 10 words from 0 to 490.
    ax.set_xticks(list(range(0, 500, 10)))
    plt.show()


In [None]:
# Words per comment for the rows whose NaturalLanguageID is "SR".
print_comment_num_words(corpus, "SR")

In [None]:
# Words per comment for the rows whose NaturalLanguageID is "EN".
print_comment_num_words(corpus, "EN")

### Annotation data

In [None]:
def print_annotation_data(corpus, language):
    """Plot and print the class distribution of ``ClassM`` and ``ClassA``.

    Shows three panels for the rows of one natural language: a grouped bar
    chart comparing the per-class counts of both columns, and one pie chart
    per column. Also prints a table with the per-class percentages.

    Args:
        corpus: DataFrame with at least the ``NaturalLanguageID``, ``ClassM``
            and ``ClassA`` columns.
        language: Value of ``NaturalLanguageID`` selecting the rows to analyse.
    """
    _, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6), constrained_layout=True)

    df = corpus[corpus.NaturalLanguageID == language]
    class_m_counts = df["ClassM"].value_counts()
    class_a_counts = df["ClassA"].value_counts()

    # value_counts() sorts each column by its own frequencies, so drawing the
    # two results on top of each other can pair bars of different classes at
    # the same x position. Aligning both on one shared class index keeps the
    # two bars of a class side by side and gives the legend real column names.
    counts = (
        pd.DataFrame({"ClassM": class_m_counts, "ClassA": class_a_counts})
        .fillna(0)  # a class that is absent from one column simply has count 0
        .astype("int64")
        .sort_values("ClassM", ascending=False)
    )
    counts.plot(
        kind="bar",
        ax=ax1,
        color=["tab:blue", "red"],
        title=f"{language} # of comments per class",
    )
    # Centre each count just above its bar.
    for p in ax1.patches:
        ax1.annotate(
            "{0:g}".format(p.get_height()),
            (p.get_x() + p.get_width() / 2, p.get_height()),
            ha="center",
            va="bottom",
        )

    class_m_counts.plot(kind="pie", ax=ax2, title=f"{language} # of comments per class M", autopct=lambda x: "{:.2f}%".format(x))
    class_a_counts.plot(kind="pie", ax=ax3, title=f"{language} # of comments per class A", autopct=lambda x: "{:.2f}%".format(x))

    # Build the table from both columns at once so that a class occurring in
    # only one of them is still listed (as NaN in the other column), and keep
    # the same row order as the bar chart.
    percentages = pd.DataFrame(
        {
            column: df[column].value_counts(normalize=True).mul(100).round(1).astype(str) + "%"
            for column in ("ClassM", "ClassA")
        }
    ).reindex(counts.index)
    print(f"{language} \n", percentages)
    plt.show()


In [None]:
# ClassM / ClassA distribution for the rows whose NaturalLanguageID is "SR".
print_annotation_data(corpus, "SR")

In [None]:
# ClassM / ClassA distribution for the rows whose NaturalLanguageID is "EN".
print_annotation_data(corpus, "EN")

### Annotation agreement

In [None]:
def print_annotation_agreement(corpus, language):
    """Show a confusion matrix of ``ClassM`` against ``ClassA`` for one natural language.

    Only rows that carry both labels are compared. ``ClassM`` is placed on the
    rows of the matrix and ``ClassA`` on the columns.

    Args:
        corpus: DataFrame with at least the ``NaturalLanguageID``, ``ClassM``
            and ``ClassA`` columns.
        language: Value of ``NaturalLanguageID`` selecting the rows to analyse.
    """
    print(f"Confusion matrix for {language}")
    df = corpus[corpus.NaturalLanguageID == language]
    # A pair can only be compared when both labels exist, so drop rows that
    # miss either of them (previously only a missing ClassA was filtered).
    df = df.dropna(subset=["ClassM", "ClassA"])

    _, ax = plt.subplots(figsize=(7, 7))
    ConfusionMatrixDisplay.from_predictions(y_true=df["ClassM"], y_pred=df["ClassA"], xticks_rotation="vertical", ax=ax)
    # Name the axes after the compared columns; the display's default
    # "True label" / "Predicted label" wording does not describe them.
    ax.set(xlabel="ClassA", ylabel="ClassM")
    # Render explicitly, like the other plotting helpers in this notebook.
    plt.show()

In [None]:
# ClassM vs. ClassA confusion matrix for the rows whose NaturalLanguageID is "SR".
print_annotation_agreement(corpus, "SR")

In [None]:
# ClassM vs. ClassA confusion matrix for the rows whose NaturalLanguageID is "EN".
print_annotation_agreement(corpus, "EN")

### Krippendorff's alpha

In [None]:
import numpy as np
import krippendorff

In [None]:
def kri(data):
    """Return Krippendorff's alpha between the ``ClassA`` and ``ClassM`` columns.

    The two columns are passed to ``krippendorff.alpha`` as a string array
    with one row per column (``ClassA`` first, then ``ClassM``) and one column
    per row of ``data``, using the nominal level of measurement.

    Args:
        data: DataFrame with at least the ``ClassA`` and ``ClassM`` columns.

    Returns:
        The value returned by ``krippendorff.alpha``.
    """
    # Transpose so that each label column becomes one row of the array.
    ratings = data[["ClassA", "ClassM"]].T
    return krippendorff.alpha(
        reliability_data=np.array(ratings, dtype=str),
        level_of_measurement="nominal",
    )


8 classes (labels exactly as they appear in the corpus)

In [None]:
# Keep only the rows that carry a ClassA label. The explicit copy makes `krp`
# an independent frame, so the relabelling cells further down can write into
# it without triggering SettingWithCopyWarning.
krp = corpus[corpus["ClassA"].notna()].copy()

print("all", kri(krp))
print("en", kri(krp[krp.NaturalLanguageID == "EN"]))
print("sr", kri(krp[krp.NaturalLanguageID == "SR"]))


6 classes (every label starting with `functional` merged into a single `functional` class)

In [None]:
def _collapse_functional(category):
    """Map every label that starts with "functional" to plain "functional"; keep the rest."""
    return "functional" if category.startswith("functional") else category


# `assign` builds a new frame instead of writing through `.loc` into `krp`,
# which is a boolean-indexed slice of `corpus`. That avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
krp = krp.assign(
    ClassM=krp["ClassM"].apply(_collapse_functional),
    ClassA=krp["ClassA"].apply(_collapse_functional),
)

In [None]:
# Agreement on all labelled rows, then separately per natural language.
print("all", kri(krp))
for natural_language in ("EN", "SR"):
    print(natural_language.lower(), kri(krp[krp.NaturalLanguageID == natural_language]))

2 classes (`functional` vs. `non-functional`)

In [None]:
def _to_binary_label(category):
    """Return "functional" for labels starting with "functional", else "non-functional"."""
    return "functional" if category.startswith("functional") else "non-functional"


# `assign` builds a new frame instead of writing through `.loc` into `krp`,
# which is a boolean-indexed slice of `corpus`. That avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
krp = krp.assign(
    ClassM=krp["ClassM"].apply(_to_binary_label),
    ClassA=krp["ClassA"].apply(_to_binary_label),
)


In [None]:
# Agreement on all labelled rows, then separately per natural language.
print("all", kri(krp))
for natural_language in ("EN", "SR"):
    print(natural_language.lower(), kri(krp[krp.NaturalLanguageID == natural_language]))