Data Preprocessing

In [None]:
import pandas as pd

# Load the wide-format stimuli: one row per item, one column per condition.
# NOTE(review): absolute, machine-specific path -- consider a path relative to
# the notebook so the cell runs on other machines.
stimuli = pd.read_csv(r"C:\Users\lisus\Downloads\stimuli.csv", encoding="utf-8")

# Reshape to long format (one row per sentence) for surprisal computing.
# `DataFrame.melt` is a method, so the frame must NOT be passed again as the
# first argument: doing so binds it to `id_vars` and collides with the
# `id_vars=` keyword, raising a TypeError.
condition_columns = [
    "spa_eng_grammatical", "spa_eng_ungrammatical",
    "eng_spa_grammatical", "eng_spa_ungrammatical",
]
long_stimuli = stimuli.melt(
    id_vars=["phenomenon", "pair_id"],
    value_vars=condition_columns,
    var_name="condition",
    value_name="sentence",
)

# Build a label per sentence: <phenomenon>_<pair_id>_<condition>
long_stimuli["label"] = (
    long_stimuli["phenomenon"] + "_" +
    long_stimuli["pair_id"].astype(str) + "_" +
    long_stimuli["condition"]
)

# Keep only the two columns needed by the MINICONS code. The result is not
# called `input`, so the Python built-in of that name is not shadowed here.
minicons_input = long_stimuli[["label", "sentence"]]

# Save to CSV for MINICONS (written to the current working directory)
minicons_input.to_csv("input_file.csv", index=False)

Surprisal

In [None]:
from minicons import scorer

# Load the long-format label/sentence table to be scored.
# NOTE(review): the preprocessing cell writes "input_file.csv" to the working
# directory, while this reads "input.csv" from Downloads -- confirm both refer
# to the same table.
# NOTE: `input` shadows the Python built-in; the name is kept so that any code
# relying on this variable keeps working.
input = pd.read_csv(r"C:\Users\lisus\Downloads\input.csv", encoding="utf-8")

# Loading the minicons model
model = scorer.MaskedLMScorer("xlm-roberta-base", device="cpu")

# Extract the sentences to score. They must come from the table loaded above
# (one row per sentence), not from the wide `stimuli` frame: otherwise the
# wrong column is scored and the number of scores does not match the number
# of rows they are attached to below.
sentences = input["sentence"].tolist()

# Processing data in chunks
batch_size = 50

# Creating a new list to add processed data to
sequence_scores = []

# Score each batch and collect the results in the original row order
for i in range(0, len(sentences), batch_size):
    batch = sentences[i:i+batch_size]
    # Surprisal = negated sum of the per-token scores returned by the scorer
    scores = model.sequence_score(batch, reduction=lambda x: -x.sum(0).item())
    sequence_scores.extend(scores)

# Attach results back to the dataframe (one score per row, same order)
input["surprisal"] = sequence_scores

# Saving the results to a new CSV file
input.to_csv("surprisal_output.csv", index=False)

Accuracy Scores

In [None]:
# Load the reshaped surprisal results used by the accuracy and plotting cells.
# NOTE(review): no cell shown here writes this file (the surprisal cell writes
# "surprisal_output.csv"); presumably it was reshaped outside the notebook --
# confirm how it is produced.
surprisal_results_path = r"C:\Users\lisus\Downloads\reshaped_surprisal_output.csv"
output = pd.read_csv(surprisal_results_path, encoding="utf-8")

def prefers_grammatical(group):
    """Tell whether the grammatical sentence has lower surprisal than the ungrammatical one.

    `group` is the sub-frame for one sentence pair. It must have a
    "grammaticality" column holding the values "grammatical" and
    "ungrammatical", and a "surprisal" column. Only the first row of each
    kind is compared.

    Returns a truthy value when the grammatical surprisal is strictly lower.
    Raises IndexError when the group has no row of one of the two kinds.
    """
    is_grammatical = group["grammaticality"] == "grammatical"
    is_ungrammatical = group["grammaticality"] == "ungrammatical"
    grammatical_surprisal = group.loc[is_grammatical, "surprisal"].to_numpy()[0]
    ungrammatical_surprisal = group.loc[is_ungrammatical, "surprisal"].to_numpy()[0]
    return grammatical_surprisal < ungrammatical_surprisal

# Keys identifying one minimal pair: a grammatical and an ungrammatical
# sentence for the same item in the same code-switch direction. The direction
# is part of the key so that an item number shared by both directions does not
# collapse into a single verdict taken from the first matching rows only. If
# item numbers are already unique per direction, the grouping is unaffected.
pair_keys = ["phenomenon", "sentence_number", "codeswitch_direction"]

# Computing preference per pair: one boolean per pair, indexed by the pair keys.
# Only the columns the helper reads are passed to it.
accuracy_scores = (
    output.groupby(pair_keys)[["grammaticality", "surprisal"]]
    .apply(prefers_grammatical)
    .astype(bool)
    .rename("prefers_gram")
)

# Adding a column for the pair preference to `output`, which is the frame the
# accuracy summaries read "prefers_gram" from. Any existing "prefers_gram"
# column is dropped first so that re-running the cell does not produce
# duplicated, suffixed columns.
output = output.drop(columns="prefers_gram", errors="ignore").merge(
    accuracy_scores.reset_index(), on=pair_keys, how="left"
)

# Computing accuracy per phenomenon (share of rows whose pair prefers the
# grammatical sentence)
results = output.groupby("phenomenon")["prefers_gram"].mean()
print(results)

In [None]:
# Accuracy per phenomenon, broken down by code-switch direction:
# the mean of the "prefers_gram" column within each (phenomenon, direction) group.
direction_keys = ["phenomenon", "codeswitch_direction"]
preference_by_language = output.groupby(direction_keys)["prefers_gram"].mean()

print(preference_by_language)

Plot: Surprisal Distributions by Code-Switch Directions

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

# Boxplot of surprisal: one group per code-switch direction on the x-axis,
# with separate boxes (hue) for each grammaticality value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=output,
    x="codeswitch_direction",
    y="surprisal",
    hue="grammaticality",
    ax=ax,
)
ax.set_title("Surprisal distributions by code-switch direction")
ax.set_ylabel("Surprisal")
ax.set_xlabel("Direction")
plt.show()