# **Installations and imports**

In [None]:
'''
Installations
'''
!pip install pylangacq
!pip install googletrans==3.1.0a0
!pip install fasttext

Collecting pylangacq
  Downloading pylangacq-0.19.1-py3-none-any.whl.metadata (5.9 kB)
Downloading pylangacq-0.19.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pylangacq
Successfully installed pylangacq-0.19.1
Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2024.11.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecti

In [None]:
'''
Imports
'''
import pylangacq
import numpy as np
import pandas as pd
import os
import csv
from googletrans import Translator
from collections import defaultdict
import copy
import fasttext
from huggingface_hub import hf_hub_download
import scipy.stats as stats
from statsmodels.stats.proportion import proportions_ztest
import matplotlib.pyplot as plt

# **Data pre-processing**

In [None]:
'''
Data pre-processing functions.
Here, stopwords is an adjustable hyperparameter that can be tweaked.
'''

# Module-level googletrans client. The translation helpers further down create
# their own Translator instances instead of reusing this one.
translator = Translator()
# Lower-cased tokens that get_conversation drops from every utterance
# (a mix of French and Dutch function words; note that 'en' is listed twice).
stopwords = ['le', 'de', 'un', 'à', 'et', 'la', 'en', 'du', 'des', 'que', 'en', 'van', 'een', 'het', 'in', 'dat', 'op', 'te']

def get_conversation(filepath):
    """
    Read one CHAT transcript and return its utterances as cleaned sentences.

    Tokens are taken from ``pylangacq.read_chat(filepath).words()``, lower-cased
    and processed in order:

    - tokens listed in the module-level ``stopwords`` are dropped,
    - tokens containing '/', ',' or '...' are dropped,
    - '.', '?' and '!' close the current sentence,
    - every other token is appended to the current sentence, separated by a
      single space unless the sentence currently ends with an apostrophe.

    Tokens after the last sentence terminator are discarded because the
    sentence they belong to is never closed.

    Parameters:
    - filepath (str): path of the CHAT file to read.

    Returns:
    - list of str: the sentences that contain at least two words (i.e. at
      least one space); empty and single-word sentences are filtered out.
    """
    sentence_enders = ('.', '?', '!')
    skipped_markers = ('/', ',', '...')

    conversation = []
    sentence = ''
    for token in pylangacq.read_chat(filepath).words():
        token = token.lower()
        if token in stopwords:
            continue
        # The marker test must run before the terminator test so that '...'
        # is skipped instead of being mistaken for a sentence end.
        if any(marker in token for marker in skipped_markers):
            continue
        if token in sentence_enders:
            conversation.append(sentence)
            sentence = ''
        elif sentence == '' or sentence[-1] == '\'':
            sentence += token
        else:
            sentence += ' ' + token

    # Build a new list instead of calling remove() while iterating: removing
    # during iteration skips the element that follows every removed one, so
    # consecutive unwanted sentences used to survive the clean-up.
    return [utterance for utterance in conversation if ' ' in utterance]

def join_conversations(conversation_list):
    '''
    Load every transcript in conversation_list and return all of their
    sentences as one flat list.

    conversation_list must be a list containing strings that are the pathnames
    to all the files you want to compress the text data from. Each path is
    printed and then passed to get_conversation; the sentences are kept in
    file order. Sentences that consist of a single space are dropped.
    '''
    data = []
    for path in conversation_list:
        print('loading file ', (path))
        data.extend(get_conversation(path))
    # Filter into a new list; removing items from a list while iterating over
    # it skips the element after each removal.
    return [sentence for sentence in data if sentence != ' ']

def create_file(directory_path):
    """
    Collect the sentences of every file in a directory into one CSV file.

    Every regular file directly inside ``directory_path`` is passed to
    join_conversations; the resulting sentences are written, one per row, to
    ``<directory_path>_data.csv``.

    Parameters:
    - directory_path (str): directory holding the transcript files
      (sub-directories are ignored).
    """
    # os.listdir returns entries in arbitrary order; sort them so the output
    # file is reproducible between runs and machines.
    conversation_list = [
        os.path.join(directory_path, filename)
        for filename in sorted(os.listdir(directory_path))
        if os.path.isfile(os.path.join(directory_path, filename))
    ]
    output = join_conversations(conversation_list)
    # Write UTF-8 explicitly: the file is read back with pandas.read_csv, which
    # decodes as UTF-8 by default, while open() would otherwise use the locale
    # encoding and could garble accented characters.
    with open(directory_path + '_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows([item] for item in output)

def translate_dutch_sentences(sentences):
    """
    Translate Dutch sentences into English one word at a time.

    Each sentence is split on whitespace and every word is sent to the
    translator separately (src='nl', dest='en'); the lower-cased results are
    collected per sentence. If anything fails while handling a sentence, the
    error is printed and that sentence yields an empty list.

    Returns a list with one list of English words per input sentence.
    """
    dutch_to_english = Translator()
    results = []

    for sentence in sentences:
        try:
            english_tokens = []
            for token in sentence.split():
                reply = dutch_to_english.translate(token, src='nl', dest='en')
                english_tokens.append(reply.text.lower())
        except Exception as e:
            print(f"Error translating sentence '{sentence}': {e}")
            # Discard any partial result so a failed sentence maps to [].
            english_tokens = []
        results.append(english_tokens)

    return results

def translate_french_sentences(sentences):
    """
    Translate French sentences into English one word at a time.

    Each sentence is split on whitespace and every word is sent to the
    translator separately (src='fr', dest='en'); the lower-cased results are
    collected per sentence. If anything fails while handling a sentence, the
    error is printed and that sentence yields an empty list.

    Returns a list with one list of English words per input sentence.
    """
    french_to_english = Translator()
    results = []

    for sentence in sentences:
        try:
            english_tokens = []
            for token in sentence.split():
                reply = french_to_english.translate(token, src='fr', dest='en')
                english_tokens.append(reply.text.lower())
        except Exception as e:
            print(f"Error translating sentence '{sentence}': {e}")
            # Discard any partial result so a failed sentence maps to [].
            english_tokens = []
        results.append(english_tokens)

    return results

def create_symbolic_data_dutch(file):
    """
    Build a DataFrame pairing Dutch sentences with word-by-word English glosses.

    The CSV is read without a header into a single "Original" column. Every
    string cell is translated through translate_dutch_sentences and the
    resulting words are joined with spaces; non-string cells get an empty
    gloss. Progress is printed for every row.

    Returns the DataFrame with an added "Translated" column.
    """
    df = pd.read_csv(file, header=None, names=["Original"])
    total = len(df)
    print(f"Total sentences: {total}")

    def gloss(sentence):
        # Non-string cells (e.g. NaN or numbers) cannot be translated.
        if not isinstance(sentence, str):
            return ''
        translated = translate_dutch_sentences([sentence])
        return ' '.join(translated[0]) if translated else ''

    english = []
    for row_number, sentence in enumerate(df["Original"], start=1):
        print(f"Processing row {row_number} of {total}")
        english.append(gloss(sentence))

    df["Translated"] = english
    return df

def create_symbolic_data_french(file):
    """
    Build a DataFrame pairing French sentences with word-by-word English glosses.

    The CSV is read without a header into a single "Original" column. Every
    string cell is translated through translate_french_sentences and the
    resulting words are joined with spaces; non-string cells get an empty
    gloss. Progress is printed for every row.

    Returns the DataFrame with an added "Translated" column.
    """
    df = pd.read_csv(file, header=None, names=["Original"])
    total = len(df)
    print(f"Total sentences: {total}")

    def gloss(sentence):
        # Non-string cells (e.g. NaN or numbers) cannot be translated.
        if not isinstance(sentence, str):
            return ''
        translated = translate_french_sentences([sentence])
        return ' '.join(translated[0]) if translated else ''

    english = []
    for row_number, sentence in enumerate(df["Original"], start=1):
        print(f"Processing row {row_number} of {total}")
        english.append(gloss(sentence))

    df["Translated"] = english
    return df

def merge_df(french_csv, dutch_csv):
    """
    Concatenate the French and Dutch CSV files, shuffle the rows and save them.

    Both files are read with pandas, stacked with a fresh index, shuffled with
    a fixed seed (random_state=42) so the order is reproducible, and written
    to 'input_data.csv' in the current working directory without the index.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (french_csv, dutch_csv)]
    combined = pd.concat(frames, ignore_index=True)
    shuffled = combined.sample(frac=1, random_state=42)
    shuffled.reset_index(drop=True).to_csv('input_data.csv', index=False)

In [None]:
'''
Creating the dataframe for the dutch and the french sentences and then merging them randomly.
'''
# Dutch pipeline: create_file('dutch') writes 'dutch_data.csv', which is then
# translated word by word and saved as 'processed_dutch.csv'.
dutch_csv = "dutch_data.csv"
create_file('dutch')  # Converts the Dutch CHILDES data to a CSV file
dutch_df = create_symbolic_data_dutch(dutch_csv)
dutch_df.to_csv("processed_dutch.csv", index=False)

# French pipeline: same steps, reading the 'french' directory and producing
# 'french_data.csv' and 'processed_french.csv'.
french_csv = "french_data.csv"
create_file('french')  # Converts the French CHILDES data to a CSV file
french_df = create_symbolic_data_french(french_csv)
french_df.to_csv("processed_french.csv", index=False)

# Combine both processed files into the shuffled 'input_data.csv'.
merge_df("processed_french.csv", "processed_dutch.csv")

In [None]:
'''
convert input dataframe into "situations" which can be fed into the algorithm
assume input is a pandas dataframe from a csv with a column of the sentences and the second column as the meaning expression
'''
def load_dataframe(path):
    """
    Load a processed CSV and turn it into learning "situations".

    The CSV must have the columns 'Original' (the sentences) and 'Translated'
    (the meaning expressions, i.e. the English word-by-word translations).
    Each row becomes one situation: the list of every (word, referent) pair
    obtained by combining each whitespace-separated word of the sentence with
    each whitespace-separated token of its meaning expression.

    Parameters:
    - path (str): path of the CSV file to load.

    Returns a 6-tuple:
    - words (list of str): distinct words, in order of first appearance.
    - referents (list of str): distinct referents, in order of first appearance.
    - situations (list of list of tuples): one list of pairs per row.
    - original (list): the raw 'Original' column (used for language detection).
    - sentences (list): the raw 'Original' column.
    - meanings (list): the raw 'Translated' column.
    """
    df = pd.read_csv(path)

    #our csv data holds the column names 'Original' for the sentences and 'Translated' for the meaning expressions
    sentences = df['Original'].to_numpy().tolist()
    original = df['Original'].to_numpy().tolist() #for fasttext language detection
    meanings = df['Translated'].to_numpy().tolist() #we take the english translations of the words as meaning expressions

    # Dicts are used as ordered sets so the vocabulary order is deterministic
    # (set() ordering of strings changes between interpreter runs).
    words = {}
    referents = {}

    situations = [] #algorithm takes tuples of word-meaning pairings from each sentence
    # Distinct loop names: reusing `sentences`/`meanings` here would overwrite
    # the full lists, and the function would return only the last row's tokens.
    for sentence, meaning in zip(sentences, meanings):
        sentence_words = str(sentence).split()
        meaning_tokens = str(meaning).split()
        situations.append(
            [(word, referent) for word in sentence_words for referent in meaning_tokens]
        )
        words.update(dict.fromkeys(sentence_words))
        # Referents only count when they are paired with at least one word.
        if sentence_words:
            referents.update(dict.fromkeys(meaning_tokens))

    return list(words), list(referents), situations, original, sentences, meanings

# Load the dataset into module-level variables; presumably this is the file
# written by merge_df — adjust the path to the CSV you want to use.
words, referents, situations, original, sentences, meanings = load_dataframe("/content/input_data.csv") #input path for desired dataframe to be loaded(CSV format)

# **Model architecture**

In [None]:
'''
probabilistic cross-situational word learning algorithm based on fazly et al. (2010)
simplified version in terms of probability updates
'''

import numpy as np
from collections import defaultdict

class CrossSituationalLearner:
    """
    Simplified probabilistic cross-situational word learner.

    The learner counts how often each word co-occurs with each referent and
    normalises the counts per word into a probability distribution over
    referents. It also offers several evaluation helpers that rely on
    googletrans (translation) and a fastText language-identification model.
    """

    def __init__(self, words, referents, alpha=1.0, original=None):
        """
        Initialize the learner with a set of words and referents.

        Parameters:
        - words (list of str): List of possible words in the learning context.
        - referents (list of str): List of possible referents (meanings) in the learning context.
        - alpha (float): Smoothing factor; the starting count of every
          word-referent pair the first time it is observed.
        - original (list, optional): the raw corpus sentences used for language
          detection. When omitted, the methods that need the corpus fall back
          to a module-level variable named ``original``.
        """
        self.words = words
        self.referents = referents
        self.alpha = alpha
        self.original = original
        # fastText model, loaded on first use and then reused.
        self._language_model = None
        self.co_occurrence_counts = defaultdict(lambda: defaultdict(lambda: alpha))
        self.word_referent_probs = defaultdict(lambda: defaultdict(lambda: 1 / len(referents)))

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------

    def _corpus_sentences(self):
        """Return the corpus given to the constructor, else the module-level ``original``."""
        corpus = getattr(self, "original", None)
        if corpus is None:
            # Legacy behaviour: earlier versions read the notebook global directly.
            corpus = globals().get("original")
        if corpus is None:
            raise ValueError(
                "no corpus available: pass original=... to CrossSituationalLearner "
                "or define a module-level variable named 'original'"
            )
        return corpus

    def _words_for_language(self, language):
        """Return the set of corpus words detected as French ("fr") or Dutch ("nl")."""
        if language not in ("fr", "nl"):
            raise ValueError(f"language must be 'fr' or 'nl', got {language!r}")
        frenchwords, dutchwords = self.get_french_and_dutch_words(self._corpus_sentences())
        return frenchwords if language == "fr" else dutchwords

    def _load_language_model(self):
        """Download and load the fastText language-identification model once per instance."""
        model = getattr(self, "_language_model", None)
        if model is None:
            model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
            model = fasttext.load_model(model_path)
            self._language_model = model
        return model

    @staticmethod
    def _best_referent(distribution):
        """Return (referent, probability) of the most probable referent, or (None, None) if empty."""
        if not distribution:
            return None, None
        referent = max(distribution, key=distribution.get)
        return referent, distribution[referent]

    def _summarise_confidence(self, distributions, min_con_prob):
        """Compute the comprehension scores over an iterable of referent distributions."""
        highest_problist = []
        for distribution in distributions:
            _, probability = self._best_referent(distribution)
            if probability is not None:
                highest_problist.append(probability)
        if not highest_problist:
            # Nothing learned yet: report zero instead of dividing by zero.
            return 0.0, 0.0, highest_problist
        converged = sum(1 for probability in highest_problist if probability >= min_con_prob)
        Average_highest_probs = sum(highest_problist) / len(highest_problist)
        Percentage_of_converged_words = converged / len(highest_problist)
        return Average_highest_probs, Percentage_of_converged_words, highest_problist

    def _monolingual_convergence(self, iterate_mono_words, language):
        """Share of ``iterate_mono_words`` whose top referent equals their English translation."""
        total = len(iterate_mono_words)
        if total == 0:
            return 0.0

        translator = Translator()
        translator.raise_Exception = True
        convergence_counter = 0
        processed = 0

        for word in iterate_mono_words:
            # .get() avoids inserting empty entries into the defaultdict.
            most_likely_referent, _ = self._best_referent(self.word_referent_probs.get(word))
            if most_likely_referent is None:
                continue
            meaning = translator.translate(word, src=language, dest='en').text.lower()
            if meaning == most_likely_referent:
                convergence_counter += 1
            processed += 1
            print(processed, "/", total, "processed ", convergence_counter, "correct")

        return convergence_counter / total

    # ------------------------------------------------------------------
    # learning
    # ------------------------------------------------------------------

    def update_counts(self, word_referent_pairs):
        """
        Update co-occurrence counts based on observed word-referent pairs in a single situation.

        Parameters:
        - word_referent_pairs (list of tuples): List of (word, referent) pairs observed in a learning situation.
        """
        for word, referent in word_referent_pairs:
            self.co_occurrence_counts[word][referent] += 1

    def update_probabilities(self, words=None):
        """
        Update probabilities of word-referent pairs based on accumulated co-occurrence counts.

        Parameters:
        - words (iterable of str, optional): restrict the update to these words.
          By default every word with counts is renormalised. Only words whose
          counts changed need updating, so passing the words of the latest
          situation gives the same result at a fraction of the cost.
        """
        targets = self.co_occurrence_counts if words is None else words
        for word in targets:
            counts = self.co_occurrence_counts.get(word)
            if not counts:
                continue
            total_count = sum(counts.values())
            probabilities = self.word_referent_probs[word]
            for referent, count in counts.items():
                probabilities[referent] = count / total_count

    def learn_from_situations(self, situations):
        """
        Process multiple learning situations and update word-referent mappings.

        An intermediate evaluation is recorded whenever the 1-based index of the
        situation about to be processed is a multiple of 1000 and smaller than
        ``len(situations)``.

        Parameters:
        - situations (list of list of tuples): Each situation is a list of (word, referent) pairs observed together.

        Returns:
        - (list, list): translational convergence and average confidence
          (threshold 0.1) at every evaluation point.
        """
        word_learn_acc_over_time = []
        confidence_over_time = []
        checkpoints = range(0, len(situations), 1000)
        language_split = None  # computed at the first checkpoint, then reused

        i = 0
        for situation in situations:
            print(i)
            i += 1
            if i in checkpoints:
                if language_split is None:
                    # The corpus does not change while learning, so the
                    # (expensive) language split is computed only once.
                    language_split = self.get_french_and_dutch_words(self._corpus_sentences())
                frenchwords, dutchwords = language_split
                word_learn_acc_over_time.append(self.translational_convergence(frenchwords, dutchwords))
                average_confidence, _, _ = self.get_comprehension_score(0.1)
                confidence_over_time.append(average_confidence)
            self.update_counts(situation)
            # dict.fromkeys de-duplicates while keeping first-appearance order.
            self.update_probabilities(dict.fromkeys(word for word, _ in situation))

        return word_learn_acc_over_time, confidence_over_time

    def learn_from_situations_monolingual(self, situations, language):
        """
        Process multiple learning situations of a monolingual dataset.

        Works like learn_from_situations but evaluates only the words detected
        as belonging to ``language``.

        Parameters:
        - situations (list of list of tuples): Each situation is a list of (word, referent) pairs observed together.
        - language (str): "fr" or "nl".

        Returns:
        - (list, list): translational convergence and average confidence
          (threshold 0.1) at every evaluation point.
        """
        word_learn_acc_over_time = []
        confidence_over_time = []
        checkpoints = range(0, len(situations), 1000)
        iterate_mono_words = None  # computed at the first checkpoint, then reused

        i = 0
        for situation in situations:
            print(i)
            i += 1
            if i in checkpoints:
                if iterate_mono_words is None:
                    iterate_mono_words = self._words_for_language(language)
                word_learn_acc_over_time.append(
                    self._monolingual_convergence(iterate_mono_words, language)
                )
                average_confidence, _, _ = self._summarise_confidence(
                    (self.word_referent_probs.get(word) for word in iterate_mono_words), 0.1
                )
                confidence_over_time.append(average_confidence)
            self.update_counts(situation)
            self.update_probabilities(dict.fromkeys(word for word, _ in situation))

        return word_learn_acc_over_time, confidence_over_time

    # ------------------------------------------------------------------
    # inspection
    # ------------------------------------------------------------------

    def get_most_likely_mapping(self):
        """
        Retrieve the most probable referent for each word based on learned probabilities.

        Returns:
        - dict: keys are words, values are strings of the form
          "<referent> <probability>". Words without any learned referent are skipped.
        """
        most_likely_mapping = {}
        for word, distribution in self.word_referent_probs.items():
            referent, referent_prob = self._best_referent(distribution)
            if referent is None:
                continue
            most_likely_mapping[word] = referent + " " + str(referent_prob)
        return most_likely_mapping

    def return_learnt_dic(self):
        '''
        returns a deep copy of the dictionary of words paired to their meaning distributions
        mainly intended for inspection purposes
        '''
        return copy.deepcopy(self.word_referent_probs)

    def get_comprehension_score(self, min_con_prob):
        """
        Measure the learning state over all words seen so far.

        arguments:
        min_con_prob = defines the minimum probability for a word to be considered as learnt
        (the original notes mention 70% based on the paper, while noting it may
        not fit this dataset)

        returns three values
        1. Average_highest_probs: average of the highest meaning probability over all word forms
        2. Percentage_of_converged_words: share of word forms whose highest probability is >= min_con_prob
        3. highest_problist: the list of highest probabilities the two scores were computed from

        Words without any learned referent are ignored; if no word qualifies the
        result is (0.0, 0.0, []).
        """
        return self._summarise_confidence(self.word_referent_probs.values(), min_con_prob)

    def get_comprehension_score_monolingual(self, min_con_prob, language):
        """
        Same as get_comprehension_score, restricted to the words of one language.

        arguments:
        min_con_prob = defines the minimum probability for a word to be considered as learnt
        language = "fr" or "nl"; anything else raises ValueError

        returns three values
        1. Average_highest_probs: average of the highest meaning probability over the language's word forms
        2. Percentage_of_converged_words: share of those word forms whose highest probability is >= min_con_prob
        3. highest_problist: the list of highest probabilities the two scores were computed from
        """
        iterate_mono_words = self._words_for_language(language)
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty entry for every unseen word and break later max() calls.
        return self._summarise_confidence(
            (self.word_referent_probs.get(word) for word in iterate_mono_words), min_con_prob
        )

    # ------------------------------------------------------------------
    # language detection and translation based evaluation
    # ------------------------------------------------------------------

    def get_french_and_dutch_words(self, original):
        '''
        identifies the language of each sentence with the fastText
        language-identification model and collects the words per language

        arguments:
        - original: original unprocessed utterance sentences from corpus

        returns two sets: the distinct words of all sentences labelled French
        and of all sentences labelled Dutch; sentences with any other label are ignored
        '''
        model = self._load_language_model()

        frenchwords = set()
        dutchwords = set()
        total = len(original)

        for position, sentence in enumerate(original):
            print(position, "/", total)
            text = str(sentence)
            label = model.predict(text)[0][0]  # top predicted language label
            if label == '__label__fra_Latn':
                frenchwords.update(text.split())
            elif label == '__label__nld_Latn':
                dutchwords.update(text.split())

        return frenchwords, dutchwords

    def get_translatable_words(self, frenchwords, dutchwords):
        """
        Find the words whose translation into the other language occurs in the corpus.

        arguments:
        -frenchwords: set of all french words determined by the language detection function
        -dutchwords: set of all dutch words determined by the language detection function

        returns a dict mapping each such word to its (lower-cased) translation
        """
        translator = Translator()
        translator.raise_Exception = True
        translatable_words = {}

        l = len(frenchwords) + len(dutchwords)
        p = 0

        directions = (
            (frenchwords, dutchwords, 'fr', 'nl'),
            (dutchwords, frenchwords, 'nl', 'fr'),
        )
        for source_words, target_words, src, dest in directions:
            for word in source_words:
                print(p, "/", l, "words: ", len(translatable_words))
                p += 1
                translation = translator.translate(word, src=src, dest=dest).text.lower()
                if translation in target_words:
                    translatable_words[word] = translation

        return translatable_words

    def most_similar_pairs(self, topn, minoverlap, translatable_words):
        """
        Pair every word with the other word whose top meanings resemble its own most.

        One would expect to see a french word being mapped to its exact dutch
        translation because those should be most similar based on their
        meaning representations.

        arguments:
        -topn: number of most probable meaning representations compared per word
        -minoverlap: minimum number of shared topn meanings for two words to be candidates
        -translatable_words: words to consider (iterated for its keys); words without
         a counterpart in the other language cannot be paired correctly and should be left out

        The similarity of two candidate words is the sum, over their shared topn
        meanings, of the product of both probabilities.

        returns a list of (word, most similar other word) tuples; words without
        any candidate are reported with a printed warning and left out
        """
        # Top-n (referent, probability) tuples per word, most probable first.
        # sorted() is stable, so ties keep insertion order just like picking
        # the maximum repeatedly would; the learned table is not modified.
        limit = max(topn, 0)
        top_meanings = {}
        for word in translatable_words:
            distribution = self.word_referent_probs.get(word) or {}
            ranked = sorted(distribution.items(), key=lambda item: item[1], reverse=True)
            top_meanings[word] = ranked[:limit]
        prob_lookup = {word: dict(top) for word, top in top_meanings.items()}

        most_similar_pairs = []
        for wordform1, own_probs in prob_lookup.items():
            potential_score_list = []
            for wordform2, other_top in top_meanings.items():
                if wordform1 == wordform2:  # prevent comparing same word
                    continue
                shared = [(referent, prob) for referent, prob in other_top if referent in own_probs]
                if len(shared) < minoverlap:
                    continue
                # Multiplication weighs similar probabilities higher than very
                # different ones (0.5*0.5 = 0.25 but 0.8*0.2 = 0.16).
                score = sum(prob * own_probs[referent] for referent, prob in shared)
                potential_score_list.append((wordform2, score))
            if not potential_score_list:
                print("attention! overlap is too high in comparison to topn. " + wordform1 + " does not have a potential pairword")
            else:
                best_partner = max(potential_score_list, key=lambda candidate: candidate[1])
                most_similar_pairs.append((wordform1, best_partner[0]))
        return most_similar_pairs

    def translational_accuracy(self, most_sim_pairs, translatable_words=None):
        '''
        calculates the share of pairs whose second word is the expected
        translation of the first word

        the theory is that a dutch and french word with the same meaning should
        have the most similar meaning distributions and can therefore be mapped
        to each other; a high value indicates that the learned distributions
        capture the contextual data well

        arguments:
        - most_sim_pairs: list of (word, paired word) tuples
        - translatable_words (optional): dict mapping each word to its expected
          translation (as returned by get_translatable_words). When omitted, a
          module-level variable named ``translatable_words`` is used.

        returns the accuracy as a float (0.0 for an empty pair list)
        '''
        total = len(most_sim_pairs)
        if total == 0:
            return 0.0
        if translatable_words is None:
            # Legacy behaviour: earlier versions read the notebook global directly.
            translatable_words = globals().get("translatable_words")
            if translatable_words is None:
                raise ValueError(
                    "translatable_words was not provided and no module-level "
                    "'translatable_words' variable exists"
                )

        trans_count = 0
        for processed, (word, partner) in enumerate(most_sim_pairs, start=1):
            if partner == translatable_words[word]:
                trans_count += 1
            print(processed, "/", total, "processed ", trans_count, "correct")

        return trans_count / total

    def translational_convergence(self, frenchwords, dutchwords):
        '''
        calculates how many words converged on their correct meaning representation

        for every learned word the most probable meaning expression is compared
        with the direct translation of the word into english (the meaning
        expressions of the dataset were built with the same translator, so both
        are expected to match). french words are translated from french, dutch
        words from dutch; words in neither set count as converged if either
        translation matches.

        arguments:
        - frenchwords / dutchwords: sets of words per language

        returns the share of learned words that converged (0.0 if nothing was learned)
        '''
        total = len(self.word_referent_probs)
        if total == 0:
            return 0.0

        translator = Translator()
        translator.raise_Exception = True
        convergence_counter = 0

        for processed, (word, distribution) in enumerate(self.word_referent_probs.items(), start=1):
            most_likely_referent, _ = self._best_referent(distribution)
            if most_likely_referent is not None:
                if word in frenchwords:
                    source_languages = ('fr',)
                elif word in dutchwords:
                    source_languages = ('nl',)
                else:
                    source_languages = ('fr', 'nl')
                # any() stops at the first matching translation.
                if any(
                    translator.translate(word, src=source, dest='en').text.lower() == most_likely_referent
                    for source in source_languages
                ):
                    convergence_counter += 1
            print(processed, "/", total, "processed ", convergence_counter, "correct")

        return convergence_counter / total

    def translational_convergence_monolingual(self, language):
        '''
        calculates how many words of one language converged on their correct meaning representation

        works like translational_convergence but only looks at the corpus words
        detected as belonging to ``language``; the score is divided by the number
        of those words

        arguments:
        - language: tag which language is supposed to be translated, "fr" or "nl"
          (anything else raises ValueError)
        '''
        iterate_mono_words = self._words_for_language(language)
        return self._monolingual_convergence(iterate_mono_words, language)



# Instantiating Model Class object and running code

In [None]:
'''
Bilingual code processing
execute this to receive results when running the model on a bilingual dataset
'''
#define parameters for model
path =                        #place path to bilingual dataframe csv
topn =                        #define number of meaning expressions used to associate dutch and french words to each other(recommendation: 10)
minimum_overlap =             #define minimum overlap in meaning distribution of dutch and french words to associate them to each other(recommendation: 2, should be way smaller than topn)
threshold =                   #define minimum probability assigned to the most probable meaning representation of a word to view it as being learned

words, referents, situations, original, sentences, meanings = load_dataframe(path)

# Create a CrossSituationalLearner instance and learn from situations
learner = CrossSituationalLearner(words, referents)
wordaccovertime, confidenceovertime = learner.learn_from_situations(situations)

#construct sets of french and dutch words for latter functions
frenchwords, dutchwords = learner.get_french_and_dutch_words(original)

#calculate how many words converged on their correct meaning
transcon = learner.translational_convergence(frenchwords, dutchwords)

#filter words for translational accuracy
translatable_words = learner.get_translatable_words(frenchwords, dutchwords)

#construct most similar pairings according to learned probability distributions
most_sim_pairs = learner.most_similar_pairs( topn, minimum_overlap, translatable_words) #define topn and minimum overlap for mapping(check documentation in class definition)

#calculate how many words are paired to their correct translations into french or dutch respectively
transacc = learner.translational_accuracy(most_sim_pairs)
#calculate the average of the highest probability for meaning representations for every word and how many probability go over a predefined threshold
A, P = learner.get_comprehension_score(threshold) #input to function is threshold to be chosen


print(f"""
      Word learning accuracy:
      The percentage of words that have been mapped to their correct meaning representation is: {transcon}
      Comprehension scores:
      The average confidence that the model mapped the correct meaning representation to each word is: {A}
      The percentage of words that exceed a certain probability threshold for their most probable meaning representation is: {P}
      Translational accuracy
      The percentage of words that can be mapped to their translated counterpart in respectively dutch or french based on the learned
      probabilistic meaning representation distribution is: {transacc}
      """)

# Graph inspection: plot the two learning curves returned by the learner.
# One x position per recorded value, counting from 1.
indices = list(range(1, len(wordaccovertime) + 1))

# Both curves share the same x-axis and are drawn in a single figure
plt.figure(figsize=(8, 5))
plt.plot(indices, wordaccovertime, marker='o', linestyle='-', color='b', label='Word learning accuracy')
plt.plot(indices, confidenceovertime, marker='s', linestyle='--', color='r', label='Average Confidence')
plt.xlabel("x1000 iterations")
plt.ylabel("Probabilities")
# Title typo fixed ("anf" -> "and") so the exported figure reads correctly
plt.title("Plot of Word learning accuracy and average confidence over time")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
'''
Monolingual code processing
execute this to receive results when running the model on a monolingual dataset for comparison
'''
#define parameters for model
path =                        #place path to monolingual dataframe csv(dutch or french)
threshold =                   #define minimum probability assigned to the most probable meaning representation of a word to view it as being learned
language =                    #define which language is being used in the monolingual dataset ("nl" or "fr") <- very important


# Load the monolingual dataframe. Of the six returned values only `words`,
# `referents` and `situations` are used further down in this cell.
words, referents, situations, original, sentences, meanings = load_dataframe(path)

# Create a CrossSituationalLearner instance and learn from the situations using the
# monolingual learning routine; `language` selects the dataset language.
# The two returned sequences are plotted at the end of this cell.
learner = CrossSituationalLearner(words, referents)
wordaccovertime, confidenceovertime = learner.learn_from_situations_monolingual(situations, language)

# Calculate translational convergence for the monolingual data
transconmon = learner.translational_convergence_monolingual(language)

# Calculate the average of the highest meaning-representation probability per word (A) and the
# share of words whose highest probability exceeds the predefined threshold (P).
# NOTE(review): two values are unpacked here, while the statistics cell further down unpacks
# three values from the same method -- confirm the method's return arity.
A, P = learner.get_comprehension_score_monolingual(threshold, language) #input to the function is the chosen threshold


# Report the metrics computed above
print(f"""
      Word learning accuracy:
      The percentage of words that have been mapped to their correct meaning representation is: {transconmon}
      Comprehension scores:
      The average confidence that the model mapped the correct meaning representation to each word is: {A}
      The percentage of words that exceed a certain probability threshold for their most probable meaning representation is: {P}
      """)

# Graph inspection: plot the two learning curves returned by the learner.
# One x position per recorded value, counting from 1.
indices = list(range(1, len(wordaccovertime) + 1))

# Both curves share the same x-axis and are drawn in a single figure
plt.figure(figsize=(8, 5))
plt.plot(indices, wordaccovertime, marker='o', linestyle='-', color='b', label='Word learning accuracy')
plt.plot(indices, confidenceovertime, marker='s', linestyle='--', color='r', label='Average Confidence')
plt.xlabel("x1000 iterations")
plt.ylabel("Probabilities")
# Title typo fixed ("anf" -> "and") so the exported figure reads correctly
plt.title("Plot of Word learning accuracy and average confidence over time")
plt.legend()
plt.grid(True)
plt.show()

# Statistical analysis of model results

In [None]:
#additional statistical analysis
#chi-squared test of independence, followed by pairwise two-proportion z-tests
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest

# Observed outcomes per condition, each stored as [correct, total]
bilingual = [2256, 6320]
french = [986, 2596]
dutch = [1262, 3085]

# Chi-squared test of independence.
# The contingency table has one row per condition and the columns
# [correct, incorrect], where incorrect = total - correct.
contingency_table = np.array(
    [[correct, total - correct] for correct, total in (bilingual, french, dutch)]
)

chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
print("Chi-squared test:")
print(f"Chi2 Stat = {chi2_stat}, p-value = {p_value}, Degrees of Freedom = {dof}")

# Pairwise two-proportion z-tests
# Function to perform the z-test
def perform_ztest(success_a, size_a, success_b, size_b):
    """Run a two-proportion z-test comparing group A with group B.

    Args:
        success_a: Number of successes counted in group A.
        size_a: Number of observations in group A.
        success_b: Number of successes counted in group B.
        size_b: Number of observations in group B.

    Returns:
        The pair ``(stat, pval)`` unpacked from ``proportions_ztest``.
    """
    z_value, probability = proportions_ztest(
        np.array([success_a, success_b]),
        np.array([size_a, size_b]),
    )
    return z_value, probability

print("\nPairwise two-proportion z-tests:")

# Every group list holds exactly [correct, total], so unpacking two of them
# yields the (success_a, size_a, success_b, size_b) arguments in order.

# Bilingual compared with French
z_stat, p_val = perform_ztest(*bilingual, *french)
print(f"Bilingual vs French: Z-stat = {z_stat}, p-value = {p_val}")

# Bilingual compared with Dutch
z_stat, p_val = perform_ztest(*bilingual, *dutch)
print(f"Bilingual vs Dutch: Z-stat = {z_stat}, p-value = {p_val}")

# French compared with Dutch
z_stat, p_val = perform_ztest(*french, *dutch)
print(f"French vs Dutch: Z-stat = {z_stat}, p-value = {p_val}")

In [None]:
'''
load and learn all dataframes to gather highest probability list of all cases(bilingual and monolingual:nl and fr)
for statistical analysis below
'''
# NOTE(review): this cell rebinds `bilingual`, `french` and `dutch`, which the chi-squared
# cell above defines as [correct, total] count pairs. The ANOVA cells below read whatever
# these names currently hold, so this cell has to run right before them.
# NOTE(review): three values are unpacked from get_comprehension_score /
# get_comprehension_score_monolingual here, while the processing cells above unpack only
# two (A, P) -- confirm the return arity of both methods.
# NOTE(review): the French and Dutch data are trained with learn_from_situations here,
# whereas the monolingual processing cell uses learn_from_situations_monolingual(situations,
# language) -- confirm this difference is intentional.

#bilingual
#load bilingual dataframe (hard-coded path)
words, referents, situations, original, sentences, meanings = load_dataframe("/content/input_data.csv")
# Create a CrossSituationalLearner instance and learn from situations
learner = CrossSituationalLearner(words, referents)
learner.learn_from_situations(situations)
#keep the third return value (presumably the highest-probability list -- verify) for the
#statistical analysis below; `a` and `b` are not used afterwards in the visible cells
a,b,bilingual = learner.get_comprehension_score(0.1)


#french
#load french dataframe (hard-coded path)
words, referents, situations, original, sentences, meanings = load_dataframe("/content/processed_french.csv")
# Create a fresh CrossSituationalLearner instance and learn from situations
learner = CrossSituationalLearner(words, referents)
learner.learn_from_situations(situations)
#keep the third return value for French for the statistical analysis below
a,b,french = learner.get_comprehension_score_monolingual(0.1, "fr")


#dutch
#load dutch dataframe (hard-coded path)
words, referents, situations, original, sentences, meanings = load_dataframe("/content/processed_dutch.csv")
# Create a fresh CrossSituationalLearner instance and learn from situations
learner = CrossSituationalLearner(words, referents)
learner.learn_from_situations(situations)
#keep the third return value for Dutch for the statistical analysis below
a,b,dutch = learner.get_comprehension_score_monolingual(0.1,"nl")
#print the three collected results for inspection (order: dutch, bilingual, french)
print(dutch)
print(bilingual)
print(french)

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150

In [None]:
#avg confidence

import scipy.stats as stats


# One-way ANOVA across the bilingual, French and Dutch samples
f_stat, p_value = stats.f_oneway(bilingual, french, dutch)

# Report the test statistic and its p-value
print("One-Way ANOVA:")
print(f"F-statistic = {f_stat}, p-value = {p_value}")

# Interpretation at the 0.05 significance level
print(
    "The means of the groups are significantly different (reject null hypothesis)."
    if p_value < 0.05
    else "The means of the groups are not significantly different (fail to reject null hypothesis)."
)

One-Way ANOVA:
F-statistic = 42.39115212752479, p-value = 7.699400359472245e-19
The means of the groups are significantly different (reject null hypothesis).


In [None]:
#added effect size


# Gather the three samples once; the ANOVA and the effect size both use them
groups = [bilingual, french, dutch]
n_groups = len(groups)

# One-way ANOVA across the three groups
f_stat, p_value = stats.f_oneway(*groups)

# Effect size for the ANOVA: eta-squared = SS_between / SS_total,
# with both sums of squares taken around the mean of all pooled values
grand_mean = np.mean(np.concatenate(groups))
ss_between = sum(
    len(sample) * (np.mean(sample) - grand_mean) ** 2
    for sample in groups
)
ss_total = sum(
    (value - grand_mean) ** 2
    for sample in groups
    for value in sample
)
eta_squared = ss_between / ss_total

print("One-Way ANOVA:")
print(f"F-statistic = {f_stat}, p-value = {p_value}")
print(f"Effect Size (Eta-squared) = {eta_squared}")

# Graphing the results

In [None]:
'''
Graphing the results over time
'''

def _plot_three_series(series, labels, ylabel, title):
    """Draw up to three value series over a shared, 1-based x-axis and show the figure.

    Args:
        series: Sequence of value lists; each list is plotted as one line.
        labels: Legend labels, one per entry of ``series``.
        ylabel: Text for the y-axis.
        title: Figure title.
    """
    plt.figure(figsize=(10, 6))
    # Each series gets x positions 1..len(series), so lists of different
    # lengths simply end at different points on the shared axis.
    for values, label, marker in zip(series, labels, ('o', 's', '^')):
        plt.plot(range(1, len(values) + 1), values, label=label, marker=marker)
    plt.ylim(0, 1)  # Set y-axis scale
    plt.xlabel('Iterations over time x 1000')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_confidence_score(list1, list2, list3, label1="List 1", label2="List 2", label3="List 3"):
    """
    Plots a graph for average confidence scores over iterations.

    Args:
        list1, list2, list3: Three lists of floats representing confidence scores.
        label1, label2, label3: Names for the lines corresponding to the lists.
    """
    _plot_three_series(
        (list1, list2, list3),
        (label1, label2, label3),
        ylabel='Average Confidence Score',
        title='Average Confidence Score vs Iterations',
    )


def plot_word_learning_accuracy(list1, list2, list3, label1="List 1", label2="List 2", label3="List 3"):
    """
    Plots a graph for word learning accuracy over iterations.

    Args:
        list1, list2, list3: Three lists of floats representing word learning accuracy.
        label1, label2, label3: Names for the lines corresponding to the lists.
    """
    _plot_three_series(
        (list1, list2, list3),
        (label1, label2, label3),
        ylabel='Word Learning Accuracy',
        title='Word Learning Accuracy vs Iterations',
    )

# Hard-coded curve values obtained from the calculations above.
# Per the labels passed to the plot calls below: list1 = French, list2 = Dutch, list3 = Bilingual.
# The three series have different lengths (10, 14 and 9 values).
confidence_list1 = [0.15586921056325656, 0.1528592670578546, 0.15107599105605551, 0.151336575548974, 0.15479930626967395, 0.15920377072046082, 0.1598625380686805, 0.1589173565944861, 0.15700768817241156, 0.1571027135774516]
confidence_list2 = [0.22186634080272258, 0.20054334314797997, 0.20145126234695515, 0.19653642120185344, 0.19511762643661254, 0.19117510608218558, 0.1851703249119688, 0.18346618182012286, 0.18059885291705888, 0.17881453924361274, 0.17789162023197588, 0.17536063658538698, 0.17330474958718678, 0.17407898724565782]
confidence_list3 = [0.19905058513976295, 0.19590945048934788, 0.19262208103312287, 0.19009614372194428, 0.19052562032176845, 0.18815910916190592, 0.18760495184823023, 0.1877544261553584, 0.18780179236971684]

# Word learning accuracy series, in the same French / Dutch / Bilingual order
accuracy_list1 = [0.09437596302003082, 0.13906009244992296, 0.187211093990755, 0.2261171032357473, 0.26040061633281975, 0.28582434514637906, 0.31394453004622497, 0.33705701078582434, 0.3578582434514638, 0.3713405238828968]
accuracy_list2 = [0.0813614262560778, 0.12155591572123177, 0.1487844408427877, 0.1880064829821718, 0.21815235008103728, 0.24376012965964344, 0.2680713128038898, 0.29076175040518637, 0.31118314424635335, 0.3293354943273906, 0.34846029173419774, 0.3646677471636953, 0.38055105348460294, 0.3948136142625608]
accuracy_list3 = [0.2666139240506329, 0.3000525486074619, 0.3117476131174761, 0.3218390804597701, 0.3375877472878111, 0.3404255319148936, 0.34316644113667116, 0.3509039979628215, 0.3540658276863504]

# Plot the graphs with custom labels (one figure per metric)
plot_confidence_score(confidence_list1, confidence_list2, confidence_list3,
                      label1="French", label2="Dutch", label3="Bilingual")
plot_word_learning_accuracy(accuracy_list1, accuracy_list2, accuracy_list3,
                            label1="French", label2="Dutch", label3="Bilingual")
