In [47]:
from collections import Counter
from pyconll.unit.token import Token
from pyconll.unit.sentence import Sentence
import typing

In [62]:
def sort_dic(dic: dict[str, int]) -> dict[str, int]:
    """
    Builds a new dictionary whose items are ordered by value, largest first.

    Parameters:
    - dic (dict): A dictionary mapping keys to sortable values (e.g. counts).

    Returns:
    - dict: A new dictionary holding the same items, inserted in descending order of value.

    Notes:
    - The sort is stable: items with equal values keep their original relative order.
    - The dictionary passed in is left untouched.
    """

    def by_value(pair):
        # pair is a (key, value) tuple coming from dict.items().
        return pair[1]

    ranked_items = sorted(dic.items(), key=by_value, reverse=True)
    return dict(ranked_items)

In [49]:
class Triplet:
    """
    A class representing a linguistic triplet composed of three elements (upos1, dep, upos2) in a sentence.
    
    Attributes:
    - upos1 (str): The Universal Part-of-Speech (upos) tag for the first element.
    - dep (str): The dependency relation between the first and second elements.
    - upos2 (str): The Universal Part-of-Speech (upos) tag for the second element.
    - upos1list (list): A list to store lemmas corresponding to the first element in each occurrence of the triplet.
    - upos2list (list): A list to store lemmas corresponding to the second element in each occurrence of the triplet.
    - upos1counter (Counter): A Counter object to count occurrences of lemmas for the first element.
    - upos2counter (Counter): A Counter object to count occurrences of lemmas for the second element.
    - counter (int): A counter to keep track of the number of occurrences of the triplet.
    - sentences (list): A list to store sentences in which the triplet occurs.
                        A sentence of ith index corresponds to the ith elements of the upos1list and upos2list lists.
    - name (str): A string representing the name of the triplet formed by joining upos1, dep, and upos2.
    """
    
    def __init__(self, upos1:str, dep:str, upos2:str) -> None:
        """
        Constructor method to initialize a Triplet instance.

        Parameters:
        - upos1 (str): The Universal Part-of-Speech (upos) tag for the first element.
        - dep (str): The dependency relation between the first and second elements.
        - upos2 (str): The Universal Part-of-Speech (upos) tag for the second element.

        Returns:
        - None
        """
        
        self.sentences = []
        self.upos1 = upos1
        self.dep = dep
        self.upos2 = upos2
        self.upos1list = []
        self.upos2list = []
        self.upos1counter = Counter()
        self.upos2counter = Counter()
        self.counter = 0
        self.name = "-".join([self.upos1, self.dep, self.upos2])
        
    def update_triplet(self, token1:Token, token2:Token, sentence:str) -> None:
        """
        Updates the triplet with information from tokens and the corresponding sentence.

        Parameters:
        - token1 (Token): An object representing the first token in the triplet.
        - token2 (Token): An object representing the second token in the triplet.
        - sentence (str): An object representing the sentence containing the triplet.

        Returns:
        - None

        Notes:
        - Updates the triplet only if the provided tokens match the expected upos and dep.
        - Adds lemmas, updates counters, and records the sentence if the triplet is successfully updated.
        """
        
        flag = 0
        if token1.upos == self.upos1:
            flag+=1
        else:
            print(f"This is {self.name} Triplet but your 1st token is a {token1.upos}.\nThe Triplet was not updated.")
        if token1.deprel == self.dep:    
            flag+=1
        else: 
            print(f"This is {self.name} Triplet but your 1st token's dependency is {token2.deprel}.\nThe Triplet was not updated.")
        if token2.upos == self.upos2:    
            flag+=1
        else: 
            print(f"This is {self.name} Triplet but your 2nd token is a {token2.upos}.\nThe Triplet was not updated.")
                
        if flag == 3:
            self.upos1list.append(token1.lemma)
            self.upos2list.append(token2.lemma)
            
            self.upos1counter.update([token1.lemma])
            self.upos2counter.update([token2.lemma])
            
            self.sentences.append(sentence.text)
            self.counter+=1
            
    def most_common(self, num:int, upos:str | int) -> list[str]:
        """
        Returns a list of the most common lemmas for the specified upos.

        Parameters:
        - num (int): The number of most common lemmas to retrieve.
        - upos (int or str): The element (1 or 2) or the upos name for which to retrieve the most common lemmas.

        Returns:
        - list: A list of the most common lemmas.

        Notes:
        - Handles errors for invalid upos values.
        """
        
        if num>0:
            commons = []
            if upos == self.upos1 or upos == 1:
                for tuple_pair in self.upos1counter.most_common(num):
                    commons.append(tuple_pair[0])
                return commons
            elif upos == self.upos2 or upos == 2:
                for tuple_pair in self.upos2counter.most_common(num):
                    commons.append(tuple_pair[0])
                return commons
            else:
                print(f"The upos {upos} is not in {self.name} tuple, or there is no such position.")
                
    def related_words(self, word:str, upos:str | int, num:int = 0) -> list[str]:
        """
        Returns a list of related words for the specified lemma and element (1 or 2).

        Parameters:
        - word (str): The lemma for which to find related words.
        - upos (int or str): The element (1 or 2) or the upos name for which to find related words.
        - num (int): The number of related words to retrieve (default is 0 for all).

        Returns:
        - list: A list of related words for the specified lemma and element.

        Notes:
        - Uses a dictionary to count related words and sorts them in descending order.
        - Handles errors for invalid upos values.
        """
        
        related={}
        if upos == self.upos1 or upos == 1:
            for i, lemma in enumerate(self.upos1list):
                if word == lemma:
                    match = self.upos2list[i]
                    if match in related:
                        related[match]+=1
                    else:
                        related[match]=1
        
        if upos == self.upos2 or upos == 2:
            for i, lemma in enumerate(self.upos2list):
                if word == lemma:
                    match = self.upos1list[i]
                    if match in related:
                        related[match]+=1
                    else:
                        related[match]=1
                    
        related = sort_dic(related) # returns a dictionary with descending values.
        all_related = list(related.keys())
        
        # if a specific number of words was asked to return, instead of all of the list
        return all_related[:num] if num > 0 else all_related
    
    def retrieve_sentences(self, word1:str, word2:str) -> list[str]:
        """
        Retrieves sentences where the specified lemmas for elements 1 and 2 occur together.

        Parameters:
        - word1 (str): The lemma for the first element.
        - word2 (str): The lemma for the second element.

        Returns:
        - list: A list of sentences where the specified lemmas for elements 1 and 2 occur together.

        Notes:
        - Retrieves sentences based on matching lemmas for both elements.
        """
        sent = []
        for i in range(self.counter):
            if self.upos1list[i] == word1 and self.upos2list[i] == word2:
                sent.append(self.sentences[i])
        return sent

In [50]:
class Root_triplet(Triplet):
    """
    A subclass of Triplet representing a linguistic triplet with a fixed 'NONE' value for the second element (upos2).

    Attributes:
        - Same as Triplet, except that upos2 is "NONE" and that upos2list and upos2counter are None.
    
    """
    def __init__(self, upos1:str, dep:str) -> None:
        """
        Constructor method to initialize a Root_triplet instance.
        Its upos2, upos2list and upos2counter are initialized as "NONE", None and None, respectively.

        Parameters:
        - upos1 (str): The Universal Part-of-Speech (upos) tag for the first element.
        - dep (str): The dependency relation of the first element.

        Returns:
        - None
        """
        
        super().__init__(upos1, dep, "NONE")
        self.upos2list = None
        self.upos2counter = None
        
    def update_triplet(self, token:Token, sentence:Sentence) -> None:
        """
        Records one occurrence of the Root_triplet, taken from a token and its sentence.

        Parameters:
        - token (Token): The token; its upos, deprel and lemma attributes are read.
        - sentence (Sentence): The sentence containing the token; its text attribute is stored.

        Returns:
        - None

        Notes:
        - The occurrence is recorded only if token.upos and token.deprel match upos1 and dep.
        - Every mismatch is reported with its own printed message; both checks always run.
        - On success the lemma is appended, the counter objects are updated and the sentence text is stored.
        """
        
        matches = True
        if token.upos != self.upos1:
            print(f"This is {self.name} triplet but your 1st token is a {token.upos}.\nThe triplet was not updated.")
            matches = False
        if token.deprel != self.dep:
            print(f"This is {self.name} triplet but your 1st token's dependency is {token.deprel}.\nThe triplet was not updated.")
            matches = False
        
        if not matches:
            return
        
        self.upos1list.append(token.lemma)
        self.upos1counter.update([token.lemma])
        
        self.counter += 1
        self.sentences.append(sentence.text)
            
    def most_common(self, num:int, upos:str | int = 1) -> list[str] | None:
        """
        Returns a list of the most common lemmas for the single upos in the triplet.

        Parameters:
        - num (int): The number of most common lemmas to retrieve.
        - upos (int or str): The element (1) or the name of upos1 (default is 1).

        Returns:
        - list: Up to num lemmas, most frequent first, as returned by the parent method.
        - None: If upos designates anything other than the first element (a message is printed),
                or if the parent method returns None.

        Notes:
        - Uses the parent method, always for element 1.
        """
        
        # Accept the upos name as well as position 1, as the parent class does.
        if upos != 1 and upos != self.upos1:
            print(f"{self.name} Triplet has only one upos")
            return None
        return super().most_common(num, 1)
    
    def related_words(self, word:str, upos:str | int, num:int = 0) -> None:
        """
        Prints a message indicating that Root_triplet does not have related words.

        Parameters:
        - word (str): Ignored for Root_triplet.
        - upos (int or str): Ignored for Root_triplet.
        - num (int): Ignored for Root_triplet; accepted so that the signature matches the parent method.

        Returns:
        - None
        """
        
        print("This is a root triplet and does not have related words")
        
    def retrieve_sentences(self, word1:str, word2:str | None = None) -> list[str]:
        """
        Retrieves sentences where the specified lemma for element 1 occurs.

        Parameters:
        - word1 (str): The lemma for the first element.
        - word2 (str): Ignored for Root_triplet; may be omitted.

        Returns:
        - list: The stored sentence texts of every occurrence whose lemma is word1, in recording order.
        """
        
        return [text for lemma, text in zip(self.upos1list, self.sentences) if lemma == word1]

In [51]:
def get_triplet(all_triplets:list[Triplet], token1:Token, token2:Token) -> Triplet:
    """
    Looks up the Triplet that corresponds to a pair of tokens, creating and registering it when missing.

    Parameters:
    - all_triplets (List[Triplet]): The list of known Triplet instances; it may be extended in place.
    - token1 (Token): The first token; its upos and deprel attributes are read.
    - token2 (Token): The second token; its upos attribute is read.

    Returns:
    - Triplet: The first element of all_triplets whose (upos1, dep, upos2) equals
               (token1.upos, token1.deprel, token2.upos), or a newly created Triplet with those values.

    Notes:
    - The lookup is a linear scan over all_triplets.
    - A newly created Triplet is appended to all_triplets before being returned.
    """
    
    wanted = (token1.upos, token1.deprel, token2.upos)
    existing = next(
        (candidate for candidate in all_triplets
         if (candidate.upos1, candidate.dep, candidate.upos2) == wanted),
        None,
    )
    if existing is not None:
        return existing
    
    created = Triplet(*wanted)
    all_triplets.append(created)
    return created

In [52]:
def get_root_triplet(all_triplets:list[Triplet], token:Token) -> Triplet:
    """
    Looks up the Root_triplet that corresponds to a token, creating and registering it when missing.

    Parameters:
    - all_triplets (List[Triplet]): The list of known triplets (Triplet and Root_triplet); it may be extended in place.
    - token (Token): The token; its upos and deprel attributes are read.

    Returns:
    - Root_triplet: The first element of all_triplets whose upos1 and dep equal token.upos and
                    token.deprel and whose upos2 is "NONE", or a newly created Root_triplet.

    Notes:
    - The lookup is a linear scan over all_triplets.
    - A newly created Root_triplet is appended to all_triplets before being returned.
    """
    
    wanted = (token.upos, token.deprel, "NONE")
    existing = next(
        (candidate for candidate in all_triplets
         if (candidate.upos1, candidate.dep, candidate.upos2) == wanted),
        None,
    )
    if existing is not None:
        return existing
    
    created = Root_triplet(token.upos, token.deprel)
    all_triplets.append(created)
    return created

In [53]:
def get_tokens(sentence:Sentence) -> list[Token]:
    """
    Retrieves tokens from a sentence object, excluding multi-index tokens.
    
    Parameters:
    - sentence: A Sentence object.

    Returns:
    - List[Token]: A list of Token objects.

    Notes:
    - Iterates through the tokens in the sentence.
    - Excludes tokens with multi-index (those containing '-') since their attributes are set to None.
    - Returns a list of valid Token objects.
    """
        
    tokens = [None]
    for token in sentence:
        if not token.id.__contains__("-"):
            tokens.append(token)
    return tokens

In [54]:
def is_root(token:Token) -> bool:
    """
    Tells whether a Token is the root of its dependency tree.

    Parameters:
    - token (Token): The token to inspect; only its head attribute is read.

    Returns:
    - bool: True if token.head equals the string '0', False otherwise.
    """
    
    return token.head == '0'

In [55]:
def find_triplet(upos1:str, dep:str, upos2:str, all_triplets:list[Triplet]) -> Triplet:
    """
    Looks up the Triplet with the given upos1, dep, and upos2 values.

    Parameters:
    - upos1 (str): The Universal Part-of-Speech (upos) tag for the first element.
    - dep (str): The dependency relation between the first and second elements.
    - upos2 (str): The Universal Part-of-Speech (upos) tag for the second element.
    - all_triplets (List[Triplet]): The list of triplets to search.

    Returns:
    - Triplet: The first element of all_triplets whose upos1, dep, and upos2 all match.
    - None: If nothing matches; a message is printed in that case.

    Notes:
    - The lookup is a linear scan that stops at the first match.
    """
    
    wanted = (upos1, dep, upos2)
    found = next(
        (candidate for candidate in all_triplets
         if (candidate.upos1, candidate.dep, candidate.upos2) == wanted),
        None,
    )
    if found is None:
        print("No such triplet exists.")
    return found

In [56]:
# Sort the triplets alphabetically by deprel and then by the number of occurrences that each triplet has recorded (i.e. its frequency).
# The minus sign gives the correct descending order for the integer `counter` attribute of each triplet.
# Larger numbers come first because their negatives are lower than those of the smaller ones.

def sort_triplets(triplets:list[Triplet]) -> list[Triplet]:
    """
    Orders triplets alphabetically by dependency relation (deprel), then by frequency.

    Parameters:
    - triplets (List[Triplet]): The triplets to order; the list itself is not modified.

    Returns:
    - List[Triplet]: A new list ordered by:
        1. dep, ascending alphabetical order.
        2. counter, descending (most frequent first) among triplets sharing the same dep.

    Notes:
    - The sort is stable: triplets with equal dep and counter keep their original relative order.
    """
    
    def rank(tpl):
        # Negating the count turns the ascending sort into a descending one for that field.
        return (tpl.dep, -tpl.counter)
    
    ordered = list(triplets)
    ordered.sort(key=rank)
    return ordered

In [57]:
# sort_triplets_new is inefficient because it runs the related_words method for each triplet just for the length of the list that it returns.
# The related_words method has to be run again later in order to print the related words.

def sort_triplets_new(triplets:list[Triplet], word:str, upos:str) -> list[Triplet]:
    """
    Orders triplets by how many related words they hold for a given lemma, then by dependency relation.

    Parameters:
    - triplets (List[Triplet]): The triplets to order; the list itself is not modified.
    - word (str): The lemma for which to count related words.
    - upos (str): The element designation forwarded unchanged to each triplet's related_words method.

    Returns:
    - List[Triplet]: A new list ordered by:
        1. The number of related words for the lemma, descending.
        2. dep, ascending alphabetical order.

    Notes:
    - related_words is called once per triplet, only to measure the length of its result.
    - The sort is stable: triplets that tie on both criteria keep their original relative order.
    """
    
    def rank(tpl):
        related_count = len(tpl.related_words(word, upos))
        # Negating the count turns the ascending sort into a descending one for that field.
        return (-related_count, tpl.dep)
    
    ordered = list(triplets)
    ordered.sort(key=rank)
    return ordered

In [58]:
def print_lists_in_dic(dic:dict) -> None:
    """
    Prints the content of a dictionary whose values are lists.

    Parameters:
    - dic (dict): A dictionary with keys mapping to iterables of values.

    Returns:
    - None

    Notes:
    - For each key, prints the key followed by a colon on its own line.
    - Each value of the corresponding list is then printed followed by a colon and an empty line.
    """
    
    for heading, entries in dic.items():
        # With sep="\n", every piece ends up on its own line, exactly as separate print calls would.
        print(f"{heading}:", *[f"{entry}:\n" for entry in entries], sep="\n")

In [59]:
def search_in_upos1(word:str, upos:str, all_triplets:list[Triplet]) -> None:
    """
    Searches for occurrences of a specified lemma in the upos1 position of Triplets with a non-'NONE' upos2.

    Parameters:
    - word (str): The lemma to search for.
    - upos (str): The Universal Part-of-Speech (upos) tag for which to search.
    - all_triplets (List[Triplet]): A list containing instances of the Triplet class.

    Returns:
    - None

    Notes:
    - Keeps the triplets whose upos1 matches the specified upos, whose upos2 is not 'NONE'
      and whose upos1list contains the lemma.
    - Sorts them by the number of related words and by deprel using sort_triplets_new.
    - Prints the count of results and details for up to 5 triplets, including related words and sentences.
    """
    
    word_in_upos1 = [
        tpl for tpl in all_triplets
        if tpl.upos1 == upos and tpl.upos2 != "NONE" and word in tpl.upos1list
    ]
    
    # Rank by position (1) rather than by upos name: for a triplet whose upos1 and upos2 are
    # the same tag, the name would also count partners of the word in the 2nd position,
    # which is not what gets printed below.
    word_in_upos1 = sort_triplets_new(word_in_upos1, word, 1)
    
    print(f"{len(word_in_upos1)} results where {word} is in the 1st position were found.")
    
    for tpl in word_in_upos1[:5]: # just to limit the printed results
        print(f"upos 1: {tpl.upos1}, upos 2: {tpl.upos2}, deprel: {tpl.dep}")
        related = tpl.related_words(word, 1)
        print(related, "\n")
        word_sentences_dic = {match: tpl.retrieve_sentences(word, match) for match in related}
        print_lists_in_dic(word_sentences_dic)

In [60]:
def search_in_upos2(word:str, upos:str, all_triplets:list[Triplet]) -> None:
    """
    Searches for occurrences of a specified lemma in the upos2 position of Triplet instances with a non-'NONE' upos2.

    Parameters:
    - word (str): The lemma to search for.
    - upos (str): The Universal Part-of-Speech (upos) tag for which to search.
    - all_triplets (List[Triplet]): A list containing instances of the Triplet class.

    Returns:
    - None

    Notes:
    - Keeps the triplets whose upos2 matches the specified upos, is not 'NONE',
      and whose upos2list contains the lemma.
    - Sorts them by the number of related words and by deprel using sort_triplets_new.
    - Prints the count of results and details for up to 5 triplets, including related words and sentences.
    """
    
    word_in_upos2 = [
        tpl for tpl in all_triplets
        if tpl.upos2 == upos and tpl.upos2 != "NONE" and word in tpl.upos2list
    ]
    
    # Rank by position (2) rather than by upos name: for a triplet whose upos1 and upos2 are
    # the same tag, the name would also count partners of the word in the 1st position,
    # which is not what gets printed below.
    word_in_upos2 = sort_triplets_new(word_in_upos2, word, 2)
    
    print(f"{len(word_in_upos2)} results where {word} is in the 2nd position were found.")
    
    for tpl in word_in_upos2[:5]: # just to limit the printed results
        print(f"upos 1: {tpl.upos1}, upos 2: {tpl.upos2}, deprel: {tpl.dep}")
        related = tpl.related_words(word, 2)
        print(related, "\n")
        word_sentences_dic = {match: tpl.retrieve_sentences(match, word) for match in related}
        print_lists_in_dic(word_sentences_dic)

In [61]:
def search(word:str, upos:str, all_triplets:list[Triplet]) -> None:
    """
    Looks for a lemma in both positions (upos1 and upos2) of the known triplets and prints the results.

    Parameters:
    - word (str): The lemma to search for.
    - upos (str): The Universal Part-of-Speech (upos) tag for which to search, in any letter case.
    - all_triplets (List[Triplet]): A list containing instances of the Triplet class.

    Returns:
    - None

    Notes:
    - The upos tag is converted to uppercase before searching.
    - Runs search_in_upos1 first, then search_in_upos2, with the same arguments.
    """
    
    normalized_upos = upos.upper()
    for lookup in (search_in_upos1, search_in_upos2):
        lookup(word, normalized_upos, all_triplets)