# Pyrus

## Modules

### STRING MANAGER

This module provides a series of operations that can be used to manage string data.

#### string_normaliser

This function normalises string data by performing several operations:
- Converting all characters to lowercase
- Removing leading/trailing white space
- Removing double spaces
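- Removing special characters (anything that is not a letter, digit or whitespace)
- Optionally (`normalise_encoding=True`) replacing accented characters with their ASCII equivalents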

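Args:
    string (str): The string to be normalised.
    normalise_encoding (bool): If True, accented characters are replaced with their ASCII equivalents. Defaults to False.

Returns:
    normalised_string (str): The normalised string data.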

In [1]:
from unidecode import unidecode
import unicodedata
import regex as re
import string

# The standard library's string of ASCII whitespace characters.
# NOTE(review): presumably bound at module level because the `string`
# parameter of string_normaliser shadows the `string` module inside it.
whitespace = string.whitespace

def string_normaliser(string, normalise_encoding=False):
    """Normalise a string so that equivalent inputs compare equal.

    The steps are applied in this order:

    1. Lowercase the whole string.
    2. Optionally decompose it (NFKD) and transliterate it to ASCII.
    3. Delete every character that is not a letter, a digit or whitespace
       (this includes '@' and '#').
    4. Collapse runs of two or more spaces into a single space.
    5. Strip leading and trailing whitespace.

    Stripping is done last on purpose: deleting a special character at
    either end of the string can expose whitespace that was previously in
    the interior (e.g. "Hello World !" would otherwise end in a space).

    Args:
        string (str): The string to be normalised.
        normalise_encoding (bool): When True, accented characters are
            replaced with their ASCII equivalents. Defaults to False.

    Returns:
        str: The normalised string.
    """
    normalised_string = string.lower()

    if normalise_encoding:
        # Transform the string into its canonical decomposed representation,
        # then transliterate whatever is left into plain ASCII.
        decomposed_string = unicodedata.normalize('NFKD', normalised_string)
        normalised_string = unidecode(decomposed_string)

    # Remove special characters. `re` is the third-party `regex` package in
    # this file, which is what makes the \p{L} (any Unicode letter) class
    # available.
    normalised_string = re.sub(r'[^\p{L}\s\d]', '', normalised_string)

    # Collapse double, triple and longer runs of spaces into one space.
    normalised_string = re.sub(r' {2,}', ' ', normalised_string)

    # Trim last, so that whitespace exposed by the removals above is dropped.
    return normalised_string.strip()

##### string_normaliser: tests

In [2]:
import unittest

class StringNormaliserTests(unittest.TestCase):
    """Table-driven tests for string_normaliser.

    Every test builds a list of (input, expected_output) pairs and hands it
    to subtester, which checks each pair in its own sub-test.
    """

    def subtester(self, test_values, **normaliser_kwargs):
        """Check string_normaliser against a table of expectations.

        Args:
            test_values: Iterable of (value, expected_result) pairs.
            **normaliser_kwargs: Extra keyword arguments forwarded to
                string_normaliser (e.g. normalise_encoding=True).
        """
        for value, expected_result in test_values:
            with self.subTest(value=value, **normaliser_kwargs):
                result = string_normaliser(value, **normaliser_kwargs)
                self.assertEqual(result, expected_result)

    def test_case_normalisation(self):
        test_values = [
            ("Hello World!", "hello world"),
            ("ThIs Is A MiXeD CaSe StRiNg", "this is a mixed case string"),
            ("Áccéntéd Cháráctérs", "áccéntéd cháráctérs"),
            ("ALL UPPERCASE", "all uppercase"),
            ("", ""),  # Empty string should remain the same
        ]

        self.subtester(test_values)

    def test_whitespace_normalisation(self):
        test_values = [
            ("   Remove  extra  spaces   ", "remove extra spaces"),
            ("  Leading and trailing spaces  ", "leading and trailing spaces"),
            ("    ", ""),  # All whitespace, expect empty string
            ("", "")
        ]

        self.subtester(test_values)

    def test_double_space_normalisation(self):
        test_values = [
            ("This  has  double  spaces", "this has double spaces"),
            ("No  double  spaces", "no double spaces"),
            ("Single spaces", "single spaces"),
            ("", ""),  # Empty string should remain the same
        ]

        self.subtester(test_values)

    def test_encoding_normalisation(self):
        # Default behaviour: accented characters are kept as they are.
        test_values = [
            ("Thís Štríng Hás Áccénted Characters", "thís štríng hás áccénted characters"),
            ("Ünicöde Äscii Êncoding", "ünicöde äscii êncoding"),
            ("Keep 1234567890 digits", "keep 1234567890 digits"),
            ("", ""),  # Empty string should remain the same
        ]

        self.subtester(test_values)

        # With normalise_encoding=True the accents are folded to ASCII.
        ascii_test_values = [
            ("Thís Štríng Hás Áccénted Characters", "this string has accented characters"),
            ("Ünicöde Äscii Êncoding", "unicode ascii encoding"),
            ("Keep 1234567890 digits", "keep 1234567890 digits"),
            ("", ""),  # Empty string should remain the same
        ]

        self.subtester(ascii_test_values, normalise_encoding=True)

    def test_special_char_normalisation(self):
        test_values = [
            ("Hello@# World!", "hello world"),
            ("Remove !@#$ special %^&* characters", "remove special characters"),
            ("Keep digits 1234567890", "keep digits 1234567890"),
            ("", ""),  # Empty string should remain the same
        ]

        self.subtester(test_values)

    def test_combined_normaisation(self):
        test_values = [
            # Normal test values
            ("Hello World!", "hello world"),
            ("   Remove  extra  spaces   ", "remove extra spaces"),
            ("Thís Štríng Hás Áccénted Characters", "thís štríng hás áccénted characters"),
            ("Ünicöde Äscii Êncoding", "ünicöde äscii êncoding"),
            ("Hello@# World!", "hello world"),
            ("Remove !@#$ special %^&* characters", "remove special characters"),
            ("Keep digits 1234567890", "keep digits 1234567890"),

            # Extreme test values
            ("    ", ""),  # All whitespace, expect empty string
            ("!@#$%^&*()_+", ""),  # All special characters, expect empty string
            ("ÁČÇÈÑTÉÐ ßÞÉÇÏÀL ÇHÁRÁÇTÉRS", "áčçèñtéð ßþéçïàl çháráçtérs"),
            ("ÛÑÎÇØÐÊ ÄŠÇÏÏ ÊÑÇØÐÏÑG", "ûñîçøðê äšçïï êñçøðïñg"),
            ("     ÛÑÎÇØÐÊ   ", "ûñîçøðê"),  # Leading/trailing whitespace with accented characters
            ("\n    ÛÑÎÇØÐÊ     ÄŠÇÏÏ     ÊÑÇØÐÏÑG    ", "ûñîçøðê äšçïï êñçøðïñg"),  # Multiple operations with accented characters and whitespace
        ]

        self.subtester(test_values)


# Run the tests defined above in-process. argv=[''] hands unittest an
# argument list that contains only a placeholder program name, and
# exit=False stops unittest.main from calling sys.exit() when the run ends.
unittest.main(argv=[''], exit=False)

......
----------------------------------------------------------------------
Ran 6 tests in 0.012s

OK


<unittest.main.TestProgram at 0x1e8091f58d0>

### Presenter

#### Presenter - Articles, Conjunctions and Prepositions

Conventions on Articles, Conjunctions, Prepositions:

Group 1: Articles, conjunctions, and prepositions are usually not capitalized in titles, unless they are the first word or part of a proper noun.

French

Group 2: Articles, conjunctions, and prepositions are typically not capitalized unless they are the first or last word, or if they have four or more letters.

German
Dutch

Group 3: Articles, conjunctions, and prepositions are generally not capitalized in titles, unless they are the first or last word, or if they are stressed as part of the title's style or emphasis.

Spanish

Group 4: Articles, conjunctions, and prepositions are typically not capitalized in titles, except when they are the first or last word, or if they have special emphasis or are part of proper nouns.

Italian
Portuguese

Group 5: Articles, conjunctions, and prepositions are generally not capitalized in titles unless they are the first or last word, or if they have special emphasis or are part of proper nouns.

Norwegian
Swedish
Danish
Finnish

## Librarian

In [7]:
from contextlib import contextmanager
from collections import ChainMap
import csv


class Librarian:
    """Read CSV "collections" from the library folder and expose their data."""

    # Path prefix that lookup() puts in front of every collection name.
    LIBRARY = 'library/'

    def __init__(self):
        pass

    @contextmanager
    def get_collection(self, collection):
        """Open a collection file for reading and close it on exit.

        Args:
            collection: Path of the file to open.

        Yields:
            The open text file. It is opened with newline='' (as the csv
            module expects) and decoded as 'utf-8-sig', so a leading
            byte-order mark does not end up in the first header.

        Raises:
            OSError: If the file cannot be opened.
        """
        with open(collection, 'r', newline='', encoding='utf-8-sig') as file:
            yield file

    def lookup(self, collection: str, *keys: str):
        """Load a collection and drill into it with the given keys.

        Args:
            collection: Collection name; the file read is
                f'{LIBRARY}{collection}.csv'.
            *keys: Keys applied one after another. A key that is not present
                at the current level is skipped, and so is any key left over
                once the current value is no longer a mapping (previously
                this could raise TypeError by indexing a list with a string).

        Returns:
            A ChainMap over the whole collection when no key matched,
            otherwise the value reached by the matching keys (for a single
            header key, the list of that column's values).
        """
        collection_filepath = f'{self.LIBRARY}{collection}.csv'

        with self.get_collection(collection_filepath) as file:
            data_dict = self.create_dict(csv.reader(file))

        result = ChainMap(data_dict)
        for key in keys:
            # Only mappings can be indexed by key; column lists are final.
            if isinstance(result, (dict, ChainMap)) and key in result:
                result = result[key]

        return result

    def create_dict(self, open_file):
        """Turn CSV rows into a dict of columns.

        Args:
            open_file: Iterator of rows (e.g. a csv.reader); the first row
                is taken as the headers.

        Returns:
            dict mapping each header to the list of values found under it,
            in row order. An input with no rows yields an empty dict. Cells
            beyond the number of headers are ignored.
        """
        headers = next(open_file, None)
        if headers is None:
            # Nothing to read, not even a header row.
            return {}

        data_dict = {header: [] for header in headers}

        for row in open_file:
            for header, value in zip(headers, row):
                data_dict[header].append(value)

        return data_dict

    def create_datamap(self, open_file):
        """Build a ChainMap from CSV rows via extract_data_from_csv.

        NOTE(review): this method had no body in the original source, which
        made the class fail to compile. Delegating to extract_data_from_csv
        is the presumed intent - confirm.
        """
        return self.extract_data_from_csv(open_file)

    @staticmethod
    def extract_data_from_csv(reader):
        """Build a two-layer ChainMap from CSV rows.

        Args:
            reader: Iterable of rows; the first row supplies the parent
                keys, and each later row supplies a child key (first cell)
                and child value (second cell).

        Returns:
            ChainMap whose first layer maps every parent key to the one
            shared child dict, and whose second layer is that child dict.
            An input with no rows yields an empty ChainMap.

        Raises:
            IndexError: If a non-empty data row has fewer than two cells.
        """
        rows = list(reader)
        if not rows:
            return ChainMap({}, {})

        parent_keys, *data_rows = rows

        # Blank lines are yielded by csv.reader as empty rows; skip them.
        child_dict = {row[0]: row[1] for row in data_rows if row}

        # Every parent key points at the same child_dict object.
        parent_dict = dict.fromkeys(parent_keys, child_dict)
        return ChainMap(parent_dict, child_dict)

# Demo: create a Librarian and print what lookup() returns for the
# 'language' key of the 'articles_conjunctions_prepositions' collection.
librarian = Librarian()

print(librarian.lookup('articles_conjunctions_prepositions', 'language'))

['da', 'de', 'en', 'es', 'fi', 'fr', 'it', 'nl', 'no', 'pt', 'sv']


## Data Validator