## Notes on case.py

The purpose of this notebook is to go over how the different functions and classes in `case.py` work together to tag tokens and characters by their case, starting from the function `get_tc`.  Before beginning, please run the cell below, which contains the full script.

In [5]:
"""Helpers for recasing.

This module contains the TokenCase enum (representing the outcome variable)
and associated helper functions for case-folding (i.e., applying a casing
to an arbitrary Unicode string)."""


import enum
import unicodedata

from typing import Dict, List, Optional, Tuple


# Casing features at the Unicode character level.


@enum.unique
class CharCase(enum.IntEnum):
    """Character-level casing label: one of three classes.

    Converting a member with `str()` yields its bare name (e.g. "UPPER").
    """

    # "Don't care": any Unicode category other than 'Ll' and 'Lu'.
    DC = 0
    # Unicode category 'Ll'.
    LOWER = 1
    # Unicode category 'Lu'.
    UPPER = 2

    def __str__(self) -> str:
        return self.name


class UnknownCharCaseError(ValueError):
    """Raised by `apply_cc` when its `cc` argument is not a known CharCase."""


def get_cc(nunichar: str) -> CharCase:
    """Classifies a single Unicode character by its case.

    The decision is based solely on the character's Unicode general category.

    Args:
        nunichar: The Unicode character to classify.

    Returns:
        `CharCase.LOWER` for category 'Ll', `CharCase.UPPER` for category 'Lu',
        and `CharCase.DC` for every other category.
    """
    cased_categories = {"Ll": CharCase.LOWER, "Lu": CharCase.UPPER}
    return cased_categories.get(unicodedata.category(nunichar), CharCase.DC)


def apply_cc(nunichar: str, cc: CharCase) -> str:
    """Recases a Unicode character according to a CharCase.

    For `LOWER` and `UPPER` the result does not depend on the casing of the
    input; for `DC` the input is handed back untouched.

    Args:
        nunichar: The Unicode character to recase.
        cc: The CharCase to apply.

    Returns:
        The recased character.

    Raises:
        UnknownCharCaseError: If `cc` is not one of the CharCase values.
    """
    if cc == CharCase.DC:
        return nunichar
    if cc == CharCase.UPPER:
        return nunichar.upper()
    if cc == CharCase.LOWER:
        return nunichar.lower()
    raise UnknownCharCaseError(cc)


# Casing features at the word ("token") level.


@enum.unique
class TokenCase(enum.IntEnum):
    """Token-level casing label: one of five classes.

    Converting a member with `str()` yields its bare name (e.g. "TITLE").
    """

    # [DC]+
    DC = 0
    # [Ll] ([Ll] | [DC])*
    LOWER = 1
    # [Lu] ([Lu] | [DC])*, except where title takes precedence.
    UPPER = 2
    # [Lu] ([Ll] | [DC])*
    TITLE = 3
    # Everything else.
    MIXED = 4

    def __str__(self) -> str:
        return self.name


class UnknownTokenCaseError(ValueError):
    """Raised by `apply_tc` when its `tc` argument is not a known TokenCase."""


# Type definitions for mixed-base patterns.


# Per-character casing of a token: one CharCase for each character, as built
# by `get_tc` and consumed by `apply_tc` for `MIXED` tokens.
ObligatoryPattern = List[CharCase]
# A pattern, or None where no pattern applies (`get_tc` returns None for every
# TokenCase other than `MIXED`).
Pattern = Optional[ObligatoryPattern]
# NOTE(review): not used in the code visible here; presumably maps a token
# string to its `MIXED` pattern -- confirm against callers.
MixedPatternTable = Dict[str, ObligatoryPattern]


def get_tc(nunistr: str) -> Tuple[TokenCase, Pattern]:
    """Classifies a Unicode string by its token-level case.

    Args:
        nunistr: The Unicode string to classify.

    Returns:
        A `(TokenCase, pattern)` tuple. `pattern` is the list of per-character
        CharCase values when the TokenCase is `MIXED`, and None (meaning
        "n/a") for every other TokenCase.
    """
    if nunistr.islower():
        return (TokenCase.LOWER, None)
    # Title is deliberately tested before upper: a string such as "A"
    # satisfies both, and is arguably titlecase more often than uppercase.
    if nunistr.istitle():
        return (TokenCase.TITLE, None)
    if nunistr.isupper():
        return (TokenCase.UPPER, None)
    char_cases = [get_cc(ch) for ch in nunistr]
    # Any cased character at this point makes the token MIXED; otherwise the
    # token consists of "don't care" characters only.
    if any(cc != CharCase.DC for cc in char_cases):
        return (TokenCase.MIXED, char_cases)
    return (TokenCase.DC, None)


def apply_tc(nunistr: str, tc: TokenCase, pattern: Pattern = None) -> str:
    """Applies TokenCase to a Unicode string.

    For `LOWER`, `UPPER` and `TITLE` the result does not depend on the casing
    of the input string. For `DC` the string is returned unchanged, and for
    `MIXED` any character whose pattern entry is `CharCase.DC` is likewise
    left as it is.

    Args:
        nunistr: A Unicode string to be cased.
        tc: A TokenCase indicating the casing to be applied.
        pattern: A sequence of CharCase values, one per character of
            `nunistr`, giving the specifics of the `MIXED` TokenCase. Only
            consulted when `tc` is `MIXED`; if None, `MIXED` falls back to
            lowercasing.

    Returns:
        An appropriately-cased Unicode string.

    Raises:
        UnknownTokenCaseError: If `tc` is not a known TokenCase.
        ValueError: If `tc` is `MIXED` and `pattern` is empty or its length
            differs from the length of `nunistr`.
    """
    if tc == TokenCase.DC:
        return nunistr
    elif tc == TokenCase.LOWER:
        return nunistr.lower()
    elif tc == TokenCase.UPPER:
        return nunistr.upper()
    elif tc == TokenCase.TITLE:
        return nunistr.title()
    elif tc == TokenCase.MIXED:
        # Defaults to lowercase if no pattern is provided.
        if pattern is None:
            return nunistr.lower()
        # Validate explicitly instead of with `assert`: assertions are removed
        # under `python -O`, after which `zip` would silently truncate the
        # result to the shorter of the two inputs.
        if not pattern:
            raise ValueError("MIXED pattern must not be empty")
        if len(nunistr) != len(pattern):
            raise ValueError(
                "MIXED pattern length {} does not match string length {}".format(
                    len(pattern), len(nunistr)
                )
            )
        return "".join(apply_cc(ch, cc) for (ch, cc) in zip(nunistr, pattern))
    raise UnknownTokenCaseError(tc)


### Lines 110-134: `def get_tc(nunistr: str) -> Tuple[TokenCase, Pattern]:`

1. What is the argument of `get_tc`? What type is it?  What does it return? What type is it? 


2.  Take the following strings and pass them as arguments through this function:  'Mary','milk','LOL', and 'LaTeX'.


3.  What are the types of the first and second objects in the returned tuples?  


4.  Which of the strings above returns a list as the second object in the tuple?  What do the elements in that list tell us about the string?  


5.  There is a way to get only the tag, or 'TokenCase', from this function's output as a string, instead of the whole tuple. See if you can figure out how to print only the tag of `Mary` by reading the Python documentation for `enum`.  Your expected output should be 'TITLE'.  


In [6]:
def get_tc(nunistr: str) -> Tuple[TokenCase, Pattern]:
    """Classifies a Unicode string by its token-level case.

    Args:
        nunistr: The Unicode string to classify.

    Returns:
        A `(TokenCase, pattern)` tuple. `pattern` is the list of per-character
        CharCase values when the TokenCase is `MIXED`, and None (meaning
        "n/a") for every other TokenCase.
    """
    if nunistr.islower():
        return (TokenCase.LOWER, None)
    # Title is deliberately tested before upper: a string such as "A"
    # satisfies both, and is arguably titlecase more often than uppercase.
    if nunistr.istitle():
        return (TokenCase.TITLE, None)
    if nunistr.isupper():
        return (TokenCase.UPPER, None)
    char_cases = [get_cc(ch) for ch in nunistr]
    # Any cased character at this point makes the token MIXED; otherwise the
    # token consists of "don't care" characters only.
    if any(cc != CharCase.DC for cc in char_cases):
        return (TokenCase.MIXED, char_cases)
    return (TokenCase.DC, None)

### Lines 83-94:   `class TokenCase(enum.IntEnum):`

6.  If you haven't figured out the answer to question 5 yet, remind yourself of the type of `get_tc`'s first tuple object by running, say, `type(get_tc("Mary")[0])`.  How is this object related to the snippet of code below? 


7.  Returning to `get_tc` for a moment... In your own words, describe what this if-statement is requesting: 

             if nunistr.islower():
                return (TokenCase.LOWER, None)
                
8.  Revisit the `enum` documentation.  What is the purpose of the line `@enum.unique`?

In [56]:
@enum.unique
class TokenCase(enum.IntEnum):
    """Token-level casing label: one of five classes.

    Converting a member with `str()` yields its bare name (e.g. "TITLE").
    """

    # [DC]+
    DC = 0
    # [Ll] ([Ll] | [DC])*
    LOWER = 1
    # [Lu] ([Lu] | [DC])*, except where title takes precedence.
    UPPER = 2
    # [Lu] ([Ll] | [DC])*
    TITLE = 3
    # Everything else.
    MIXED = 4

    def __str__(self) -> str:
        return self.name

### Lines 34-51:   `def get_cc(nunichar: str) -> CharCase:`

8.  What is the argument of `get_cc`? What type is it? What does it return? What type is it? 


9.  Run the following arguments through `get_cc`:  'L', 'a', ','.


10.  Print only the string version of the CharCase tag of `get_cc('L')`.  Your expected output should be 'UPPER'.


11.  Which kinds of strings return the object <CharCase.DC>? (See line 21 in `case.py` to see what 'DC' stands for.)

  
10.  Read the python documentation for 'unicodedata', one of the imported libraries for this script.  And more generally, read about 'unicode' characters here:  https://docs.python.org/3/howto/unicode.html.  Why does the argument have to be a "Unicode character"? 




In [24]:
def get_cc(nunichar: str) -> CharCase:
    """Classifies a single Unicode character by its case.

    The decision is based solely on the character's Unicode general category.

    Args:
        nunichar: The Unicode character to classify.

    Returns:
        `CharCase.LOWER` for category 'Ll', `CharCase.UPPER` for category 'Lu',
        and `CharCase.DC` for every other category.
    """
    cased_categories = {"Ll": CharCase.LOWER, "Lu": CharCase.UPPER}
    return cased_categories.get(unicodedata.category(nunichar), CharCase.DC)

### Lines 54-77:   `def apply_cc(nunichar: str, cc: CharCase) -> str:`

11.  What are the arguments of `apply_cc`? What types are they? What does it return? What type is it? 


12.  Apply CharCase.UPPER to the following strings:  'L', 'a'.  Your expected output should be:  'L', 'A'.  (HINT:  Your second argument is a `CharCase`, the kind of object returned by `get_cc`.)  


13.  Repeat 12 with CharCase.LOWER.  Your expected output should be 'l', 'a'.  


14.  Write a snippet of code that iterates through the characters in 'latex' and applies the CharCases needed to get the output, 'LaTeX'. (TIP:  You can use the zip() function to write a for-loop that iterates through two lists at the same time.)

In [23]:
def apply_cc(nunichar: str, cc: CharCase) -> str:
    """Recases a Unicode character according to a CharCase.

    For `LOWER` and `UPPER` the result does not depend on the casing of the
    input; for `DC` the input is handed back untouched.

    Args:
        nunichar: The Unicode character to recase.
        cc: The CharCase to apply.

    Returns:
        The recased character.

    Raises:
        UnknownCharCaseError: If `cc` is not one of the CharCase values.
    """
    if cc == CharCase.DC:
        return nunichar
    if cc == CharCase.UPPER:
        return nunichar.upper()
    if cc == CharCase.LOWER:
        return nunichar.lower()
    raise UnknownCharCaseError(cc)

In [59]:
# A basic solution for 14.

# Walk the uncased and the target spelling in lockstep: read the CharCase off
# each target character and apply it to the matching input character.
cased_chars = []
for char, target_char in zip('latex', 'LaTeX'):
    cased_chars.append(apply_cc(char, get_cc(target_char)))

print(''.join(cased_chars))

# A more concise solution for 14, which is adapted from lines 131 and 169 of the script.

# The target spelling is written out as a literal: `nunistr` exists only as a
# parameter inside get_tc/apply_tc and is not defined at cell level.
pattern = [get_cc(nunichr) for nunichr in 'LaTeX']
print("".join(apply_cc(ch, cc) for (ch, cc) in zip('latex', pattern)))

LaTeX
LaTeX


### Lines 137-170:  `apply_tc(nunistr: str, tc: TokenCase, pattern: Pattern = None) -> str:`


15.  What are the arguments of `apply_tc`? What types are they? What does it return? What type is it? 


16.  Apply TokenCase.LOWER to the following strings: 'Mr.', 'apple', 'LaTeX'.  Your expected output should be 'mr.', 'apple', 'latex'.   (HINT:  The second argument must be a `TokenCase` on its own. `get_tc` returns one as the first element of its tuple, so pass that element rather than the whole tuple.)


17.  Do the same for TokenCase.UPPER.  Your expected output should be 'MR.', 'APPLE', 'LATEX'. 


18.  Do the same for TokenCase.TITLE.  Your expected output should be 'Mr.', 'Apple', 'Latex'. 


19.  Apply TokenCase.MIXED to 'latex' so you get the output 'LaTeX'. 

In [None]:
def apply_tc(nunistr: str, tc: TokenCase, pattern: Pattern = None) -> str:
    """Applies TokenCase to a Unicode string.

    For `LOWER`, `UPPER` and `TITLE` the result does not depend on the casing
    of the input string. For `DC` the string is returned unchanged, and for
    `MIXED` any character whose pattern entry is `CharCase.DC` is likewise
    left as it is.

    Args:
        nunistr: A Unicode string to be cased.
        tc: A TokenCase indicating the casing to be applied.
        pattern: A sequence of CharCase values, one per character of
            `nunistr`, giving the specifics of the `MIXED` TokenCase. Only
            consulted when `tc` is `MIXED`; if None, `MIXED` falls back to
            lowercasing.

    Returns:
        An appropriately-cased Unicode string.

    Raises:
        UnknownTokenCaseError: If `tc` is not a known TokenCase.
        ValueError: If `tc` is `MIXED` and `pattern` is empty or its length
            differs from the length of `nunistr`.
    """
    if tc == TokenCase.DC:
        return nunistr
    elif tc == TokenCase.LOWER:
        return nunistr.lower()
    elif tc == TokenCase.UPPER:
        return nunistr.upper()
    elif tc == TokenCase.TITLE:
        return nunistr.title()
    elif tc == TokenCase.MIXED:
        # Defaults to lowercase if no pattern is provided.
        if pattern is None:
            return nunistr.lower()
        # Validate explicitly instead of with `assert`: assertions are removed
        # under `python -O`, after which `zip` would silently truncate the
        # result to the shorter of the two inputs.
        if not pattern:
            raise ValueError("MIXED pattern must not be empty")
        if len(nunistr) != len(pattern):
            raise ValueError(
                "MIXED pattern length {} does not match string length {}".format(
                    len(pattern), len(nunistr)
                )
            )
        return "".join(apply_cc(ch, cc) for (ch, cc) in zip(nunistr, pattern))
    raise UnknownTokenCaseError(tc)

In [77]:
# MIXED needs an explicit pattern: one CharCase per character of the input.
apply_tc(
    'latex',
    TokenCase.MIXED,
    pattern=[
        CharCase.UPPER,
        CharCase.LOWER,
        CharCase.UPPER,
        CharCase.LOWER,
        CharCase.UPPER,
    ],
)

'LaTeX'

In [153]:
# get_tc returns a (TokenCase, pattern) tuple. Handing the whole tuple over as
# `tc` matches none of apply_tc's branches, hence the UnknownTokenCaseError
# shown below. Unpacking it, as in apply_tc('latex', *get_tc('LaTeX')),
# supplies both `tc` and `pattern`.
apply_tc('latex', tc=get_tc('LaTeX'))

UnknownTokenCaseError: (<TokenCase.MIXED: 4>, [<CharCase.UPPER: 2>, <CharCase.LOWER: 1>, <CharCase.UPPER: 2>, <CharCase.LOWER: 1>, <CharCase.UPPER: 2>])

In [149]:
# Still fails: the second argument is again a single tuple (now holding the
# raw integer values), so no TokenCase branch in apply_tc matches it.
mixed_tc, mixed_pattern = get_tc('LaTeX')
apply_tc('latex', (mixed_tc.value, [cc.value for cc in mixed_pattern]))

UnknownTokenCaseError: (4, [2, 1, 2, 1, 2])

In [150]:
# Still fails: the second argument is again a single tuple (now holding the
# member names as strings), so no TokenCase branch in apply_tc matches it.
mixed_tc, mixed_pattern = get_tc("LaTeX")
apply_tc('latex', (mixed_tc.name, [cc.name for cc in mixed_pattern]))

UnknownTokenCaseError: ('MIXED', ['UPPER', 'LOWER', 'UPPER', 'LOWER', 'UPPER'])