# DICTIONARIES IN REAL-WORLD PROBLEMS

## 1. Character Counting

In [None]:
def count_character(word: str) -> dict:
    """
    Count the frequency of each character in a given string.

    Counting is case-sensitive, and whitespace, digits and symbols are
    counted like any other character.

    Args:
        word (str): The input string to analyze.

    Returns:
        dict: A dictionary where keys are characters from the input string
              and values are their respective counts. Keys are ordered by
              each character's first appearance in the string.
              Returns an empty dictionary if the input is empty or None.

    Example:
        >>> count_character("hello")
        {'h': 1, 'e': 1, 'l': 2, 'o': 1}
    """
    # Function-scope import so this cell runs on its own, independent of
    # imports made in other cells.
    from collections import Counter

    # Handle the case where the input string is empty or None
    if not word:
        return {}

    # Counter tallies every character in one pass over the string and keeps
    # first-appearance order; converting back to a plain dict preserves the
    # documented return type.
    return dict(Counter(word))
 

In [2]:
# Table of checks for count_character.
# Each entry is (input string, expected character counts, failure message).
test_cases = [
    # Test 1: Normal word
    ("hello", {"h": 1, "e": 1, "l": 2, "o": 1}, "Test 1 Failed"),
    # Test 2: Empty string
    ("", {}, "Test 2 Failed"),
    # Test 3: All characters the same
    ("aaa", {"a": 3}, "Test 3 Failed"),
    # Test 4: Word with mixed characters
    ("abcabc", {"a": 2, "b": 2, "c": 2}, "Test 4 Failed"),
    # Test 5: Word with spaces
    ("a a", {"a": 2, " ": 1}, "Test 5 Failed"),
    # Test 6: Word with uppercase and lowercase (case sensitive check)
    ("AaA", {"A": 2, "a": 1}, "Test 6 Failed"),
    # Test 7: Word with digits and symbols
    ("a1!a", {"a": 2, "1": 1, "!": 1}, "Test 7 Failed"),
]

# Run the cases in order; the first mismatch stops with its own message.
for text, expected, failure_message in test_cases:
    assert count_character(text) == expected, failure_message

print("✅ All tests passed!")

✅ All tests passed!


## 2. Word Counting (from file)

In [4]:
def preprocess_text(file_path: str = 'assets/P1_data.txt') -> list[str]:
    """
    Load a text file and turn its content into a list of normalized words.

    Processing order:
        1. Read the whole file as UTF-8 text.
        2. Lowercase the text.
        3. Delete every period ('.') and comma (','); no other punctuation
           is touched.
        4. Split on whitespace.

    Args:
        file_path (str, optional): Path to the input text file.
                                   Defaults to 'assets/P1_data.txt'.

    Returns:
        list[str]: The words of the file after normalization.

    Example:
        If the file contains: "Hello, world. Hello!"
        The function returns: ['hello', 'world', 'hello!']
        (the '!' stays because only periods and commas are removed)
    """
    with open(file_path, 'r', encoding="utf-8") as source:
        raw_text = source.read()

    lowered = raw_text.lower()

    # A translation table with only a "delete" set strips both characters
    # in a single pass over the text.
    without_marks = lowered.translate(str.maketrans('', '', '.,'))

    return without_marks.split()


def count_word(words: list[str]) -> dict[str, int]:
    """
    Count the frequency of each word in a list.

    Args:
        words (list[str]): A list of words.

    Returns:
        dict[str, int]: A dictionary mapping each unique word
                        to its frequency in the list. Keys are ordered by
                        each word's first appearance in the list.
                        Returns an empty dictionary if input is empty.

    Example:
        >>> count_word(["hello", "world", "hello"])
        {'hello': 2, 'world': 1}
    """
    # Function-scope import so this cell runs on its own, independent of
    # imports made in other cells.
    from collections import Counter

    if not words:
        return {}

    # Counter tallies the list in a single pass (instead of rescanning the
    # whole list once per unique word) and keeps first-appearance order;
    # converting back to a plain dict preserves the documented return type.
    return dict(Counter(words))

In [5]:
# Demo: load the words from the default input file, then print the
# word -> frequency dictionary built from them.
words = preprocess_text() # No argument: reads the default path 'assets/P1_data.txt'
print(count_word(words))

{'with': 4, 'a': 7, 'man': 6, 'different': 1, 'usually': 1, 'everything': 1, 'your': 1, 'people': 1, 'value': 1, 'and': 1, 'have': 1, 'become': 2, 'just': 2, 'others': 1, 'small': 1, 'try': 2, 'are': 1, 'lay': 1, 'can': 3, 'conquers': 1, 'opportunity': 1, 'him': 1, 'majority': 1, 'thinking': 1, 'firm': 1, 'other': 1, 'those': 1, 'one': 4, 'foundation': 1, 'in': 4, 'you': 3, 'but': 1, 'if': 1, 'warrior': 1, 'whole': 1, 'thought': 1, 'his': 2, 'looking': 1, 'who': 3, 'will': 2, 'rather': 1, 'problems': 1, 'up': 1, 'what': 1, 'employed': 1, 'morning': 1, 'want': 2, 'thrown': 1, 'courage': 1, 'he': 1, 'it': 2, 'mistakes': 1, 'success': 3, 'we': 3, 'came': 1, 'to': 3, 'from': 1, 'busy': 1, 'them': 1, 'again': 1, 'cannot': 1, 'ready': 1, 'is': 3, 'day': 1, 'comes': 2, 'get': 2, 'successful': 2, 'way': 1, 'mightiest': 1, 'they': 1, 'be': 1, 'at': 1, 'the': 5, 'bricks': 1, 'too': 1, 'when': 2, 'profit': 1, 'life': 2, 'secret': 1, 'solve': 1, 'himself': 1, 'enough': 1, 'not': 1, 'for': 3, 'of':

## 3. N-grams For Author Profiling Problems

In [None]:
from nltk import PorterStemmer, WordNetLemmatizer, word_tokenize, download
import string
from collections import Counter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


class NgramProfile:
    """
    Build n-gram frequency profiles from input text.

    Text is lowercased and tokenized, punctuation-only tokens are dropped,
    and each remaining token is lemmatized and then stemmed before n-grams
    are extracted.
    """

    # ASCII punctuation characters, used to recognise punctuation-only tokens.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        """Initialize NLP tools such as stemmer and lemmatizer."""
        self._init_nlp_tools()

    def get_profile(self, text: str, n: int = 2) -> Counter:
        """
        Generate an n-gram profile of the given text.

        Args:
            text (str): Input text to process.
            n (int): Size of the n-grams (default is 2 for bigrams).

        Returns:
            Counter: A frequency dictionary mapping each n-gram tuple to the
                     number of times it occurs in the processed text.

        Raises:
            ValueError: If n is not a positive integer.
        """
        tokens = self._process_text(text)
        return Counter(self._ngrams(tokens, n))

    def _init_nlp_tools(self):
        """
        Download required NLTK resources and initialize
        stemmer and lemmatizer for preprocessing.
        """
        # NOTE(review): recent NLTK releases appear to load tokenizer data
        # from 'punkt_tab' rather than 'punkt' -- confirm which resource the
        # installed NLTK version needs.
        download('punkt')
        download('wordnet')

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    @classmethod
    def _is_punctuation(cls, token: str) -> bool:
        """
        Tell whether a token consists solely of ASCII punctuation.

        Every character is checked individually. A substring test against
        ``string.punctuation`` would miss multi-character tokens such as
        "''", "``", "..." or "--", because they do not occur verbatim in
        that string.

        Args:
            token (str): A single token.

        Returns:
            bool: True if every character of the token is punctuation
                  (an empty token also counts as punctuation).
        """
        return all(char in cls._PUNCTUATION for char in token)

    def _process_text(self, text: str) -> list[str]:
        """
        Tokenize, lemmatize, and stem the input text.

        Args:
            text (str): Input text string.

        Returns:
            list[str]: A list of processed tokens with punctuation-only
                       tokens removed.
        """
        tokens = word_tokenize(text.lower())

        # Lemmatize first, then stem, skipping punctuation-only tokens.
        return [
            self.stemmer.stem(self.lemmatizer.lemmatize(token))
            for token in tokens
            if not self._is_punctuation(token)
        ]

    def _ngrams(self, tokens: list[str], n: int = 2) -> list[tuple]:
        """
        Generate n-grams from a list of tokens.

        Args:
            tokens (list[str]): List of preprocessed tokens.
            n (int): Size of the n-grams (default is 2).

        Returns:
            list[tuple]: A list of n-gram tuples, in order of appearance.
                         Empty when there are fewer than n tokens.

        Raises:
            ValueError: If n is not a positive integer.
        """
        if n <= 0:
            raise ValueError("n must be a positive integer")

        # Zip n copies of the token list, each shifted one position further.
        # zip stops at the shortest copy, so exactly len(tokens) - n + 1
        # windows are produced, and none when len(tokens) < n.
        return list(zip(*(tokens[offset:] for offset in range(n))))


class ProfileComparator:
    """
    A class to compare two text profiles using cosine similarity
    of their n-gram distributions.
    """

    def __init__(self):
        """Initialize the comparator with an NgramProfile generator."""
        self.profile_generator = NgramProfile()

    def compare_authors(self, content1: str, content2: str, n: int = 2) -> float:
        """
        Compare two texts by their n-gram profiles.

        Args:
            content1 (str): First text.
            content2 (str): Second text.
            n (int): Size of the n-grams to compare (default is 2).

        Returns:
            float: Cosine similarity score between the two texts.

        Raises:
            ValueError: If one or both texts are missing.
        """
        return self._get_cosine_similarity(content1, content2, n)

    def _get_cosine_similarity(self, text1: str, text2: str, n: int = 2) -> float:
        """
        Compute cosine similarity between two texts.

        Args:
            text1 (str): First text string.
            text2 (str): Second text string.
            n (int): Size of the n-grams to compare (default is 2).

        Returns:
            float: Cosine similarity score.

        Raises:
            ValueError: If one or both texts are missing.
        """
        if not text1 or not text2:
            raise ValueError("Both texts must be provided for comparison.")

        # Get n-gram profiles
        profile1 = self.profile_generator.get_profile(text1, n)
        profile2 = self.profile_generator.get_profile(text2, n)

        return self._profile_similarity(profile1, profile2)

    @staticmethod
    def _profile_similarity(profile1: dict, profile2: dict) -> float:
        """
        Compute cosine similarity between two n-gram frequency profiles.

        Args:
            profile1 (dict): Mapping of n-gram -> count for the first text.
            profile2 (dict): Mapping of n-gram -> count for the second text.

        Returns:
            float: Cosine similarity of the two count vectors, or 0.0 when
                   either profile contains no n-grams.
        """
        # A profile without n-grams (e.g. a text shorter than n tokens) has
        # nothing in common with the other one. Returning early also avoids
        # handing zero-width vectors to cosine_similarity when both are empty.
        if not profile1 or not profile2:
            return 0.0

        # Union of all n-grams found in both texts; materialized once so both
        # vectors share the same component order.
        all_ngrams = list(set(profile1) | set(profile2))

        # Build frequency vectors as single-row matrices
        vec1 = np.array([profile1.get(ngram, 0) for ngram in all_ngrams]).reshape(1, -1)
        vec2 = np.array([profile2.get(ngram, 0) for ngram in all_ngrams]).reshape(1, -1)

        # cosine_similarity returns a 1x1 matrix here; convert its only entry
        # to a builtin float to match the annotated return type.
        return float(cosine_similarity(vec1, vec2)[0][0])
