In [1]:
# Code Space for MATH280 Proj. 1

In [2]:
import os

In [3]:
def extract_tones_from_file(file_path):
    """
    Read a text file and extract tones from numbered pinyin syllables.

    Each line is split on whitespace. A token contributes a tone only when
    its final character is one of the ASCII digits 1-5; every other token
    (untoned words, trailing punctuation, digits outside 1-5, non-ASCII
    digit characters such as superscripts) is skipped.

    Parameters:
        file_path: path of the UTF-8 encoded text file to read.

    Returns:
        A list with one entry per line of the file. Each entry is the list
        of integer tones (1-5) found on that line, in order of appearance;
        a line without any toned syllable yields an empty list. If the file
        does not exist, a message is printed and an empty list is returned.
    """
    valid_tone_digits = "12345"
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # Membership in an explicit digit set is used instead of
            # str.isdigit(): isdigit() also accepts characters like '²'
            # that int() cannot convert, and digits outside 1-5 are not tones.
            return [
                [int(word[-1]) for word in line.split() if word[-1] in valid_tone_digits]
                for line in file
            ]
    except FileNotFoundError:
        print(f"File {file_path} not found. Please check the file path and try again.")
        return []

In [4]:
import sage.all as sa
def construct_markov_matrix(tones_list, num_states=5):
    """
    Create a SageMath Markov matrix for a given list of tone sequences.

    Transitions are counted between adjacent tones inside each sequence
    (never across two sequences), then every row is divided by its total
    so that it holds transition probabilities.

    Parameters:
        tones_list: iterable of tone sequences; each sequence is a list of
            1-based tone numbers.
        num_states: number of distinct tones; valid tones are 1..num_states.

    Returns:
        A num_states x num_states matrix over sa.SR. Entry (i, j) is the
        observed fraction of transitions leaving tone i+1 that go to tone
        j+1. A row whose tone has no observed outgoing transition is left
        all zero.

    Raises:
        IndexError: if a tone taking part in a transition lies outside
            1..num_states.
    """
    # Initialize a num_states x num_states matrix with all entries set to zero
    transition_counts = sa.Matrix(sa.SR, num_states, num_states, 0)

    # Count transitions based on tone sequences
    for tones in tones_list:
        for current_tone, next_tone in zip(tones, tones[1:]):
            # Without this check a tone of 0 would become index -1 and be
            # silently counted as the last state instead of being reported.
            if not (1 <= current_tone <= num_states and 1 <= next_tone <= num_states):
                raise IndexError(
                    f"Tone transition {current_tone} -> {next_tone} is outside "
                    f"the valid range 1..{num_states}."
                )
            # Tones are 1-based, matrix indices are 0-based
            transition_counts[current_tone - 1, next_tone - 1] += 1

    # Normalize the matrix rows to get transition probabilities
    for i in range(num_states):
        row_sum = sum(transition_counts[i, j] for j in range(num_states))
        # Rows without any observation are skipped to avoid dividing by zero
        if row_sum > 0:
            for j in range(num_states):
                transition_counts[i, j] /= row_sum

    return transition_counts

In [5]:
from sage.all import log
def compute_log_likelihood(matrix, test_tones):
    """
    Score a tone sequence against a Markov transition matrix.

    The score is the sum, over every pair of consecutive tones, of the log
    of the corresponding transition probability. A transition whose
    probability is not positive contributes negative infinity. Sequences
    with fewer than two tones have no transitions and score 0.
    """
    total = 0
    for src_tone, dst_tone in zip(test_tones, test_tones[1:]):
        # Tones are 1-based while the matrix is indexed from 0
        step_probability = matrix[src_tone - 1, dst_tone - 1]
        # log(0) is -infinity, so an unseen transition sinks the whole score
        total += log(step_probability) if step_probability > 0 else float('-inf')
    return total

In [6]:
def guess_author(test_tones, matrix_zhu, matrix_du):
    """
    Attribute a tone sequence to whichever author's Markov matrix explains it better.

    The sequence is scored with compute_log_likelihood under each matrix.
    "Zhu Shuzhen" is returned only when her score is strictly greater;
    in every other case, ties included, the answer is "Du Fu".
    """
    score_zhu = compute_log_likelihood(matrix_zhu, test_tones)
    score_du = compute_log_likelihood(matrix_du, test_tones)
    return "Zhu Shuzhen" if score_zhu > score_du else "Du Fu"

In [7]:
def display_markov_matrix(matrix):
    """
    Write a fixed heading line followed by the matrix's printed form to standard output.
    """
    heading = "Markov Matrix (5-tone system):"
    print(heading, matrix, sep="\n")

In [8]:
# Set the file path for the input text
zsz_file = "zsz.txt"  # Ensure that `zsz.txt` is in the same directory as this script.

# Step 1: Extract tones from the `zsz.txt` file (one list of tones per line)
zsz_tones = extract_tones_from_file(zsz_file)
# `any` also catches a file that exists but has no toned syllable on any line,
# which would otherwise produce an all-zero matrix.
if not any(zsz_tones):
    print(f"Failed to read tones from {zsz_file}. Exiting the script.")
    # Raise SystemExit rather than calling exit(): inside a notebook exit()
    # shuts the kernel down and discards its state, whereas the exception only
    # stops this cell; as a plain script it ends with a non-zero status.
    raise SystemExit(1)

# Step 2: Construct the Markov matrix for Zhu Shuzhen's text
matrix_zhu = construct_markov_matrix(zsz_tones)

# Step 3: Display the matrix
display_markov_matrix(matrix_zhu)

Markov Matrix (5-tone system):
[ 17/57  14/57   3/19  17/57      0]
[  7/23  25/92   7/46   6/23   1/92]
[ 17/62   8/31   5/31  19/62      0]
[  4/13   3/13 17/117 37/117      0]
[     1      0      0      0      0]
