In [9]:
# Code Space for MATH280 Proj. 1

In [10]:
# Import needed python module for file reading
import os

In [11]:
def extract_tones_from_file(file_path):
    """
    Read a text file and extract tones from numbered pinyin syllables.
    Each tone is represented as an integer (1-5) based on the numbered pinyin syllables.
    """
    tones = []
    if not os.path.exists(file_path):
        print(f"File {file_path} not found. Please check the file path and try again.")
        return []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line_tones = [int(word[-1]) for word in line.split() if word[-1].isdigit()]
            tones.append(line_tones)
    return tones

In [12]:
import sage.all as sa
def construct_markov_matrix(tones_list, num_states=5):
    """
    Create a SageMath Markov matrix for a given list of tone sequences.
    The matrix is normalized to represent transition probabilities.
    """
    transition_counts = sa.Matrix(sa.SR, num_states, num_states, 0)

    # Count transitions based on tone sequences
    for tones in tones_list:
        for i in range(len(tones) - 1):
            current_tone = tones[i] - 1
            next_tone = tones[i + 1] - 1
            transition_counts[current_tone, next_tone] += 1

    # Normalize the matrix rows to get transition probabilities
    for i in range(num_states):
        row_sum = sum(transition_counts[i, j] for j in range(num_states))
        if row_sum > 0:
            for j in range(num_states):
                transition_counts[i, j] /= row_sum

    return transition_counts

'''
def construct_markov_matrix(tones_list, num_states=5):
    transition_counts = sa.Matrix(sa.SR, num_states, num_states, 1)  # Laplace smoothing (start with 1)

    # Count transitions
    for tones in tones_list:
        for i in range(len(tones) - 1):
            current_tone = tones[i] - 1
            next_tone = tones[i + 1] - 1
            transition_counts[current_tone, next_tone] += 1

    # Normalize rows to get probabilities
    for i in range(num_states):
        row_sum = sum(transition_counts[i, j] for j in range(num_states))
        if row_sum > 0:
            for j in range(num_states):
                transition_counts[i, j] /= row_sum

    return transition_counts
'''

In [13]:
from sage.all import log
def compute_log_likelihood(matrix, test_tones):
    """
    Calculate the log-likelihood of a sequence of tones given a SageMath Markov matrix.
    """
    log_likelihood = 0
    for i in range(len(test_tones) - 1):
        current_state = test_tones[i] - 1
        next_state = test_tones[i + 1] - 1
        probability = matrix[current_state, next_state]
        if probability > 0:
            log_likelihood += log(probability)
        else:
            log_likelihood += float('-inf')  # Log(0) is -infinity
    return log_likelihood


In [14]:
def guess_author(test_tones_list, matrix_zhu, matrix_du):
    total_likelihood_zhu = sum(compute_log_likelihood(matrix_zhu, tones) for tones in test_tones_list)
    total_likelihood_du = sum(compute_log_likelihood(matrix_du, tones) for tones in test_tones_list)
    
    print(f"Total log-likelihood for Zhu Shuzhen: {total_likelihood_zhu}")
    print(f"Total log-likelihood for Du Fu: {total_likelihood_du}")

    if total_likelihood_zhu > total_likelihood_du:
        return "Zhu Shuzhen"
    else:
        return "Du Fu"

In [15]:
def display_markov_matrix(matrix):
    """
    Print the Markov matrix in a formatted way.
    """
    print("Markov Matrix (5-tone system):")
    print(matrix)

In [16]:
zsz_file = "zsz.txt"  # Zhu Shuzhen's text
df_file = "df.txt"    # Du Fu's text

# Extract tones from both files
zsz_tones = extract_tones_from_file(zsz_file)
df_tones = extract_tones_from_file(df_file)

# Ensure both files were read correctly
if not zsz_tones or not df_tones:
    print("Failed to read tones from one or both input files. Exiting.")
    exit()

In [17]:
matrix_zhu = construct_markov_matrix(zsz_tones)
matrix_du = construct_markov_matrix(df_tones)

print("Zhu Shuzhen's Markov Matrix:")
print(matrix_zhu)
print("\nDu Fu's Markov Matrix:")
print(matrix_du)

test_tones_zsz = zsz_tones[:3]  # Use the first 3 sequences from Zhu Shuzhen's text
test_tones_df = df_tones[:3]    # Use the first 3 sequences from Du Fu's text

# Predict authorship based on multiple test sequences
print("\nPredicted author for Zhu Shuzhen's test tones: ", guess_author(test_tones_zsz, matrix_zhu, matrix_du))
print("Predicted author for Du Fu's test tones: ", guess_author(test_tones_df, matrix_zhu, matrix_du))

Zhu Shuzhen's Markov Matrix:
[58/191 47/191 29/191 57/191      0]
[53/157 40/157 22/157 42/157      0]
[35/104 27/104   2/13    1/4      0]
[ 11/40   7/25 27/200 31/100      0]
[     0      0      0      0      1]

Du Fu's Markov Matrix:
[52/155 87/310 23/155 73/310      0]
[ 11/35 37/140   7/40 69/280      0]
[54/161 41/161 17/161   7/23      0]
[ 23/79 74/237  14/79 52/237      0]
[     0      0      0      0      1]
Total log-likelihood for Zhu Shuzhen: 2*log(53/157) + 2*log(35/104) + 3*log(58/191) + 3*log(57/191) + log(7/25) + log(11/40) + log(27/104) + log(40/157) + log(47/191) + 2*log(29/191) + log(27/200)
Total log-likelihood for Du Fu: 3*log(52/155) + 2*log(54/161) + 2*log(11/35) + log(74/237) + log(23/79) + log(87/310) + log(37/140) + log(41/161) + 3*log(73/310) + log(14/79) + 2*log(23/155)

Predicted author for Zhu Shuzhen's test tones:  Zhu Shuzhen
Total log-likelihood for Zhu Shuzhen: 2*log(53/157) + 2*log(35/104) + log(31/100) + 3*log(58/191) + 2*log(57/191) + 2*log(11/40)