In [10]:
### Code Space for MATH280 Proj. 1 ###

In [11]:
# Import needed python module for file reading
import os

In [12]:
def extract_tones_from_file(file_path):
    """
    Read a text file and extract tones from numbered pinyin syllables.

    Every whitespace-separated token whose last character is a decimal digit
    contributes that digit as an integer tone (the project uses tones 1-5).
    Tokens that do not end in a decimal digit are skipped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        integer tones found on that line (empty for lines without any tone).
        If the file does not exist, a message is printed and [] is returned.
    """
    if not os.path.exists(file_path):
        print(f"File {file_path} not found. Please check the file path and try again.")
        return []

    tones = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # str.isdecimal() is used instead of str.isdigit(): isdigit() also
            # accepts characters such as superscript digits ("²") that int()
            # cannot parse, which would abort the whole read with ValueError.
            line_tones = [int(word[-1]) for word in line.split() if word[-1].isdecimal()]
            tones.append(line_tones)
    return tones

In [13]:
import sage.all as sa
def construct_markov_matrix(tones_list, num_states=5):
    """
    Create a SageMath Markov matrix for a given list of tone sequences.

    Entry (i, j) of the result is the fraction of observed transitions out of
    tone i+1 that go to tone j+1. Transitions are counted between consecutive
    tones inside each sequence only, never across sequences.

    NOTE: The matrix is normalized so that each row sums to 1, which differs
    slightly from the definition used in class. Rows for tones that never
    start a transition are left as all zeros.

    Args:
        tones_list: Iterable of tone sequences; each tone is an integer in
            the range 1..num_states.
        num_states: Number of distinct tones (size of the square matrix).

    Returns:
        A num_states x num_states matrix over the symbolic ring (sa.SR).

    Raises:
        ValueError: If a tone taking part in a transition lies outside
            1..num_states.
    """
    # Initialize with zeros
    transition_counts = sa.Matrix(sa.SR, num_states, num_states, 0)

    for tones in tones_list:
        for current_tone, next_tone in zip(tones, tones[1:]):
            # Reject out-of-range tones explicitly: without this check a tone
            # of 0 becomes index -1 and is silently counted in the last
            # state, and a tone above num_states fails with an opaque
            # IndexError.
            if not (1 <= current_tone <= num_states and 1 <= next_tone <= num_states):
                raise ValueError(
                    f"Tone transition {current_tone} -> {next_tone} is outside "
                    f"the valid range 1..{num_states}."
                )
            # Tones are 1-based, matrix indices are 0-based.
            transition_counts[current_tone - 1, next_tone - 1] += 1

    for i in range(num_states):
        row_sum = sum(transition_counts[i, j] for j in range(num_states))
        if row_sum > 0:  # leave unseen tones as an all-zero row
            for j in range(num_states):
                transition_counts[i, j] /= row_sum  # Normalize the row

    return transition_counts

# Refined version adding Laplace smoothing for better prediction performance.
# NOTE: the draft below is disabled by wrapping it in a bare string literal, so
# it is never executed; because the literal is the last expression of the cell,
# the notebook echoes it back as the cell output.
# NOTE(review): as written, the second `transition_counts` assignment replaces
# the matrix created on the line before it with zeros, so no smoothing would be
# applied if this were enabled. Also confirm that a scalar fill value of 1
# produces an all-ones matrix in Sage rather than a scalar (identity) matrix.
'''
def construct_markov_matrix(tones_list, num_states=5):
    transition_counts = sa.Matrix(sa.SR, num_states, num_states, 1)  # Laplace smoothing (start with 1)

    # Initialize with zeros
    transition_counts = sa.Matrix(sa.SR, num_states, num_states, 0)  

    for tones in tones_list:
        for i in range(len(tones) - 1):
            current_tone = tones[i] - 1  # Adjusting to 0-based index
            next_tone = tones[i + 1] - 1  # Adjusting to 0-based index
            transition_counts[current_tone, next_tone] += 1

    for i in range(num_states):
        row_sum = sum(transition_counts[i, j] for j in range(num_states))
        if row_sum > 0:
            for j in range(num_states):
                transition_counts[i, j] /= row_sum  # Normalize the row

    return transition_counts
'''

'\ndef construct_markov_matrix(tones_list, num_states=5):\n    transition_counts = sa.Matrix(sa.SR, num_states, num_states, 1)  # Laplace smoothing (start with 1)\n\n    # Initialize with zeros\n    transition_counts = sa.Matrix(sa.SR, num_states, num_states, 0)  \n\n    for tones in tones_list:\n        for i in range(len(tones) - 1):\n            current_tone = tones[i] - 1  # Adjusting to 0-based index\n            next_tone = tones[i + 1] - 1  # Adjusting to 0-based index\n            transition_counts[current_tone, next_tone] += 1\n\n    for i in range(num_states):\n        row_sum = sum(transition_counts[i, j] for j in range(num_states))\n        if row_sum > 0:\n            for j in range(num_states):\n                transition_counts[i, j] /= row_sum  # Normalize the row\n\n    return transition_counts\n'

In [14]:
from sage.all import log
def compute_log_likelihood(matrix, test_tones):
    """
    Calculate the log-likelihood of a sequence of tones given a SageMath Markov matrix.

    The result is the sum of log(matrix[a - 1, b - 1]) over every pair (a, b)
    of consecutive tones in the sequence. Only transitions contribute; no
    probability is assigned to the first tone.

    Args:
        matrix: Transition matrix indexed as matrix[row, col] with 0-based
            indices, holding the probability of moving from tone row+1 to
            tone col+1.
        test_tones: Sequence of 1-based integer tones.

    Returns:
        The accumulated log-likelihood; 0 for sequences with fewer than two
        tones, and -infinity as soon as one transition has probability 0.
    """
    # `log` comes from the cell-level `from sage.all import log`; the extra
    # function-local import of the same name was redundant and was removed.
    log_likelihood = 0
    for current_tone, next_tone in zip(test_tones, test_tones[1:]):
        # Tones are 1-based, matrix indices are 0-based.
        probability = matrix[current_tone - 1, next_tone - 1]
        if probability > 0:
            log_likelihood += log(probability)
        else:
            log_likelihood += float('-inf')  # Log(0) is -infinity
    return log_likelihood


In [15]:
def guess_author(test_tones_list, matrix_zhu, matrix_du):
    """
    Attribute a collection of tone sequences to Zhu Shuzhen or Du Fu.

    The log-likelihood of every sequence is summed under each poet's Markov
    matrix, both totals are printed, and the name of the poet with the
    strictly larger total is returned. Anything else (including a tie)
    yields "Du Fu".
    """
    def total_under(matrix):
        # Sum of per-sequence log-likelihoods for one poet's matrix.
        return sum(compute_log_likelihood(matrix, seq) for seq in test_tones_list)

    zhu_score = total_under(matrix_zhu)
    du_score = total_under(matrix_du)

    print(f"Total log-likelihood for Zhu Shuzhen: {zhu_score}")
    print(f"Total log-likelihood for Du Fu: {du_score}")

    return "Zhu Shuzhen" if zhu_score > du_score else "Du Fu"

In [16]:
def display_markov_matrix(matrix):
    """
    Write a fixed heading line followed by the matrix's printed form to stdout.
    """
    heading = "Markov Matrix (5-tone system):"
    # One call with a newline separator emits the same two lines as two prints.
    print(heading, matrix, sep="\n")

In [17]:
zsz_file = "zsz.txt"  # Zhu Shuzhen's text
df_file = "df.txt"    # Du Fu's text

# One list of tones per line of each poet's file ([] if the file is missing).
zsz_tones = extract_tones_from_file(zsz_file)
df_tones = extract_tones_from_file(df_file)

# Raise instead of calling exit(): inside a notebook exit() asks the kernel to
# shut down, which throws away all state, whereas an exception simply stops
# execution at this cell and reports what went wrong.
if not zsz_tones or not df_tones:
    raise RuntimeError("Failed to read tones from one or both input files.")

In [18]:
# Fit one row-normalized transition matrix per poet from the extracted tones.
matrix_zhu = construct_markov_matrix(zsz_tones)
matrix_du = construct_markov_matrix(df_tones)

print("Zhu Shuzhen's Markov Matrix:", matrix_zhu, sep="\n")
print("\nDu Fu's Markov Matrix:", matrix_du, sep="\n")

# NOTE: the test sequences are the first three lines of the very same data the
# matrices were fitted on, so this is an in-sample check, not a held-out test.
test_tones_zsz = zsz_tones[0:3]  # first 3 sequences from Zhu Shuzhen's text
test_tones_df = df_tones[0:3]    # first 3 sequences from Du Fu's text

# Predict authorship based on multiple test sequences. guess_author prints both
# totals itself, so those lines show up before each "Predicted author" line.
print(
    "\nPredicted author for Zhu Shuzhen's test tones: ",
    guess_author(test_tones_zsz, matrix_zhu, matrix_du),
)
print(
    "Predicted author for Du Fu's test tones: ",
    guess_author(test_tones_df, matrix_zhu, matrix_du),
)

Zhu Shuzhen's Markov Matrix:
[  3/10 47/190 29/190   3/10      0]
[53/156    1/4  11/78   7/26      0]
[35/103 27/103 15/103 26/103      0]
[55/199 56/199 27/199 61/199      0]
[     0      0      0      0      0]

Du Fu's Markov Matrix:
[   1/3 29/103 46/309 73/309      0]
[88/279 73/279 49/279  23/93      0]
[ 27/80 41/160   1/10 49/160      0]
[69/236 37/118 21/118 51/236      0]
[     0      0      0      0      0]
Total log-likelihood for Zhu Shuzhen: 2*log(35/103) + 2*log(53/156) + 6*log(3/10) + log(56/199) + log(55/199) + log(27/103) + log(1/4) + log(47/190) + 2*log(29/190) + log(27/199)
Total log-likelihood for Du Fu: 2*log(27/80) + 3*log(1/3) + 2*log(88/279) + log(37/118) + log(69/236) + log(29/103) + log(73/279) + log(41/160) + 3*log(73/309) + log(21/118) + 2*log(46/309)

Predicted author for Zhu Shuzhen's test tones:  Zhu Shuzhen
Total log-likelihood for Zhu Shuzhen: 2*log(35/103) + 2*log(53/156) + log(61/199) + 5*log(3/10) + 2*log(55/199) + log(7/26) + 4*log(47/190) + log(1