In [36]:
!gdown 1BheorA2H5R_KfC0nykchyaUNZtpJHW1o

Downloading...
From: https://drive.google.com/uc?id=1BheorA2H5R_KfC0nykchyaUNZtpJHW1o
To: /content/data.pdf
  0% 0.00/22.9k [00:00<?, ?B/s]100% 22.9k/22.9k [00:00<00:00, 51.8MB/s]


In [37]:
!pip install pymupdf pandas




In [38]:
import fitz  # PyMuPDF
import pandas as pd

pdf_file = "/content/data.pdf"

def extract_text_from_pdf(pdf_file):
    doc = fitz.open(pdf_file)
    extracted_text = ""
    for page in doc:
        extracted_text += page.get_text() + "\n"
    return extracted_text

def parse_text_to_dataframe(text):
    rows = text.strip().split("\n")
    data = [row.split(",") for row in rows if row.count(",") == 3]
    headers = data[0]
    rows_data = data[1:]
    df = pd.DataFrame(rows_data, columns=headers)
    df["StudentID"] = df["StudentID"].astype(int)
    df["Day"] = df["Day"].astype(int)
    return df

def calculate_hmm_matrices(df):
    # Initial Probability Distribution
    initial_counts = df[df["Day"] == 1]["Mood"].value_counts()
    total_students = len(df["StudentID"].unique())
    initial_prob = {mood: (count) / (total_students) for mood, count in initial_counts.items()}

    # Transition Matrix
    transition_pairs = []
    for student_id, group in df.groupby("StudentID"):
        moods = group["Mood"].tolist()
        transition_pairs.extend(zip(moods[:-1], moods[1:]))

    transition_counts = pd.Series(transition_pairs).value_counts()
    transition_totals = df["Mood"].value_counts()
    states = ["H", "S"]

    transition_matrix = {
        (m1, m2): (transition_counts.get((m1, m2), 0) + 1) / (transition_totals.get(m1, 0) + len(states))
        for m1 in states for m2 in states
    }

    # Emission Matrix
    emission_counts = df.groupby(["Mood", "ShirtColor"]).size()
    mood_totals = df["Mood"].value_counts()
    colors = ["R", "G", "B"]

    emission_matrix = {
        (mood, color): (emission_counts.get((mood, color), 0) + 1) / (mood_totals.get(mood, 0) + len(colors))
        for mood in states for color in colors
    }

    return initial_prob, transition_matrix, emission_matrix

def calculate_sequence_probability(sequence, colors, initial_prob, transition_matrix, emission_matrix):
    prob = initial_prob.get(sequence[0], 0) * emission_matrix.get((sequence[0], colors[0]), 0)
    if prob == 0:
        return 0

    for i in range(1, len(sequence)):
        transition_prob = transition_matrix.get((sequence[i - 1], sequence[i]), 0)
        emission_prob = emission_matrix.get((sequence[i], colors[i]), 0)

        if transition_prob == 0 or emission_prob == 0:
            return 0

        prob *= transition_prob * emission_prob

    return prob

text = extract_text_from_pdf(pdf_file)
df = parse_text_to_dataframe(text)

initial_prob, transition_matrix, emission_matrix = calculate_hmm_matrices(df)

print("Initial Probability Distribution:")
print(initial_prob)

print("\nTransition Matrix:")
print(transition_matrix)

print("\nEmission Matrix:")
print(emission_matrix)

observed_colors = ["R", "B", "G"]
possible_sequences = [
    ("H", "H", "H"), ("H", "H", "S"), ("H", "S", "H"), ("H", "S", "S"),
    ("S", "H", "H"), ("S", "H", "S"), ("S", "S", "H"), ("S", "S", "S"),
]

sequence_probabilities = {
    seq: calculate_sequence_probability(seq, observed_colors, initial_prob, transition_matrix, emission_matrix)
    for seq in possible_sequences
}

print("\nSequence Probabilities:")
for seq, prob in sequence_probabilities.items():
    print(f"{seq}: {prob}")

most_likely_sequence = max(sequence_probabilities, key=sequence_probabilities.get)
print("\nMost Likely Sequence:")
print(most_likely_sequence)

Initial Probability Distribution:
{'H': 0.6, 'S': 0.4}

Transition Matrix:
{('H', 'H'): 0.6271186440677966, ('H', 'S'): 0.3389830508474576, ('S', 'H'): 0.4222222222222222, ('S', 'S'): 0.5111111111111111}

Emission Matrix:
{('H', 'R'): 0.7, ('H', 'G'): 0.2833333333333333, ('H', 'B'): 0.016666666666666666, ('S', 'R'): 0.021739130434782608, ('S', 'G'): 0.15217391304347827, ('S', 'B'): 0.8260869565217391}

Sequence Probabilities:
('H', 'H', 'H'): 0.0007800009575792395
('H', 'H', 'S'): 0.0002264466732448197
('H', 'S', 'H'): 0.014069925489232785
('H', 'S', 'S'): 0.009147629575042988
('S', 'H', 'H'): 1.0872749438212469e-05
('S', 'H', 'S'): 3.1565319444592782e-06
('S', 'S', 'H'): 0.0004392198962247271
('S', 'S', 'S'): 0.00028556092324208263

Most Likely Sequence:
('H', 'S', 'H')
