In [69]:
import pandas as pd
import numpy as np

# MATRIZ DE TRANSICIÓN DE ESTADOS PARA EL ENGAGEMENT

df = pd.read_csv("/Users/joshchaidez/Desktop/Globant_Motivation_Prediction/data/data_globant.csv")

# Separar 10% de las personas para testeo
unique_names = df["Name"].unique()
np.random.seed(42)
test_names = np.random.choice(unique_names, size=int(0.05 * len(unique_names)), replace=False)
train_df = df[~df["Name"].isin(test_names)].reset_index(drop=True)
test_df = df[df["Name"].isin(test_names)].reset_index(drop=True)

# Discretizar
def compute_transition_matrices(
    df,
    numero_estados=10,
    threshold_counts=0,
    threshold_prob=0.05,
    value_col="Engagement",
    id_col="Name",
    sort_cols=None,
):
    """
    Devuelve (transition_counts, transition_prob, bins, labels).
    - numero_estados: cantidad de estados discretos.
    - threshold_counts: mínimo conteo para mantener en transition_counts.
    - threshold_prob: mínimo probabilidad para mantener en transition_prob (se renormaliza luego).
    - value_col: columna con la variable continua a discretizar.
    - id_col: columna que identifica la serie (por ejemplo 'Name').
    - sort_cols: columnas para ordenar antes de generar transiciones (por defecto [id_col,'Year','Month','Day'] si existen).
    """
    if sort_cols is None:
        default_sort = [id_col]
        for c in ("Year", "Month", "Day"):
            if c in df.columns:
                default_sort.append(c)
        sort_cols = default_sort

    # bins y labels
    bins = np.linspace(0, 5, numero_estados + 1)
    labels = bins[1:]  # etiqueta por cada intervalo derecho (ej. 0.5, 1.0, ...)
    
    # trabajar sobre copia para no mutar df original
    tmp = df.copy()
    tmp["Engagement_bin"] = pd.cut(tmp[value_col], bins=bins, labels=labels, include_lowest=True)
    tmp = tmp.sort_values(by=sort_cols)

    # inicializar matriz de conteos
    transition_counts = pd.DataFrame(0, index=labels, columns=labels, dtype=int)

    # contar transiciones
    for _, group in tmp.groupby(id_col):
        states = group["Engagement_bin"].dropna().astype(float).values
        for s1, s2 in zip(states[:-1], states[1:]):
            transition_counts.loc[s1, s2] += 1

    # probabilidades (normalizar por fila)
    transition_prob = transition_counts.div(transition_counts.sum(axis=1).replace(0, np.nan), axis=0)

    # aplicar umbral de conteos
    if threshold_counts is not None and threshold_counts > 0:
        transition_counts = transition_counts.where(transition_counts >= threshold_counts, 0)

    # aplicar umbral de probabilidad y renormalizar filas
    if threshold_prob is not None and threshold_prob > 0:
        transition_prob = transition_prob.where(transition_prob >= threshold_prob, 0)
        transition_prob = transition_prob.div(transition_prob.sum(axis=1).replace(0, np.nan), axis=0)

    return transition_counts, transition_prob, bins, labels

In [70]:
# Calcular matrices de transición para 10 y 5 estados con los datos de entrenamiento
transition_counts10, transition_probs10, bins, labels = compute_transition_matrices(
    train_df,
    numero_estados=10,
    threshold_counts=0,
    threshold_prob=0.05,
    value_col="Engagement",
    id_col="Name",
    sort_cols=["Name", "Month", "Day"],
)

transition_probs10

transition_counts5, transition_probs5, bins, labels = compute_transition_matrices(
    train_df,
    numero_estados=5,
    threshold_counts=0,
    threshold_prob=0.05,
    value_col="Engagement",
    id_col="Name",
    sort_cols=["Name", "Month", "Day"],
)

transition_probs5

Unnamed: 0,1.0,2.0,3.0,4.0,5.0
1.0,0.411371,0.055184,0.324415,0.20903,0.0
2.0,0.071882,0.663848,0.264271,0.0,0.0
3.0,0.0,0.0,0.756668,0.243332,0.0
4.0,0.0,0.0,0.221652,0.698192,0.080156
5.0,0.0,0.0,0.0,0.255539,0.744461


In [71]:
# Testear las matrices de transición en los datos de testeo
def test_transition_matrix(df, transition_probs, state_col="Engagement_bin", id_col="Name", sort_cols=None):
    """
    Devuelve la precisión de predicción usando la matriz de transición dada.
    """
    if sort_cols is None:
        default_sort = [id_col]
        for c in ("Year", "Month", "Day"):
            if c in df.columns:
                default_sort.append(c)
        sort_cols = default_sort

    tmp = df.copy()
    tmp["Engagement_bin"] = pd.cut(tmp["Engagement"], bins=bins, labels=labels, include_lowest=True)
    tmp = tmp.sort_values(by=sort_cols)

    total_predictions = 0
    correct_predictions = 0

    for _, group in tmp.groupby(id_col):
        states = group[state_col].dropna().astype(float).values
        for s1, s2 in zip(states[:-1], states[1:]):
            if s1 in transition_probs.index and s2 in transition_probs.columns:
                predicted_state = transition_probs.loc[s1].idxmax()
                total_predictions += 1
                if predicted_state == s2:
                    correct_predictions += 1

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

In [72]:
# Testear las matrices de 5 y 10 estados
accuracy_10 = test_transition_matrix(test_df, transition_probs10)
accuracy_5 = test_transition_matrix(test_df, transition_probs5)

accuracy_10, accuracy_5

(0.73, 0.73)

In [73]:
# Matriz de transición de orden n
def compute_nth_order_markov(df, order_n=2, state_col="Engagement_bin", id_col="Name", sort_keys=None, possible_states=None):
    """
    Calcula la matriz de transición de orden n.
    Devuelve (transition_counts, transition_probs).
    - order_n: orden de la cadena de Markov.
    - state_col: columna con los estados discretos.
    - id_col: columna que identifica la serie (por ejemplo 'Name').
    - sort_keys: columnas para ordenar antes de generar transiciones.
    - possible_states: lista de estados posibles (etiquetas).
    """
    if sort_keys is None:
        default_sort = [id_col]
        for c in ("Year", "Month", "Day"):
            if c in df.columns:
                default_sort.append(c)
        sort_keys = default_sort

    if possible_states is None:
        possible_states = df[state_col].dropna().unique()

    # Inicializar matriz de conteos
    index = pd.MultiIndex.from_product([possible_states] * order_n, names=[f"State_{i+1}" for i in range(order_n)])
    columns = possible_states
    transition_counts = pd.DataFrame(0, index=index, columns=columns, dtype=int)

    # Contar transiciones
    for _, group in df.sort_values(by=sort_keys).groupby(id_col):
        states = group[state_col].dropna().astype(float).values
        for i in range(len(states) - order_n):
            prev_states = tuple(states[i:i + order_n])
            next_state = states[i + order_n]
            transition_counts.loc[prev_states, next_state] += 1

    # Calcular probabilidades
    transition_probs = transition_counts.div(transition_counts.sum(axis=1).replace(0, np.nan), axis=0)

    return transition_counts, transition_probs


# Testear matriz de segundo orden
def test_nth_order_transition_matrix(df, transition_probs, order_n=2, state_col="Engagement_bin", id_col="Name", sort_keys=None):
    """
    Devuelve la precisión de predicción usando la matriz de transición de orden n dada.
    """
    if sort_keys is None:
        default_sort = [id_col]
        for c in ("Year", "Month", "Day"):
            if c in df.columns:
                default_sort.append(c)
        sort_keys = default_sort

    tmp = df.copy()
    tmp["Engagement_bin"] = pd.cut(tmp["Engagement"], bins=bins, labels=labels, include_lowest=True)
    tmp = tmp.sort_values(by=sort_keys)

    total_predictions = 0
    correct_predictions = 0

    for _, group in tmp.groupby(id_col):
        states = group[state_col].dropna().astype(float).values
        for i in range(len(states) - order_n):
            prev_states = tuple(states[i:i + order_n])
            next_state = states[i + order_n]
            if prev_states in transition_probs.index and next_state in transition_probs.columns:
                predicted_state = transition_probs.loc[prev_states].idxmax()
                total_predictions += 1
                if predicted_state == next_state:
                    correct_predictions += 1

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

In [75]:
# Matriz de segundo orden con 5 estados
# ===============================
transition_counts_2nd_order_5, transition_probs_2nd_order_5 = compute_nth_order_markov(
    train_df,
    order_n=2,
    state_col="Engagement_bin",
    id_col="Name",
    sort_keys=["Name", "Month", "Day"],
    possible_states=labels
)
transition_probs_2nd_order_5

# Testar matriz de segundo orden
accuracy_2nd_order_5 = test_nth_order_transition_matrix(
    test_df,
    transition_probs_2nd_order_5,
    order_n=2,
    state_col="Engagement_bin",
    id_col="Name",
    sort_keys=["Name", "Month", "Day"]
)
accuracy_2nd_order_5

KeyError: 'Engagement_bin'