# Análise de Serinas Fosforiláveis em *S. cerevisiae*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style and "talk" context to the figures drawn below.
sns.set(style="whitegrid", context="talk")

In [None]:
# Read the feature table: tab-separated, no header row, lines starting with
# '#' are skipped.
features = pd.read_csv("uniprotkb_s_cerevisiae.gff", sep='\t', comment='#', header=None, low_memory=False)
# Name the columns positionally. The assignment fails unless the parsed table
# has exactly ten columns.
# NOTE(review): 'extra' presumably absorbs a trailing tab in the export, since
# the other nine names match the usual GFF layout — confirm against the file.
features.columns = ['Entry', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes', 'extra']

# Read only the "Entry" and "Sequence" columns of the companion TSV table.
seq_df = pd.read_csv("uniprotkb_s_cerevisiae.tsv", sep="\t", usecols=["Entry", "Sequence"])

In [None]:
def filtrar_fosfoserinas(features):
    """Map each entry to the set of its annotated phosphoserine positions.

    A row is kept when its ``type`` is exactly "Modified residue" and its
    ``attributes`` text contains "Phosphoserine" (case-insensitive). Rows
    whose ``attributes`` value is missing are not kept.

    Args:
        features: DataFrame with ``Entry``, ``type``, ``start`` and
            ``attributes`` columns.

    Returns:
        dict mapping each ``Entry`` value to a set of ``int(start)``
        positions taken from the kept rows.
    """
    is_modified_residue = features["type"] == "Modified residue"
    mentions_phosphoserine = features["attributes"].str.contains(
        "Phosphoserine", case=False, na=False
    )
    selected = features[is_modified_residue & mentions_phosphoserine]

    positions_by_entry = {}
    for entry, start in zip(selected["Entry"], selected["start"]):
        site = int(start)
        if entry not in positions_by_entry:
            positions_by_entry[entry] = set()
        positions_by_entry[entry].add(site)
    return positions_by_entry

# Entry -> set of phosphoserine positions, built from the feature table.
fosfo_dict = filtrar_fosfoserinas(features)

In [None]:
def extrair_serinas(seq_df, fosfo_dict):
    """List every serine in every sequence together with its 21-residue window.

    Args:
        seq_df: DataFrame with ``Entry`` and ``Sequence`` columns.
        fosfo_dict: dict mapping an entry to a set of 1-based positions;
            entries absent from the dict are treated as having no positions.

    Returns:
        A list with one inner list per 'S' found, laid out as
        ``[entry, 1-based position, position-in-fosfo_dict flag, w-10 ... w+10]``
        where the 21 window residues are single characters and positions
        falling outside the sequence are filled with 'X'.
    """
    flank = 10
    padding = 'X' * flank
    rows = []
    for entry, seq in zip(seq_df["Entry"], seq_df["Sequence"]):
        known_sites = fosfo_dict.get(entry, set())
        # With `flank` pad characters on each side, padded[k] == seq[k - flank],
        # so a slice starting at idx is centred on seq[idx].
        padded = padding + seq + padding
        for idx, residue in enumerate(seq):
            if residue != 'S':
                continue
            site = idx + 1
            context = list(padded[idx:idx + 2 * flank + 1])
            rows.append([entry, site, site in known_sites, *context])
    return rows

# One record per serine found in the loaded sequences.
records = extrair_serinas(seq_df, fosfo_dict)

In [None]:
def construir_dataframe(records):
    """Wrap the serine records in a DataFrame with named columns.

    Args:
        records: sequence of 24-item rows: entry, position, flag, then 21
            window residues.

    Returns:
        DataFrame whose columns are "entry", "pos", "known P" followed by the
        window offsets "-10" .. "10" as strings.
    """
    window_labels = list(map(str, range(-10, 11)))
    header = ["entry", "pos", "known P", *window_labels]
    return pd.DataFrame(records, columns=header)

# Tabular view of all serine records (one row per serine).
df_serinas = construir_dataframe(records)

In [None]:
# Split the serines by the "known P" flag: rows flagged True vs. rows flagged False.
df_knownP = df_serinas[df_serinas["known P"] == True]
df_notP = df_serinas[df_serinas["known P"] == False]

In [None]:
def calcular_log2_frequencias(df, pseudocount=0.0):
    """Compute per-position log2 enrichment of each amino acid.

    For every flanking position (columns "-10" .. "10", excluding "0") the
    frequency of each of the 20 standard amino acids is divided by that amino
    acid's frequency pooled over all flanking positions of the same ``df``,
    and the base-2 logarithm of the ratio is returned. Residues outside the
    20-letter alphabet (for example the 'X' padding) are not counted.

    Args:
        df: DataFrame containing the string-named columns "-10" .. "10"
            holding one residue per cell.
        pseudocount: non-negative value added to every (amino acid, position)
            count before frequencies are computed. The default 0.0 leaves the
            counts unchanged, in which case an amino acid never seen at a
            position yields ``-inf`` and one never seen anywhere yields NaN.
            Any positive value makes every cell finite.

    Returns:
        DataFrame of floats, indexed by amino acid letter with one column per
        flanking position.

    Raises:
        ValueError: if ``pseudocount`` is negative.
        KeyError: if ``df`` lacks one of the position columns.
    """
    if pseudocount < 0:
        raise ValueError("pseudocount must be non-negative")

    posicoes = [str(i) for i in range(-10, 11) if i != 0]
    aminoacidos = list("ACDEFGHIKLMNPQRSTVWY")

    # Count each position once; the same matrix feeds both the per-position
    # and the pooled frequencies. Raw arrays are used so that no index or
    # column names leak into the result.
    counts = pd.DataFrame(
        {
            pos: df[pos].value_counts().reindex(aminoacidos, fill_value=0).to_numpy()
            for pos in posicoes
        },
        index=aminoacidos,
        columns=posicoes,
    ).astype(float) + pseudocount

    f_global = counts.sum(axis=1) / counts.to_numpy().sum()
    f_pos = counts / counts.sum(axis=0)

    # log2(0) and 0/0 are expected for unseen residues; the resulting
    # -inf / NaN are kept, only the floating-point warnings are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        log2_matrix = np.log2(f_pos.div(f_global, axis=0))

    return log2_matrix

# Enrichment matrices for the two groups; each uses its own rows as background.
log2_knownP = calcular_log2_frequencias(df_knownP)
log2_notP = calcular_log2_frequencias(df_notP)

In [None]:
# Heatmap of the enrichment matrix for the "known P" serines, using a
# diverging colormap centred on 0 (no enrichment).
plt.figure(figsize=(20, 8))
sns.heatmap(log2_knownP, cmap="seismic", center=0, cbar_kws={"label": "log₂(f / f_global)"})
plt.title("Serinas Fosforiláveis - log₂(f / f_global)")
plt.xlabel("Posição relativa à Serina")
plt.ylabel("Aminoácido")
plt.show()

In [None]:
# Same heatmap for the serines whose "known P" flag is False.
plt.figure(figsize=(20, 8))
sns.heatmap(log2_notP, cmap="seismic", center=0, cbar_kws={"label": "log₂(f / f_global)"})
plt.title("Serinas Não Fosforiláveis - log₂(f / f_global)")
plt.xlabel("Posição relativa à Serina")
plt.ylabel("Aminoácido")
plt.show()

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

def plotar_heatmap(df_log2, titulo):
    """Draw an annotated heatmap of a log2 enrichment matrix and show it.

    The colour scale is fixed to the range [-3, 3] and centred on 0, and each
    cell is annotated with its value formatted to two decimals.

    Args:
        df_log2: matrix passed to ``sns.heatmap`` as the data to plot.
        titulo: text used as the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    heatmap_options = {
        "cmap": "seismic",
        "center": 0,
        "vmin": -3,
        "vmax": 3,
        "annot": True,
        "fmt": ".2f",
        "linewidths": 0.1,
        "cbar_kws": {'label': 'log₂(f / f_global)'},
    }

    plt.figure(figsize=(14, 7))
    sns.heatmap(df_log2, **heatmap_options)

    plt.title(titulo, fontsize=14)
    plt.xlabel("Posição relativa à Serina")
    plt.ylabel("Aminoácido")

    plt.tight_layout()
    plt.show()

# Plot for the serines flagged as known phosphorylation sites
plotar_heatmap(log2_knownP, "Serinas Fosforiláveis - log₂(f / f_global)")

# Plot for the serines not flagged as known phosphorylation sites
plotar_heatmap(log2_notP, "Serinas Não Fosforiláveis - log₂(f / f_global)")
