# JMOD Preprocessing — Python (QuantQC)

This notebook reproduces the `R/Preproc.R` analysis pipeline using the Python `quantqc` package.

In [None]:
import sys, os
sys.path.insert(0, os.path.expanduser('~/Desktop/Github/QuantQC/python'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import quantqc as qqc
from quantqc.core import mat_to_df

---
## 1. Set paths

Update these to match your local file locations.

In [None]:
# Root folders for the raw search output and for the metadata files.
# Both currently point at the same directory; edit them for your machine.
path_raw = '/Users/andrewleduc/Downloads/'
path_meta = '/Users/andrewleduc/Downloads/'

# Raw JMOD table (parquet) and the linker CSV.
data_path, linker_path = (
    os.path.join(path_raw, 'JModPlate1Slide2.parquet'),
    os.path.join(path_meta, 'linker_jmod.csv'),
)

# CellenONE isolation files — one per condition, keyed by condition name.
one, two = (
    os.path.join(path_meta, isolation_file)
    for isolation_file in ('sample_1_try2_isolated.xls', 'sample_2_isolated.xls')
)
all_cells = dict(mouse1=one, mouse2=two)

---
## 2. Import JMOD data and build miceotope matrices

In [None]:
# Import the JMOD table together with the linker file; the result object `r1`
# is threaded through every later step of this notebook.
# plex=3 and carrier=False are passed straight to jmod_to_qqc.
r1 = qqc.jmod_to_qqc(data_path, linker_path, plex=3, carrier=False)
# Quick sanity check on what was loaded.
print(f'Raw data shape: {r1.raw_data.shape}')

In [None]:
# Build cell x peptide matrix and compute H/L isotope ratios (JMOD format).
# NOTE(review): ch_q_val=1 and t=5 are forwarded unchanged; presumably a channel
# q-value cutoff and a labeling time — confirm against the quantqc documentation.
r1 = qqc.miceotope_cell_x_peptide_jmod(r1, ch_q_val=1, t=5)
# Report matrix dimensions: axis 0 is printed as peptides, axis 1 as cells.
print(f'Peptide matrix: {r1.matrices.peptide.shape[0]} peptides x {r1.matrices.peptide.shape[1]} cells')
print(f'Miceotope H rows: {r1.miceotopes.Raw_H.shape[0]}, cols: {r1.miceotopes.Raw_H.shape[1]}')

---
## 3. Link CellenONE metadata

In [None]:
# Link the CellenONE isolation files (one per condition, see `all_cells`) to `r1`.
r1 = qqc.link_cellenone_raw(r1, all_cells)
# Display the first 10 rows of the resulting metadata as the cell output.
r1.meta_data.head(10)

---
## 4. Slide layout visualization

In [None]:
from quantqc.cellenone import plot_slide_layout_celltype, plot_slide_layout_label

# Draw the slide layout twice: first with the cell-type plotter, then with the
# label plotter. Each figure is shown before the next one is created.
for layout_plotter in (plot_slide_layout_celltype, plot_slide_layout_label):
    fig = layout_plotter(r1)
    plt.show()

---
## 5. Evaluate negative controls and filter bad cells

In [None]:
# Evaluate the negative controls; the result is stored back on `r1`.
r1 = qqc.evaluate_negative_controls(r1)

# Plot the negative-control evaluation; the cell-filtering threshold used
# in the next step is chosen by the analyst.
fig = qqc.plot_neg_ctrl(r1)
plt.show()

In [None]:
# Filter cells by log10 total intensity threshold (min_intens=9).
# The cell count (axis 1 of the peptide matrix) is printed before and after
# so the effect of the filter is visible in the output.
print(f'Cells before filtering: {r1.matrices.peptide.shape[1]}')
r1 = qqc.filter_bad_cells(r1, min_intens=9)
print(f'Cells after filtering:  {r1.matrices.peptide.shape[1]}')

---
## 6. Trim extra peptides

Keep at most the five best peptides per protein (ranked by median intensity and coverage).

In [None]:
# Trim abundance peptides.
# The peptide count (axis 0 of the peptide matrix) is printed before and after
# trimming to show how many peptides were dropped.
print(f'Peptides before trim: {r1.matrices.peptide.shape[0]}')
r1 = qqc.trim_extra_peptides(r1)
print(f'Peptides after trim:  {r1.matrices.peptide.shape[0]}')

In [None]:
# Trim miceotope peptides to match the trimmed abundance peptides.
# Row counts of the Raw_H miceotope matrix are printed before and after.
print(f'Miceotope H peptides before trim: {r1.miceotopes.Raw_H.shape[0]}')
r1 = qqc.trim_extra_peptides_miceotopes(r1)
print(f'Miceotope H peptides after trim:  {r1.miceotopes.Raw_H.shape[0]}')

---
## 7. Save raw peptide matrix

In [None]:
# Convert the peptide matrix (values + row/column labels) to a DataFrame.
pep_df = mat_to_df(r1.matrices.peptide, r1.matrices.peptide_rows, r1.matrices.peptide_cols)

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing; exist_ok keeps re-runs harmless.
pep_out_dir = os.path.join(path_meta, '02_raw_reptide_X_singleCell')
os.makedirs(pep_out_dir, exist_ok=True)

pep_df.to_csv(os.path.join(pep_out_dir, 'r1_peptide.csv'))
print(f'Saved peptide matrix: {pep_df.shape}')

---
## 8. Cell size vs MS intensity

In [None]:
# Plot cell size against MS intensity; type_='sample' is passed to the plotter.
fig = qqc.plot_cell_size_vs_intensity(r1, type_='sample')
plt.show()

---
## 9. Normalize and collapse to protein level

In [None]:
# Normalize and collapse the peptide data to protein level.
# NOTE(review): opt=1 and lc_correct=True are forwarded unchanged; their exact
# meaning is defined by quantqc — confirm against the package documentation.
r1 = qqc.collapse_to_protein(r1, opt=1, lc_correct=True)
# Report the protein matrix size: axis 0 is printed as proteins, axis 1 as cells.
print(f'Protein matrix: {r1.matrices.protein.shape[0]} proteins x {r1.matrices.protein.shape[1]} cells')

---
## 10. Protein and peptide counts / data completeness

In [None]:
# Two summary figures in sequence: protein/peptide counts first, then data
# completeness. Each is shown before the next one is created.
for summary_plotter in (qqc.plot_prot_and_pep, qqc.plot_data_complete):
    fig = summary_plotter(r1)
    plt.show()

---
## 11. Shared peptide correlations

In [None]:
# Compute shared-peptide correlations; results are stored on `r1`.
r1 = qqc.shared_peptide_cor(r1)

# Visualize the peptide correlations.
fig = qqc.plot_pep_cor(r1)
plt.show()

# Summarize with the median of the 'Cor' column of the first element of r1.pep_cor.
median_cor = r1.pep_cor[0]['Cor'].median()
print(f'Median peptide correlation: {median_cor:.3f}')

---
## 12. KNN imputation

In [None]:
# Run KNN imputation with the package defaults; the result is stored on `r1`.
r1 = qqc.knn_impute(r1)
print('KNN imputation complete.')

---
## 13. Batch correction (label + sample)

In [None]:
# Batch correction for mTRAQ label bias and sample effects.
# The R script uses limma::removeBatchEffect with batch=label, batch2=sample.
# The Python QuantQC batch_correct performs mean-centering per batch group.
# NOTE(review): run=False, labels=True presumably disables the run term and
# enables the label term — confirm that sample effects are also covered.
r1 = qqc.batch_correct(r1, run=False, labels=True)
print('Batch correction complete.')

---
## 14. Hemoglobin regression correction

For each protein, regress out hemoglobin contamination (P01942).
If R² > 0.05, replace values with residuals re-centered to zero mean.

In [None]:
from quantqc.core import mat_to_df, df_to_mat

# Work on a labelled DataFrame view of the protein matrix (rows = proteins).
prot_df = mat_to_df(r1.matrices.protein, r1.matrices.protein_rows, r1.matrices.protein_cols)

# Hemoglobin accession used as the contamination covariate.
hb_id = 'P01942'
if hb_id in prot_df.index:
    hb = prot_df.loc[hb_id].values.astype(float)
    adj = prot_df.copy()

    for prot in prot_df.index:
        # Never regress hemoglobin on itself: that fit has R² = 1 and all-zero
        # residuals, which would wipe out the hemoglobin row that is still
        # needed downstream (e.g. for colouring the PCA by P01942).
        if prot == hb_id:
            continue

        y = prot_df.loc[prot].values.astype(float)
        # Only cells where both the protein and hemoglobin are measured.
        ok = np.isfinite(y) & np.isfinite(hb)
        if ok.sum() < 3:
            # Too few paired observations for a meaningful fit; leave as is.
            continue

        # Simple linear regression: y ~ intercept + hemoglobin
        X = np.column_stack([np.ones(ok.sum()), hb[ok]])
        beta, _, _, _ = np.linalg.lstsq(X, y[ok], rcond=None)
        resid = y[ok] - X @ beta

        ss_res = np.sum(resid ** 2)
        ss_tot = np.sum((y[ok] - np.mean(y[ok])) ** 2)
        # A constant protein (ss_tot == 0) cannot be explained by hemoglobin.
        r2 = 1 - ss_res / ss_tot if ss_tot > 0 else 0

        # Only correct proteins where hemoglobin explains more than 5% of variance.
        if np.isfinite(r2) and r2 > 0.05:
            # Residuals re-centered to zero mean. Cells without a paired
            # observation (protein or hemoglobin missing) become NaN.
            res_full = np.full(len(y), np.nan)
            res_full[ok] = resid - np.mean(resid)
            adj.loc[prot] = res_full

    # Write the corrected values back; row/column labels are unchanged.
    r1.matrices.protein, _, _ = df_to_mat(adj)
    print(f'Hemoglobin correction applied. Regressed {hb_id} from protein matrix.')
else:
    print(f'{hb_id} not found in protein matrix — skipping hemoglobin correction.')

---
## 15. PCA

In [None]:
# Compute the PCA and store it on `r1`; imputed=False is passed to compute_pca.
r1 = qqc.compute_pca(r1, imputed=False)

In [None]:
# Hemoglobin (P01942): overlay this protein on the PCA with imputed=False,
# to check visually whether hemoglobin still tracks a principal component.
fig = qqc.feature_pca(r1, prot='P01942', imputed=False)
plt.show()

In [None]:
# Other sources of variance, overlaid on the PCA one protein at a time:
#   P12710 — Fatty acid-binding protein
#   P00329 — Alcohol dehydrogenase
for marker_id in ('P12710', 'P00329'):
    fig = qqc.feature_pca(r1, prot=marker_id, imputed=False)
    plt.show()

In [None]:
# Portal markers, overlaid on the PCA one protein at a time:
#   P33267 — Cyp2f2
#   Q61176 — Arg1
#   Q91YI0 — Asl
for marker_id in ('P33267', 'Q61176', 'Q91YI0'):
    fig = qqc.feature_pca(r1, prot=marker_id, imputed=False)
    plt.show()

In [None]:
# Central markers, overlaid on the PCA one protein at a time:
#   Q05421 — Cyp2e1
#   P15105 — Glul
for marker_id in ('Q05421', 'P15105'):
    fig = qqc.feature_pca(r1, prot=marker_id, imputed=False)
    plt.show()

In [None]:
# Draw the PCA once per covariate to look for biological (Condition) and
# technical (Run order, Total protein, Label) structure.
for by in ['Condition', 'Run order', 'Total protein', 'Label']:
    fig = qqc.plot_pca(r1, by=by)
    plt.show()

---
## 16. UMAP

In [None]:
# Compute the UMAP with the package defaults and store it on `r1`.
# NOTE(review): no random seed is set in this notebook — confirm whether
# compute_umap is deterministic if exact reproducibility matters.
r1 = qqc.compute_umap(r1)

In [None]:
# Draw the UMAP once per covariate, one figure each, in the listed order.
for by in ['Cluster', 'Total protein', 'Run order', 'Label', 'Condition']:
    fig = qqc.plot_umap(r1, by=by)
    plt.show()

In [None]:
# Overlay a single protein on the UMAP (same accession as the Cyp2f2 portal
# marker used in the PCA section).
fig = qqc.feature_umap(r1, prot='P33267')  # Cyp2f2
plt.show()

---
## 17. Miceotope protein-level collapse and turnover visualization

In [None]:
# Collapse the miceotope data to protein level; results are stored on r1.miceotopes.
r1 = qqc.miceotope_protein_collapse(r1)
# Report the shape of the protein-level H/L matrix.
print(f'Miceotope H/L protein matrix: {r1.miceotopes.HovL_prot.shape}')

In [None]:
# Turnover visualization on the UMAP reduction; by='Total' is passed to the plotter.
# Requires the UMAP and the miceotope protein collapse from the cells above.
fig = qqc.mice_dim_plot_turnover(r1, reduct='UMAP', by='Total')
plt.show()

---
## 18. Miceotope peptide correlations

In [None]:
# Plot the miceotope peptide correlations with the package defaults.
fig = qqc.mice_pep_cor_plot(r1)
plt.show()

---
## 19. Abundance vs turnover correlations

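Compare degradation rate (alpha) to relative protein abundance along the
portal–central zonation axis (Cyp2e1, Q05421).

**Note:** only the abundance-vs-axis correlation (`cors_axis`) is computed so far. The alpha-based columns (`cor_deg`, `cor_deg_axis`) are NaN placeholders until the alpha matrix is aligned with the abundance matrix.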

In [None]:
from quantqc.utils import normalize as qqc_normalize

# Labelled protein abundance matrix (rows = proteins, columns = cells).
prot_df = mat_to_df(r1.matrices.protein, r1.matrices.protein_rows, r1.matrices.protein_cols)

# Protein-level alpha (degradation) values.
# TODO: not used yet — the alpha matrix must first be aligned (same proteins and
# cells) with `prot_df` before the alpha correlations below can be computed.
alpha_prot = r1.miceotopes.Alpha_prot

# Find overlapping proteins between alpha and abundance
# (miceotope protein collapse may use different row indexing).
# Use the miceotope peptide-protein map to get protein names.
mice_ppm = r1.miceotopes.peptide_protein_map
mice_prots = mice_ppm['Protein'].unique()

sect = np.intersect1d(mice_prots, prot_df.index.values)
print(f'Overlapping proteins (abundance & miceotope): {len(sect)}')

# Correlate each protein's abundance with the portal-central axis (Cyp2e1).
# cor(alpha, abundance) and cor(alpha, axis) stay NaN until alpha is aligned.
axis_prot = 'Q05421'  # Cyp2e1
deg_cor_columns = ['prot', 'cor_deg', 'cor_deg_axis', 'cors_axis', 'numb']
if axis_prot in prot_df.index:
    axis_vals = prot_df.loc[axis_prot].values.astype(float)

    cors_deg = []
    cors_axis = []
    cor_deg_axis = []
    numb_dp = []

    for prot in sect:
        abund = prot_df.loc[prot].values.astype(float)

        # Number of usable data points. Until alpha is aligned this counts
        # finite abundance values only (TODO: AND with finite alpha values).
        ok = np.isfinite(abund)
        n_pairs = ok.sum()
        numb_dp.append(n_pairs)

        # cor(abundance, axis) on cells where both are measured
        ok2 = np.isfinite(abund) & np.isfinite(axis_vals)
        if ok2.sum() >= 3:
            cors_axis.append(np.corrcoef(abund[ok2], axis_vals[ok2])[0, 1])
        else:
            cors_axis.append(np.nan)

        cors_deg.append(np.nan)      # requires aligned alpha matrix
        cor_deg_axis.append(np.nan)   # requires aligned alpha matrix

    df_deg_cor = pd.DataFrame({
        'prot': sect,
        'cor_deg': cors_deg,
        'cor_deg_axis': cor_deg_axis,
        'cors_axis': cors_axis,
        'numb': numb_dp
    }, columns=deg_cor_columns)
    # Keep only proteins with enough observations for a stable correlation.
    df_deg_cor = df_deg_cor[df_deg_cor['numb'] > 50]
    print(f'Proteins with >50 data points: {len(df_deg_cor)}')
else:
    # Keep `df_deg_cor` defined (empty, same columns) so the display below
    # and any later cell do not fail with a NameError.
    df_deg_cor = pd.DataFrame(columns=deg_cor_columns)
    print(f'{axis_prot} not found in protein matrix.')

# Must be the last top-level expression: Jupyter does not display an
# expression that sits inside an if/else body.
df_deg_cor.head(10)