# Filter peak set by spm

In [1]:
import pandas as pd
import os
from pathlib import Path

In [2]:
# Relative output folder for the filtered peak BED files written below.
result_dir = Path("subclass_regional_ldsc/")

# Make the SNARE data root the working directory so that the relative paths
# used by the following cells resolve against it.
root_dir = Path("/tscc/projects/ps-epigen/users/biy022/biccn/data/SNAREdata/")
os.chdir(root_dir)

### Filtering based on spm ranking (top 5000 peaks per label among those with spm > 2)

In [5]:
# For every subclass directory: read the union peak set, keep peaks whose
# spm exceeds SPM_CUTOFF, and write
#   * one BED file per label holding that label's N highest-spm peaks, and
#   * "union_set.bed" holding every peak that passed the cutoff
#     (NOTE(review): the union file is not restricted to the top-N
#     selection — confirm this is intended).
N = 5000            # maximum number of peaks kept per label
SPM_CUTOFF = 2.0    # peaks must have spm strictly greater than this
BED_COLUMNS = ["seqnames", "start", "end", "name", "score", "strand"]

for subclass_dir in Path("subclass_regional_fragments_peaks/").iterdir():
    # iterdir() also yields plain files; only directories hold a peak set,
    # so anything else is skipped instead of crashing read_csv.
    if not subclass_dir.is_dir():
        continue
    print(subclass_dir)
    # .name (not .stem) so a dot inside a directory name is not cut off.
    subclass = subclass_dir.name

    union_table = pd.read_csv(
        subclass_dir / "regional_raw_peaks/union_peaks.filteredNfixed.union.peakSet",
        sep="\t", header=0
    )
    union_table_filtered = union_table[union_table["spm"] > SPM_CUTOFF]
    # Sort by label, then by descending spm, so head(N) within each label
    # group returns that label's N highest-spm peaks.
    union_table_filtered_selected = union_table_filtered.sort_values(
        by=["label", "spm"], ascending=[True, False]).groupby("label").head(N)

    result_dir_subclass = result_dir / subclass / "peaks"
    result_dir_subclass.mkdir(exist_ok=True, parents=True)
    for label, subdf in union_table_filtered_selected.groupby("label"):
        subdf[BED_COLUMNS].to_csv(
            result_dir_subclass / "{}.bed".format(label),
            header=False, sep="\t", index=False
        )
    union_table_filtered[BED_COLUMNS].to_csv(
        result_dir_subclass / "union_set.bed",
        header=False, sep="\t", index=False
    )

subclass_regional_fragments_peaks/L2_3_IT
subclass_regional_fragments_peaks/L6_CT
subclass_regional_fragments_peaks/PVALB
subclass_regional_fragments_peaks/L5_6_NP
subclass_regional_fragments_peaks/L5_ET
subclass_regional_fragments_peaks/LAMP5_LHX6
subclass_regional_fragments_peaks/Endo
subclass_regional_fragments_peaks/L4_IT
subclass_regional_fragments_peaks/PAX6
subclass_regional_fragments_peaks/LAMP5
subclass_regional_fragments_peaks/Oligo
subclass_regional_fragments_peaks/L5_IT
subclass_regional_fragments_peaks/Astro
subclass_regional_fragments_peaks/SNCG
subclass_regional_fragments_peaks/VLMC
subclass_regional_fragments_peaks/L6_IT_Car3
subclass_regional_fragments_peaks/L6_IT
subclass_regional_fragments_peaks/Micro_PVM
subclass_regional_fragments_peaks/OPC
subclass_regional_fragments_peaks/SST
subclass_regional_fragments_peaks/SST_CHODL
subclass_regional_fragments_peaks/Chandelier
subclass_regional_fragments_peaks/VIP
subclass_regional_fragments_peaks/L6B


### Filtering based on spm cutoff (>2)

In [6]:
# For every subclass directory: keep the rows of the union peak set whose
# spm exceeds SPM_CUTOFF and write them, with all original columns and no
# header, next to the input as "union_peaks_spm_filtered.bed".
SPM_CUTOFF = 2.0    # peaks must have spm strictly greater than this

for subclass_dir in Path("subclass_regional_fragments_peaks/").iterdir():
    # iterdir() also yields plain files; only directories hold a peak set,
    # so anything else is skipped instead of crashing read_csv.
    if not subclass_dir.is_dir():
        continue
    # .name (not .stem) so a dot inside a directory name is not cut off.
    subclass = subclass_dir.name
    print(subclass)

    union_table = pd.read_csv(
        subclass_dir / "regional_raw_peaks/union_peaks.filteredNfixed.union.peakSet",
        sep="\t", header=0
    )
    union_table_filtered = union_table[union_table["spm"] > SPM_CUTOFF]
    union_table_filtered.to_csv(
        subclass_dir / "regional_raw_peaks/union_peaks_spm_filtered.bed",
        header=False, sep="\t", index=False
    )

L2_3_IT
L6_CT
PVALB
L5_6_NP
L5_ET
LAMP5_LHX6
Endo
L4_IT
PAX6
LAMP5
Oligo
L5_IT
Astro
SNCG
VLMC
L6_IT_Car3
L6_IT
Micro_PVM
OPC
SST
SST_CHODL
Chandelier
VIP
L6B


### Extract summit base for filtered peaks

In [3]:
# Use an absolute target built from root_dir so this cell can be re-run
# safely: a relative chdir fails (or descends one level too deep) once the
# working directory has already been moved here.
os.chdir(root_dir / "subclass_regional_fragments_peaks")

In [7]:
# For every subclass directory in the current working directory (except
# "baseset"): read the headerless spm-filtered peak table, reduce each peak
# to a single-base interval, and write the result next to the input as
# "union_peaks_spm_filtered_summits.bed".
for subclass_dir in Path("./").iterdir():
    # Skip stray files (iterdir() yields them too) and the "baseset" entry.
    if not subclass_dir.is_dir() or subclass_dir.stem == "baseset":
        continue
    raw_peak_dir = subclass_dir / "regional_raw_peaks"
    union_table = pd.read_csv(
        raw_peak_dir / "union_peaks_spm_filtered.bed",
        header=None, sep="\t", index_col=None
    )
    # Take an explicit copy of the column subset: the assignments below
    # would otherwise write into a derived frame and can trigger pandas'
    # SettingWithCopyWarning.
    union_table = union_table[[0, 1, 2, 6, 10]].copy()
    union_table.columns = ["seqname", "start", "end", "name", "score"]
    # NOTE(review): the +249 offset presumably points at the centre of a
    # fixed-width peak — confirm against the peak width used upstream.
    union_table["start"] = union_table["start"] + 249
    # One-base interval starting at the shifted start.
    union_table["end"] = union_table["start"] + 1
    union_table.to_csv(
        raw_peak_dir / "union_peaks_spm_filtered_summits.bed",
        header=False, sep="\t", index=False
    )