In [None]:
import glob
import re
from os.path import dirname, basename
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm

# Residue names that are compared in this notebook. The order of this list is
# the left-to-right order of the violins in the per-residue plots.
# NOTE(review): besides the standard amino acids the list holds names such as
# ASH/GLH/HIP and S1P/SEP, T1P/TPO, Y1P/PTR -- presumably protonation and
# phosphorylation variants; confirm against the force-field naming scheme.
residues = [
    'ALA', 'ARG', 'ASN', 'ASP', 'ASH', 
    'CYS', 'GLN', 'GLU', 'GLH', 'GLY',
    'HIS', 'HIP', 'ILE', 'LEU', 'LYS',
    'MET', 'PHE', 'PRO', 'TRP', 'VAL',
    'SER', 'S1P', 'SEP',
    'THR', 'T1P', 'TPO',
    'TYR', 'Y1P', 'PTR'
]

%matplotlib inline

### Code to write the DSSP summaries
```python
def get_residues(fpath : str, start_index=2):
    """Decode the residue sequence from the directory two levels above ``fpath``.

    The directory name is scanned left to right: every lowercase ``g`` is
    reported as ``GLY`` and every run of three uppercase letters/digits is
    reported unchanged as a residue name.

    Parameters
    ----------
    fpath : str
        Path to a file located at ``<sequence-dir>/<subdir>/<file>``.
    start_index : int, optional
        Number given to the first residue found (default 2).

    Returns
    -------
    list of (int, str)
        ``(index, residue_name)`` pairs in order of appearance.
    """
    sequence_dir = basename(dirname(dirname(fpath)))
    # The pattern is a single group, so findall yields the full match texts.
    tokens = re.findall(r'(g|[A-Z0-9]{3})', sequence_dir)

    numbered = []
    for position, token in enumerate(tokens, start_index):
        name = 'GLY' if token == 'g' else token
        numbered.append((position, name))
    return numbered

def summarize_dssp(fpath : str):
    """Turn a per-line DSSP assignment file into per-residue code fractions.

    Every line of ``fpath`` is stripped and its first and last characters are
    discarded; the remaining characters are read as one secondary-structure
    code per residue, in residue order. For each residue the occurrences of
    the nine codes ``HBEGIPTS~`` are counted over all lines and divided by the
    total count for that residue.

    Parameters
    ----------
    fpath : str
        Path to the DSSP file; the residue list is derived from this path by
        ``get_residues``.

    Returns
    -------
    pandas.DataFrame
        One row per residue with columns ``#ResIndex``, ``ResName`` and
        ``dssp[X]`` for X in ``HBEGIPTSC`` (counts of ``~`` are stored in the
        ``dssp[C]`` column).

    Raises
    ------
    AssertionError
        If the residues do not all have the same total count.
    ValueError
        If a character outside ``HBEGIPTS~`` is encountered.
    """
    file_codes = "HBEGIPTS~"      # symbols as they appear in the file
    column_codes = "HBEGIPTSC"    # labels used for the output columns

    indexed_residues = get_residues(fpath)
    counts = np.zeros((9, len(indexed_residues)), dtype=float)

    # Tally: rows are codes, columns are residue positions
    with open(fpath, "r") as handle:
        for line in handle:
            assignments = line.strip()[1:-1]
            for column, code in enumerate(assignments):
                counts[file_codes.index(code), column] += 1

    # Every residue must have been assigned a code on the same number of lines
    totals = counts.sum(axis=0)
    assert (totals == totals[0]).all()
    counts /= totals

    # Assemble the table column by column
    columns = {
        "#ResIndex": [number for number, _ in indexed_residues],
        "ResName": [name for _, name in indexed_residues],
    }
    for row, code in zip(counts, column_codes):
        columns[f"dssp[{code}]"] = row
    return pd.DataFrame(columns)

# Summarize every raw DSSP file and save the table in the same directory
dssp_paths = glob.glob("../*peptides/gg*/analy/dssp.dat")
for fpath in tqdm.tqdm(dssp_paths):
    df = summarize_dssp(fpath)
    outpath = dirname(fpath) + "/dssp_summary.dat"
    df.to_csv(outpath, float_format="%.4f", index=False)
```

# Analyze the DSSP summary files

### Load DSSP-summary files

In [None]:
# Read every per-peptide summary file and tag its rows with peptide-level
# information. The frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration.
summary_frames = []

for fpath in tqdm.tqdm(glob.glob("../*peptides/gg*/analy/dssp_summary.dat")):
    df = pd.read_csv(fpath)
    df["plength"] = df.shape[0]                # number of rows (one per residue) in this file
    df["psequence"] = "-".join(df["ResName"])  # residue names of the file joined by "-"
    summary_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file matched the pattern. Row labels are kept as read from each file.
dsspdata = pd.concat(summary_frames, axis=0) if summary_frames else pd.DataFrame()

In [None]:



# One figure per DSSP column: violin plots of the per-residue fractions,
# restricted to peptides with 9 residues.
# The length filter does not depend on the residue or the DSSP column, so it
# is applied once instead of inside the inner comprehension.
selected = dsspdata.loc[dsspdata["plength"] == 9]
xvals = np.arange(len(residues))

for attr in (f"dssp[{ss}]" for ss in "HBEGIPTSC"):
    # Data: one sample per residue name, in the order of `residues`
    data = [selected.loc[selected["ResName"] == res, attr] for res in residues]
    # An empty sample is expected to make violinplot fail, so only residues
    # that actually have data are drawn; their x-slots are kept so the tick
    # labels still line up.
    has_data = np.array([len(sample) > 0 for sample in data], dtype=bool)

    # Plot
    fig, ax = plt.subplots(figsize=(8,2))
    ax.set_title(attr)
    if has_data.any():
        ax.violinplot(
            [sample for sample, keep in zip(data, has_data) if keep],
            positions=xvals[has_data],
            widths=.8,
        )
    ax.set_xticks(xvals)
    ax.set_xticklabels(residues, rotation=90.)