In [62]:
def load(
        filename="2020-12-31-published-sars-cov2-tcell-epitopes-corrected-protein-names.csv",
        assays={"AIM", "IFN-γ ELISpot", "MIRA", "T-Scan", "pMHC Multimer"},
        species="Human",
        exposure="infection"):
    """Read the epitope CSV and keep only the rows that pass every filter.

    Filters are applied in this order, and a summary line is printed after
    each one (the denominator is always the number of rows in the raw file):

    1. ``Species`` equals ``species`` (case-insensitive)
    2. ``Exposure`` equals ``exposure`` (case-insensitive)
    3. ``Assay`` is one of ``assays`` (case-insensitive)
    4. ``% Responding`` is not null

    Parameters
    ----------
    filename : path of the CSV file handed to ``pd.read_csv``
    assays : collection of assay names to keep
    species : species name to keep
    exposure : exposure type to keep

    Returns
    -------
    The filtered data frame; row labels of the raw file are preserved.
    """
    table = pd.read_csv(filename)
    n_total = len(table)

    def n_studies(frame):
        # A "study" is a distinct value of the Source column.
        return len(frame.Source.unique())

    print("Loaded %d rows (%d studies) from %s" % (n_total, n_studies(table), filename))

    same_species = table.Species.str.lower() == species.lower()
    table = table[same_species]
    print("Kept %d/%d rows (%d studies) with species = %s" % (len(table), n_total, n_studies(table), species))

    same_exposure = table.Exposure.str.lower() == exposure.lower()
    table = table[same_exposure]
    print("Kept %d/%d rows (%d studies) with exposure = %s" % (len(table), n_total, n_studies(table), exposure))

    # Compare assay names case-insensitively and remember which ones get
    # discarded so they can be reported.
    wanted_assays = {name.lower() for name in assays}
    has_wanted_assay = table.Assay.str.lower().isin(wanted_assays)
    dropped_assays = table[~has_wanted_assay].Assay.unique()
    table = table[has_wanted_assay]
    print("Kept %d/%d rows (%d studies) with valid assay, dropping assays: %s" % (
        len(table), n_total, n_studies(table), dropped_assays))

    table = table[table["% Responding"].notnull()]
    print("Kept %d/%d rows (%d studies) with data for '%% Responding'" % (
        len(table), n_total, n_studies(table)))

    return table

In [63]:
# Load the epitope table using load()'s default file name and filters.
df = load()

Loaded 962 rows (12 studies) from 2020-12-31-published-sars-cov2-tcell-epitopes-corrected-protein-names.csv
Kept 947/962 rows (10 studies) with species = Human
Kept 947/962 rows (10 studies) with exposure = infection
Kept 943/962 rows (8 studies) with valid assay, dropping assays: ['Single chain trimer' 'TCR-seq']
Kept 938/962 rows (8 studies) with data for '% Responding'


In [64]:
# Number of retained rows per distinct Source (study) value.
df.Source.value_counts()

Tarke          803
Peng            41
Schulien        36
Ferretti        29
Nelde           17
Le Bert          8
Shomuradova      2
Snyder           2
Name: Source, dtype: int64

In [65]:
# Sorted list of the distinct Source values left after filtering.
studies = sorted(df.Source.unique())

In [66]:
# How often each value of the "# Responding" column occurs in the retained rows.
df["# Responding"].value_counts()

1.0     673
2.0     130
3.0      55
4.0      22
5.0      17
6.0       8
7.0       7
10.0      5
8.0       4
12.0      4
11.0      3
9.0       3
13.0      2
16.0      2
14.0      1
17.0      1
21.0      1
Name: # Responding, dtype: int64

In [67]:
from collections import Counter

# NOTE: despite the "study_count" in their names, every counter below is
# incremented once per row of df, so the values are row counts rather than
# numbers of distinct studies.

# Exact epitopes, keyed as "<protein> <start>-<end>".
epitope_to_study_count = Counter(
    "%s %d-%d" % (prot, start, end)
    for prot, start, end in zip(df.Protein, df.Start, df.End))

# Coarser groupings, keyed as "<protein> <window start>" where the start
# position is rounded down to a multiple of 10, 20 or 30 residues.
epitope_to_study_count_10aa = Counter(
    "%s %d" % (prot, (start // 10) * 10)
    for prot, start in zip(df.Protein, df.Start))
epitope_to_study_count_20aa = Counter(
    "%s %d" % (prot, (start // 20) * 20)
    for prot, start in zip(df.Protein, df.Start))
epitope_to_study_count_30aa = Counter(
    "%s %d" % (prot, (start // 30) * 30)
    for prot, start in zip(df.Protein, df.Start))
    

In [68]:
# Nested mapping: "<protein> <20-residue window start>" -> {study: "% Responding"}.
# When a study has several rows in the same window, the value of the last
# such row is the one that is kept.
epitope_20aa_to_study_to_percent = {}
for protein, start_pos, study, responding in zip(
        df.Protein, df.Start, df.Source, df["% Responding"]):
    window_key = "%s %d" % (protein, (start_pos // 20) * 20)
    epitope_20aa_to_study_to_percent.setdefault(window_key, {})[study] = responding
    

In [69]:
# Inspect the per-window, per-study "% Responding" values built in the previous cell.
epitope_20aa_to_study_to_percent

{'nsp7 20': {'Ferretti': 0.8888888888888888, 'Le Bert': 0.14285714285714285},
 'S 260': {'Ferretti': 0.7777777777777778,
  'Shomuradova': 0.9285714285714286,
  'Tarke': 0.25},
 'ORF3a 120': {'Ferretti': 0.8888888888888888,
  'Schulien': 1.0,
  'Tarke': 0.07142857142857142},
 'nsp8 140': {'Ferretti': 0.6},
 'N 220': {'Ferretti': 0.2222222222222222,
  'Nelde': 0.6363636363636364,
  'Schulien': 0.2727272727272727,
  'Tarke': 0.07142857142857142},
 'nsp3 80': {'Ferretti': 0.3333333333333333},
 'ORF3a 200': {'Ferretti': 1.0,
  'Peng': 0.09523809523809523,
  'Schulien': 0.8333333333333334,
  'Tarke': 0.08333333333333333},
 'nsp3 800': {'Ferretti': 1.0,
  'Nelde': 0.8333333333333334,
  'Snyder': 0.8333333333333334,
  'Tarke': 0.09090909090909093},
 'nsp3 500': {'Ferretti': 0.8, 'Tarke': 0.6666666666666666},
 'M 160': {'Ferretti': 0.6,
  'Nelde': 0.9545454545454546,
  'Peng': 0.3809523809523809,
  'Tarke': 0.15384615384615385},
 'nsp9 20': {'Ferretti': 1.0, 'Schulien': 0.5},
 'nsp12 720': {'Fe

In [70]:
# 20-residue windows ranked by how many rows (not distinct studies) start in them.
epitope_to_study_count_20aa.most_common()

[('N 300', 15),
 ('N 320', 15),
 ('M 160', 14),
 ('N 100', 13),
 ('S 20', 12),
 ('S 680', 11),
 ('S 700', 11),
 ('M 180', 11),
 ('nsp3 800', 10),
 ('S 1200', 10),
 ('S 200', 10),
 ('S 340', 10),
 ('S 440', 10),
 ('S 860', 10),
 ('M 120', 10),
 ('S 360', 9),
 ('N 120', 9),
 ('ORF3a 100', 9),
 ('S 220', 9),
 ('S 160', 9),
 ('nsp3 1500', 9),
 ('S 320', 9),
 ('S 80', 9),
 ('ORF3a 200', 8),
 ('N 360', 8),
 ('M 140', 8),
 ('nsp3 1360', 8),
 ('S 620', 8),
 ('S 880', 8),
 ('M 80', 8),
 ('S 260', 7),
 ('ORF3a 120', 7),
 ('N 40', 7),
 ('N 80', 7),
 ('N 260', 7),
 ('S 180', 7),
 ('S 1000', 7),
 ('S 1040', 7),
 ('M 60', 7),
 ('M 100', 7),
 ('S 1080', 7),
 ('nsp3 820', 7),
 ('M 20', 7),
 ('N 220', 6),
 ('N 280', 6),
 ('M 40', 6),
 ('S 500', 6),
 ('S 740', 6),
 ('N 0', 6),
 ('N 60', 6),
 ('S 140', 6),
 ('nsp3 1760', 6),
 ('S 820', 6),
 ('S 100', 6),
 ('ORF3a 80', 6),
 ('nsp3 1120', 6),
 ('S 800', 5),
 ('S 1180', 5),
 ('N 340', 5),
 ('ORF3a 140', 5),
 ('ORF3a 220', 5),
 ('N 240', 5),
 ('S 40', 5),
 (

In [117]:
# LaTeX preamble: document class and package imports, ending with the opening
# of the document environment. NOTE: the array package is listed twice.
latex_header = r"""
\documentclass{article}
\usepackage{amsmath}
\usepackage{array}
\usepackage{booktabs}
\usepackage{fullpage}
\usepackage{graphicx}
\usepackage{threeparttable}
\usepackage{wasysym}
\usepackage{array}
\usepackage{stackengine}
\usepackage{tikz}

\begin{document}

"""

# LaTeX macro and column-type definitions used by the generated table:
#   \thickhline   - horizontal rule of height 1pt
#   column type " - vertical rule of width 1pt
#   \rot{text}    - column header rotated by -60 degrees
#   \feature{n}   - "-", \LEFTcircle or \CIRCLE for n = 0, 1, 2
#   column type x - centered paragraph column of a given width
#   \pie{angle}   - circle of radius 1ex with a filled wedge of `angle` degrees
# In the assembled document this text is emitted after latex_header, i.e.
# after \begin{document}.
latex_definitions = r"""

\makeatletter
\newcommand{\thickhline}{%
    \noalign {\ifnum 0=`}\fi \hrule height 1pt
    \futurelet \reserved@a \@xhline
}
\newcolumntype{"}{@{\hskip\tabcolsep\vrule width 1pt\hskip\tabcolsep}}
\makeatother

% rotated text for column headers
\newcommand*\rot[1]{\hbox to1em{\hss\rotatebox[origin=br]{-60}{#1}}}

% circles 
\newcommand*\feature[1]{\ifcase#1 -\or\LEFTcircle\or\CIRCLE\fi}

% column type which is fixed width and centered
\newcolumntype{x}[1]{>{\centering\let\newline\\\arraybackslash\hspace{0pt}}p{#1}}


\newcommand{\pie}[1]{%

\begin{tikzpicture}
 \draw (0,0) circle (1ex);\fill (1ex,0) arc (0:-#1:1ex) -- (0,0) -- cycle;
\end{tikzpicture}%
}

"""

# Closes the document environment opened at the end of latex_header.
latex_footer = r"""
\end{document}
"""

# Example table cell using the \pie macro (a quarter-filled circle): r"\pie{90}"

def aggregate_epitopes_by_window(df, aa_window_size=20):
    """Group epitope rows into fixed-width windows of start position.

    Every row is assigned the key ``"<Protein> <window start>"``, where the
    window start is the row's ``Start`` rounded down to a multiple of
    ``aa_window_size``.

    Parameters
    ----------
    df : data frame with ``Protein``, ``Start``, ``Source`` and
        ``"% Responding"`` columns. ``"% Responding"`` must hold fractions
        in the closed interval [0, 1].
    aa_window_size : width of a window, in residues.

    Returns
    -------
    epitope_to_study_count : collections.Counter
        window key -> number of distinct ``Source`` values with a row in
        that window.
    epitope_to_study_to_fraction : collections.defaultdict(dict)
        window key -> {source: largest ``"% Responding"`` value among that
        source's rows in the window}.

    Raises
    ------
    AssertionError
        If a ``"% Responding"`` value lies outside [0, 1] (NaN included).
    """
    from collections import Counter, defaultdict

    epitope_to_study_to_fraction = defaultdict(dict)
    for prot, start, source, frac in zip(
            df.Protein, df.Start, df.Source, df["% Responding"]):
        key = "%s %d" % (prot, (start // aa_window_size) * aa_window_size)
        assert 0 <= frac <= 1, \
            "'%% Responding' must be a fraction in [0, 1], got %r" % (frac,)
        study_to_fraction = epitope_to_study_to_fraction[key]
        # Keep the strongest response when a study reports several epitopes
        # that fall into the same window.
        study_to_fraction[source] = max(frac, study_to_fraction.get(source, 0))

    # Every row registers its source in the nested dict above, so the number
    # of inner keys equals the number of distinct studies for that window.
    epitope_to_study_count = Counter({
        key: len(study_to_fraction)
        for key, study_to_fraction in epitope_to_study_to_fraction.items()})

    return epitope_to_study_count, epitope_to_study_to_fraction
    
def build_table(df):
    """Render a LaTeX ``table`` of per-study response fractions.

    The table has one column per distinct ``Source`` value (sorted) and one
    row per 20-residue window returned by ``aggregate_epitopes_by_window``,
    ordered by the number of studies reporting the window (most first).
    A cell holds ``\\pie{<degrees>}`` when the study reported the window and
    ``-`` otherwise; the angle is ``360 * fraction`` rounded to the nearest
    whole degree.

    As a side effect, each window key and its study count is printed.

    Study names and window keys are inserted verbatim; this assumes they
    contain no LaTeX special characters.

    Parameters
    ----------
    df : data frame accepted by ``aggregate_epitopes_by_window``.

    Returns
    -------
    str : LaTeX source from ``\\begin{table}`` through ``\\end{table}``.
    """
    # e.g.
    # \begin{table}
    # \begin{tabular}{r cccccccccccc}

    study_names = sorted(df.Source.unique())
    n_studies = len(study_names)
    epitope_to_study_count, epitope_to_study_to_fraction = \
        aggregate_epitopes_by_window(df)

    # Collect fragments and join once at the end instead of growing a string.
    parts = [
        r"\begin{table}", "\n",
        r"\begin{tabular}{r ", "c" * n_studies, "}\n",
        # blank spot in the header for row names
        r"    &", "\n",
    ]

    # Header row: one rotated, bold study name per column.
    for i, study in enumerate(study_names):
        parts.append(r"""
             \rot{\textbf{""")
        parts.append(study)
        parts.append(r"}}")
        if i + 1 < n_studies:
            parts.append(r"    &")
        else:
            parts.append(r"    \\")
            parts.append("\n")

    # Body: one row per window, most widely reported windows first.
    for epitope, count in epitope_to_study_count.most_common():
        print("%s: %d" % (epitope, count))
        study_to_fraction = epitope_to_study_to_fraction[epitope]
        parts.append(r"\textbf{" + epitope + r"} &")
        parts.append("\n")
        for i, study in enumerate(study_names):
            if study in study_to_fraction:
                # Round rather than truncate: floating point error would
                # otherwise turn e.g. 360 * 0.7 into 251 instead of 252.
                degrees = int(round(360 * study_to_fraction[study]))
                cell = r"\pie{" + str(degrees) + r"} "
            else:
                cell = r" - "
            separator = r" &" if i + 1 < n_studies else r" \\"
            # newline (not a raw string)
            parts.append("\t" + cell + separator + "\n")

    parts.append(r"""
        \end{tabular}
        \end{table}
    """)
    return "".join(parts)

# Hand-written sample of the table layout, kept for reference only; the
# document written to disk uses the output of build_table(df) instead.
example_latex_body = r"""
\begin{table}
\begin{tabular}{r cccccccccccc}
    &
    \rot{\textbf{Chour}}        &
    \rot{\textbf{Ferretti}}     & 
    \rot{\textbf{Le Bert}}      &  
    \rot{\textbf{Minervina}}    &
    \rot{\textbf{Nelde}}        &
    \rot{\textbf{Peng}}          &
    \rot{\textbf{Schulien}}      &
    \rot{\textbf{Shomuradova}}  &
    \rot{\textbf{Smith}}        & 
    \rot{\textbf{Snyder}}       &
    \rot{\textbf{Tarke}}        &
    \rot{\textbf{Zhang}}       \\
% \hline 
\textbf{N 104-110} 
    & \pie{90} 
    & \pie{180} 
    & \pie{360} 
    & - 
    & \pie{360} 
    & \pie{360} 
    & - & - & - & - & - & - \\
\textbf{Assay} & & & & & & & & & & & & \\
\textbf{Peptide Selection} & & & & & & & & & & & & \\
\end{tabular}
\end{table}



"""

# Render the table body from the filtered epitope data.
latex_body = build_table(df)

# Assemble the full document: preamble, macro definitions, table, closing tag.
latex_sections = [latex_header, latex_definitions, latex_body, latex_footer]
latex = "\n".join(latex_sections)

# Write the document next to the notebook so it can be compiled.
with open("epitope-table.tex", "w") as tex_file:
    tex_file.write(latex)

N 100: 7
N 300: 6
S 1200: 5
N 320: 5
N 220: 4
ORF3a 200: 4
nsp3 800: 4
M 160: 4
N 80: 4
S 260: 3
ORF3a 120: 3
N 360: 3
N 120: 3
ORF3a 100: 3
nsp13 380: 3
N 40: 3
S 860: 3
nsp7 20: 2
nsp3 500: 2
nsp9 20: 2
nsp12 720: 2
S 360: 2
nsp4 180: 2
nsp13 580: 2
N 260: 2
N 280: 2
M 40: 2
S 220: 2
S 160: 2
S 180: 2
S 200: 2
S 340: 2
S 380: 2
S 440: 2
S 500: 2
S 720: 2
S 740: 2
S 800: 2
S 1160: 2
S 1180: 2
N 0: 2
N 340: 2
M 120: 2
M 140: 2
M 200: 2
ORF3a 140: 2
ORF3a 220: 2
nsp3 1500: 2
N 200: 2
nsp3 1560: 2
nsp6 40: 2
nsp3 1780: 2
nsp3 1360: 2
N 60: 2
S 680: 2
N 240: 2
nsp3 720: 2
nsp3 540: 2
nsp12 860: 2
S 1000: 2
nsp8 140: 1
nsp3 80: 1
nsp5 160: 1
nsp2 620: 1
nsp2 100: 1
nsp9 60: 1
ORF3 60: 1
ORF3 100: 1
S 520: 1
ORF3a 180: 1
ORF7a 0: 1
ORF7a 40: 1
ORF7a 60: 1
nsp13 200: 1
nsp3 1540: 1
nsp4 0: 1
nsp3 880: 1
nsp2 560: 1
nsp3 1240: 1
nsp8 0: 1
nsp2 280: 1
nsp14 60: 1
S 20: 1
S 60: 1
S 400: 1
ORF3a 20: 1
S 1120: 1
S 240: 1
S 620: 1
nsp4 120: 1
nsp4 360: 1
S 40: 1
S 140: 1
S 300: 1
S 700: 1
S 880: 1

In [118]:
# Show the assembled LaTeX source for inspection.
print(latex)


\documentclass{article}
\usepackage{amsmath}
\usepackage{array}
\usepackage{booktabs}
\usepackage{fullpage}
\usepackage{graphicx}
\usepackage{threeparttable}
\usepackage{wasysym}
\usepackage{array}
\usepackage{stackengine}
\usepackage{tikz}

\begin{document}




\makeatletter
\newcommand{\thickhline}{%
    \noalign {\ifnum 0=`}\fi \hrule height 1pt
    \futurelet \reserved@a \@xhline
}
\newcolumntype{"}{@{\hskip\tabcolsep\vrule width 1pt\hskip\tabcolsep}}
\makeatother

% rotated text for column headers
\newcommand*\rot[1]{\hbox to1em{\hss\rotatebox[origin=br]{-60}{#1}}}

% circles 
\newcommand*\feature[1]{\ifcase#1 -\or\LEFTcircle\or\CIRCLE\fi}

% column type which is fixed width and centered
\newcolumntype{x}[1]{>{\centering\let\newline\\\arraybackslash\hspace{0pt}}p{#1}}


\newcommand{\pie}[1]{%

\begin{tikzpicture}
 \draw (0,0) circle (1ex);\fill (1ex,0) arc (0:-#1:1ex) -- (0,0) -- cycle;
\end{tikzpicture}%
}


\begin{table}
\begin{tabular}{r cccccccc}
    &

             \rot{\tex

In [119]:
# Compile the generated LaTeX file with XeLaTeX (shell escape; needs xelatex on PATH).
!xelatex epitope-table.tex

This is XeTeX, Version 3.14159265-2.6-0.999992 (TeX Live 2020) (preloaded format=xelatex)
 restricted \write18 enabled.
entering extended mode
(./epitope-table.tex
LaTeX2e <2020-02-02> patch level 5
L3 programming layer <2020-03-06>
(/usr/local/texlive/2020/texmf-dist/tex/latex/base/article.cls
Document Class: article 2019/12/20 v1.4l Standard LaTeX document class
(/usr/local/texlive/2020/texmf-dist/tex/latex/base/size10.clo))
(/usr/local/texlive/2020/texmf-dist/tex/latex/amsmath/amsmath.sty
For additional information on amsmath, use the `?' option.
(/usr/local/texlive/2020/texmf-dist/tex/latex/amsmath/amstext.sty
(/usr/local/texlive/2020/texmf-dist/tex/latex/amsmath/amsgen.sty))
(/usr/local/texlive/2020/texmf-dist/tex/latex/amsmath/amsbsy.sty)
(/usr/local/texlive/2020/texmf-dist/tex/latex/amsmath/amsopn.sty))
(/usr/local/texlive/2020/texmf-dist/tex/latex/tools/array.sty)
(/usr/local/texlive/2020/texmf-dist/tex/latex/booktabs/booktabs.sty)
(/usr/local/texlive/2020/texmf-dist/tex/latex/

[1] (./epitope-table.aux) )
(see the transcript file for additional information)
Output written on epitope-table.pdf (1 page).
Transcript written on epitope-table.log.


In [120]:
# Open the compiled PDF via the shell. NOTE(review): `open` is presumably the
# macOS launcher; other platforms need a different command - confirm.
!open epitope-table.pdf

In [76]:
# Scratch check: a raw string literal keeps the backslash as a literal character.
s = r"\1"


In [81]:
# Scratch check: raw and regular literals concatenate; only the regular one
# turns "\n" into a newline.
print(s + "Hello\n" + r"\2")

\1Hello
\2


In [102]:
# Scratch input for the Counter(**kwargs) check in the next cell.
d = {"x": 1, "y": 2}

In [104]:
# Scratch check: Counter accepts keyword arguments as initial counts, and
# most_common() lists them from highest to lowest count.
Counter(**d).most_common()

[('y', 2), ('x', 1)]