# Combine metadata, then create necessary columns for ETL pipeline based on similar notebook for Adaptive data.

There are some unique columns in the final exported notebook that aren't present in the Adaptive version:

- `expect_a_read_count_column`: Allows some studies to be exempted from the read count column requirement. Defaults to `True` for studies that do not set it.
- `file_extension`: Allows custom file extensions for some studies. Defaults to `tsv`.

Also note that `Britanova` is a special-cased study name that triggers unique ETL behavior.

Make sure the study names match the data locations on disk.

In [1]:
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd
import os
from joblib import Parallel, delayed

In [2]:
from malid import config, helpers, etl, get_v_sequence, io, logger
from malid.datamodels import GeneLocus, healthy_label
from malid.sample_sequences import sample_sequences
from malid.trained_model_wrappers import ConvergentClusterClassifier

# get specimen filepaths from specimen metadata list

## covid samples

In [3]:
# Load the extra per-specimen metadata table for the Covid-19 BCR cohort.
covid_specimen_metadata_path = (
    config.paths.metadata_dir
    / "generated.external_cohorts.covid19_bcr.specimen_metadata_extra.tsv"
)
covid_specimens = pd.read_csv(covid_specimen_metadata_path, sep="\t")
covid_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak
0,A_d11,Kim_A,11,False
1,A_d17,Kim_A,17,True
2,A_d45,Kim_A,45,False
3,B_d10,Kim_B,10,False
4,B_d19,Kim_B,19,True
5,C_d6,Kim_C,6,False
6,C_d15,Kim_C,15,True
7,D_d6,Kim_D,6,False
8,D_d28,Kim_D,28,True
9,E_d23,Kim_E,23,True


In [4]:
covid_specimens.shape

(16, 4)

In [5]:
# Load the per-participant metadata table for the Covid-19 BCR cohort.
covid_participant_metadata_path = (
    config.paths.metadata_dir
    / "generated.external_cohorts.covid19_bcr.participant_metadata.tsv"
)
participant_df = pd.read_csv(covid_participant_metadata_path, sep="\t")
participant_df

Unnamed: 0,participant_label,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name
0,Kim_A,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
1,Kim_B,PRJNA648677,B,M,55.0,Asian,Case,Covid19,Kim
2,Kim_C,PRJNA648677,C,F,53.0,Asian,Case,Covid19,Kim
3,Kim_D,PRJNA648677,D,M,24.0,Asian,Case,Covid19,Kim
4,Kim_E,PRJNA648677,E,M,48.0,Asian,Case,Covid19,Kim
5,Kim_F,PRJNA648677,F,F,40.0,Asian,Case,Covid19,Kim
6,Kim_G,PRJNA648677,G,F,59.0,Asian,Case,Covid19,Kim


In [6]:
# Attach participant-level metadata to every specimen row.
# validate="m:1": many specimens may map to one participant, but each
# participant_label must appear at most once in participant_df.
covid_specimens = covid_specimens.merge(
    participant_df,
    on="participant_label",
    how="left",
    validate="m:1",
)
covid_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name
0,A_d11,Kim_A,11,False,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
1,A_d17,Kim_A,17,True,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
2,A_d45,Kim_A,45,False,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
3,B_d10,Kim_B,10,False,PRJNA648677,B,M,55.0,Asian,Case,Covid19,Kim
4,B_d19,Kim_B,19,True,PRJNA648677,B,M,55.0,Asian,Case,Covid19,Kim
5,C_d6,Kim_C,6,False,PRJNA648677,C,F,53.0,Asian,Case,Covid19,Kim
6,C_d15,Kim_C,15,True,PRJNA648677,C,F,53.0,Asian,Case,Covid19,Kim
7,D_d6,Kim_D,6,False,PRJNA648677,D,M,24.0,Asian,Case,Covid19,Kim
8,D_d28,Kim_D,28,True,PRJNA648677,D,M,24.0,Asian,Case,Covid19,Kim
9,E_d23,Kim_E,23,True,PRJNA648677,E,M,48.0,Asian,Case,Covid19,Kim


In [7]:
covid_specimens.shape

(16, 12)

In [8]:
# Build the subtype label as "<disease> - <study_name>", with a " (non-peak)"
# suffix appended for specimens that are not flagged as the peak timepoint.
peak_suffix = covid_specimens["is_peak"].replace({True: "", False: " (non-peak)"})
disease_and_study = covid_specimens["disease"] + " - " + covid_specimens["study_name"]
covid_specimens["disease_subtype"] = disease_and_study + peak_suffix

# Every row of this table is tagged with the BCR locus.
covid_specimens["gene_locus"] = GeneLocus.BCR.name
covid_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus
0,A_d11,Kim_A,11,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR
1,A_d17,Kim_A,17,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
2,A_d45,Kim_A,45,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR
3,B_d10,Kim_B,10,False,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR
4,B_d19,Kim_B,19,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
5,C_d6,Kim_C,6,False,PRJNA648677,C,F,53.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR
6,C_d15,Kim_C,15,True,PRJNA648677,C,F,53.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
7,D_d6,Kim_D,6,False,PRJNA648677,D,M,24.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR
8,D_d28,Kim_D,28,True,PRJNA648677,D,M,24.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
9,E_d23,Kim_E,23,True,PRJNA648677,E,M,48.0,Asian,Covid19 - Kim,Covid19,Kim,BCR


In [9]:
# Keep only peak-timepoint specimens from the Kim study.
# Take an explicit .copy() of the filtered rows: later cells add and overwrite
# columns on this frame, and writing into a filtered selection is ambiguous
# (pandas emits SettingWithCopyWarning and may or may not write to a view).
is_peak_kim_specimen = covid_specimens["is_peak"] & (
    covid_specimens["study_name"] == "Kim"
)
covid_specimens = covid_specimens[is_peak_kim_specimen].copy()
covid_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus
1,A_d17,Kim_A,17,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
4,B_d19,Kim_B,19,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
6,C_d15,Kim_C,15,True,PRJNA648677,C,F,53.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
8,D_d28,Kim_D,28,True,PRJNA648677,D,M,24.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
9,E_d23,Kim_E,23,True,PRJNA648677,E,M,48.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
12,F_d14,Kim_F,14,True,PRJNA648677,F,F,40.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
15,G_d22,Kim_G,22,True,PRJNA648677,G,F,59.0,Asian,Covid19 - Kim,Covid19,Kim,BCR


In [10]:
# Special column:
# No read counts in Kim iReceptor Covid
#
# Study name will be prepended, so remove prefix from participant_label.
# The pattern is anchored with "^" and regex=True is passed explicitly so that
# only a *leading* "Kim_" is stripped, independent of the pandas version's
# default for `regex` (an unanchored replace would also remove a "Kim_" that
# happened to occur in the middle of a label).
#
# .assign() returns a new frame instead of writing columns into what may be a
# filtered selection of an earlier frame.
covid_specimens = covid_specimens.assign(
    expect_a_read_count_column=False,
    participant_label=covid_specimens["participant_label"].str.replace(
        r"^Kim_", "", regex=True
    ),
)

covid_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus,expect_a_read_count_column
1,A_d17,A,17,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False
4,B_d19,B,19,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False
6,C_d15,C,15,True,PRJNA648677,C,F,53.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False
8,D_d28,D,28,True,PRJNA648677,D,M,24.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False
9,E_d23,E,23,True,PRJNA648677,E,M,48.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False
12,F_d14,F,14,True,PRJNA648677,F,F,40.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False
15,G_d22,G,22,True,PRJNA648677,G,F,59.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False


## healthy specimens

In [11]:
healthy_specimens = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.healthy_bcr.participant_metadata.tsv",
    sep="\t",
)

# process peak samples only
# Take an explicit .copy() of the filtered rows so that the column assignments
# below operate on an independent frame rather than on a selection of the
# frame that was read from disk (avoids SettingWithCopyWarning).
healthy_specimens = healthy_specimens[healthy_specimens["is_peak"] == True].copy()

# Subtype label is "<disease> - <study_name>", with a " (non-peak)" suffix for
# non-peak rows (none remain after the filter above).
healthy_specimens["disease_subtype"] = (
    healthy_specimens["disease"]
    + " - "
    + healthy_specimens["study_name"]
    + healthy_specimens["is_peak"].replace({True: "", False: " (non-peak)"})
)

healthy_specimens["gene_locus"] = GeneLocus.BCR.name

# Special columns:
# No read counts in Briney
healthy_specimens["expect_a_read_count_column"] = False
# Unusual file extension
healthy_specimens["file_extension"] = "csv"

healthy_specimens

Unnamed: 0,specimen_label,participant_label,study_name,disease,is_peak,timepoint,age,sex,ethnicity,ethnicity_condensed,disease_subtype,gene_locus,expect_a_read_count_column,file_extension
0,D103_1,D103,Briney,Healthy/Background,True,0,25,M,Caucasian,Caucasian,Healthy/Background - Briney,BCR,False,csv
1,326780_1,326780,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian,Healthy/Background - Briney,BCR,False,csv
2,326650_1,326650,Briney,Healthy/Background,True,0,18,F,Caucasian,Caucasian,Healthy/Background - Briney,BCR,False,csv
3,326737_1,326737,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian,Healthy/Background - Briney,BCR,False,csv
4,327059_1,327059,Briney,Healthy/Background,True,0,26,M,African American / Caucasian,,Healthy/Background - Briney,BCR,False,csv
5,326907_1,326907,Briney,Healthy/Background,True,0,29,F,African American,African,Healthy/Background - Briney,BCR,False,csv
6,316188_1,316188,Briney,Healthy/Background,True,0,30,F,African American,African,Healthy/Background - Briney,BCR,False,csv
7,326797_1,326797,Briney,Healthy/Background,True,0,21,F,Caucasian,Caucasian,Healthy/Background - Briney,BCR,False,csv


## healthy TCR specimens

In [12]:
# Healthy TCR cohort (Britanova): flag every row as a peak-timepoint TCR sample.
tcr_healthy_specimens = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.healthy_tcr_britanova.participant_metadata.tsv",
    sep="\t",
).assign(
    is_peak=True,
    gene_locus=GeneLocus.TCR.name,
    # Special column:
    # Unusual file extension
    file_extension="txt.gz",
)

tcr_healthy_specimens

Unnamed: 0,specimen_label,sex,age,ethnicity_condensed,participant_label,disease,study_name,disease_subtype,is_peak,gene_locus,file_extension
0,A3-i101,F,36,Caucasian,p1,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
1,A3-i102,F,43,Caucasian,p2,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
2,A3-i106,F,43,Caucasian,p3,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
3,A3-i107,F,39,Caucasian,p4,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
4,A3-i110,F,34,Caucasian,p5,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
5,A2-i138,F,74,Caucasian,p15,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
6,A2-i139,M,75,Caucasian,p16,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
7,A2-i140,F,73,Caucasian,p17,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
8,A2-i141,M,71,Caucasian,p18,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz
9,A4-i101,M,36,Caucasian,p19,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,txt.gz


## Covid TCR specimens

In [13]:
# Covid-19 TCR cohort (Shomuradova): flag every row as a peak-timepoint TCR
# sample and insert the study name into the existing disease_subtype labels.
tcr_covid_specimens = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.covid_tcr_shomuradova.participant_metadata.tsv",
    sep="\t",
).assign(
    is_peak=True,
    gene_locus=GeneLocus.TCR.name,
    disease_subtype=lambda loaded: loaded["disease_subtype"].str.replace(
        "Covid19 -", "Covid19 - Shomuradova -"
    ),
    # Special column:
    # No read counts in Shomuradova
    expect_a_read_count_column=False,
)

tcr_covid_specimens

Unnamed: 0,specimen_label,participant_label,disease,study_name,disease_subtype,age,sex,ethnicity_condensed,is_peak,gene_locus,expect_a_read_count_column
0,5f07aa8839579433171763b4,p1437,Covid19,Shomuradova,Covid19 - Shomuradova - mild,28,M,Caucasian,True,TCR,False
1,5f07aa8939579433171763b7,p1445,Covid19,Shomuradova,Covid19 - Shomuradova - mild,32,M,Caucasian,True,TCR,False
2,5f07aa8a39579433171763ba,p1473,Covid19,Shomuradova,Covid19 - Shomuradova - mild,31,F,Caucasian,True,TCR,False
3,5f07aa8c39579433171763c0,p1489,Covid19,Shomuradova,Covid19 - Shomuradova - mild,27,M,Caucasian,True,TCR,False
4,6047f702136a6d924982945c,p1434,Covid19,Shomuradova,Covid19 - Shomuradova - mild,28,M,Caucasian,True,TCR,False
5,6047f703136a6d924982945f,p1448,Covid19,Shomuradova,Covid19 - Shomuradova - moderate/severe,37,M,Caucasian,True,TCR,False
6,6047f704136a6d9249829462,p1449,Covid19,Shomuradova,Covid19 - Shomuradova - mild,34,F,Caucasian,True,TCR,False
7,6047f704136a6d9249829465,p1465,Covid19,Shomuradova,Covid19 - Shomuradova - moderate/severe,19,M,Caucasian,True,TCR,False
8,6047f706136a6d924982946b,p1480,Covid19,Shomuradova,Covid19 - Shomuradova - moderate/severe,29,M,Caucasian,True,TCR,False
9,6047f707136a6d924982946e,p1481,Covid19,Shomuradova,Covid19 - Shomuradova - moderate/severe,30,F,Caucasian,True,TCR,False


## All of the studies we've loaded so far:

In [14]:
# Collect the per-study metadata frames loaded above into one list.
# Additional study frames can be appended to this list before it is used.
all_studies = [
    covid_specimens,
    healthy_specimens,
    tcr_healthy_specimens,
    tcr_covid_specimens,
]

## Add new studies here:

In [15]:
# Example:

# df = pd.DataFrame(
#     {
#         "study_name": "newstudy",
#         "participant_label": ["patient1", "patient2", "patient3"],
#         "specimen_label": ["sample1", "sample2", "sample3"],
#         # Data will be loaded from data/external_cohorts/raw_data/newstudy/sample1.tsv, data/external_cohorts/raw_data/newstudy/sample2.tsv, and so on.
#         "gene_locus": "BCR",
#         "disease": ["Covid19", "Covid19", healthy_label],
#     }
# )
# display(df)
# all_studies.append(df)

# Combine

In [16]:
# Stack all study frames row-wise into a single table.
# Columns that a study does not provide are filled with NaN for its rows.
# Each frame keeps its own row index, so index values are not unique in the result.
dfs_external = pd.concat(
    all_studies,
    axis=0,
)
dfs_external

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus,expect_a_read_count_column,ethnicity,file_extension
1,A_d17,A,17.0,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,
4,B_d19,B,19.0,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,
6,C_d15,C,15.0,True,PRJNA648677,C,F,53.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,
8,D_d28,D,28.0,True,PRJNA648677,D,M,24.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,
9,E_d23,E,23.0,True,PRJNA648677,E,M,48.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,6047f70a136a6d9249829477,p1494,,True,,,F,36.0,Caucasian,Covid19 - Shomuradova - mild,Covid19,Shomuradova,TCR,False,,
13,6047f70b136a6d924982947a,p1495,,True,,,M,41.0,Caucasian,Covid19 - Shomuradova - moderate/severe,Covid19,Shomuradova,TCR,False,,
14,6047f70c136a6d924982947d,p1531,,True,,,M,23.0,Caucasian,Covid19 - Shomuradova - mild,Covid19,Shomuradova,TCR,False,,
15,6047f70c136a6d924982947f,p1545,,True,,,F,44.0,Caucasian,Covid19 - Shomuradova - moderate/severe,Covid19,Shomuradova,TCR,False,,


In [17]:
dfs_external["disease"].value_counts()

Healthy/Background    47
Covid19               24
Name: disease, dtype: int64

In [18]:
dfs_external["disease_subtype"].value_counts()

Healthy/Background - Britanova             39
Covid19 - Shomuradova - mild               10
Healthy/Background - Briney                 8
Covid19 - Kim                               7
Covid19 - Shomuradova - moderate/severe     7
Name: disease_subtype, dtype: int64

In [19]:
dfs_external["participant_label"]

1         A
4         B
6         C
8         D
9         E
      ...  
12    p1494
13    p1495
14    p1531
15    p1545
16    p1551
Name: participant_label, Length: 71, dtype: object

In [20]:
dfs_external["specimen_label"]

1                        A_d17
4                        B_d19
6                        C_d15
8                        D_d28
9                        E_d23
                ...           
12    6047f70a136a6d9249829477
13    6047f70b136a6d924982947a
14    6047f70c136a6d924982947d
15    6047f70c136a6d924982947f
16    6047f70d136a6d9249829482
Name: specimen_label, Length: 71, dtype: object

In [21]:
# Every row must declare which gene locus it belongs to.
assert dfs_external["gene_locus"].notna().all()

In [22]:
# Make sure special columns are present
# Defaults applied to studies that did not set a value:
# - expect_a_read_count_column: True (a read count column is required)
# - file_extension: "tsv"
special_column_defaults = {
    "expect_a_read_count_column": True,
    "file_extension": "tsv",
}
for special_col, default_value in special_column_defaults.items():
    if special_col not in dfs_external.columns:
        # No study declared this column at all: create it with the default
        # (indexing the missing column would otherwise raise KeyError).
        dfs_external[special_col] = default_value
    else:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on a column selection: that chained in-place
        # form does not update the parent frame under pandas copy-on-write.
        dfs_external[special_col] = dfs_external[special_col].fillna(default_value)
print(dfs_external["expect_a_read_count_column"].value_counts())
print(dfs_external["file_extension"].value_counts())

True     39
False    32
Name: expect_a_read_count_column, dtype: int64
txt.gz    39
tsv       24
csv        8
Name: file_extension, dtype: int64


In [23]:
# Columns:
# study_name
# participant_label
# specimen_label: globally unique, but may have several amplifications and replicates.
# amplification_label: globally unique, but may have several replicates.
# replicate_label: globally unique.
# sample_name: not globally unique, but should be unique within each study. used in the fasta header and igblast parsed "id" column.

In [24]:
# These identifier columns must be populated for every row.
for required_col in ["study_name", "participant_label", "specimen_label"]:
    assert not dfs_external[required_col].isna().any()

In [25]:
# To be consistent with boydlab columns, we'll add amplification_label, which here will always equal specimen_label.
# See sample_sequences.py for more details on how this gets used.
# replicate_label and sample_name are handled the same way: each of these
# columns falls back to specimen_label wherever a study did not provide it.
for derived_label_col in ["amplification_label", "replicate_label", "sample_name"]:
    if derived_label_col not in dfs_external.columns:
        # No study supplied this column: copy specimen_label wholesale.
        dfs_external[derived_label_col] = dfs_external["specimen_label"]
    else:
        # fill NA
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: that chained in-place form does not update the
        # parent frame under pandas copy-on-write.
        dfs_external[derived_label_col] = dfs_external[derived_label_col].fillna(
            dfs_external["specimen_label"]
        )

In [26]:
dfs_external

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus,expect_a_read_count_column,ethnicity,file_extension,amplification_label,replicate_label,sample_name
1,A_d17,A,17.0,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,A_d17,A_d17,A_d17
4,B_d19,B,19.0,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,B_d19,B_d19,B_d19
6,C_d15,C,15.0,True,PRJNA648677,C,F,53.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,C_d15,C_d15,C_d15
8,D_d28,D,28.0,True,PRJNA648677,D,M,24.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,D_d28,D_d28,D_d28
9,E_d23,E,23.0,True,PRJNA648677,E,M,48.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,E_d23,E_d23,E_d23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,6047f70a136a6d9249829477,p1494,,True,,,F,36.0,Caucasian,Covid19 - Shomuradova - mild,Covid19,Shomuradova,TCR,False,,tsv,6047f70a136a6d9249829477,6047f70a136a6d9249829477,6047f70a136a6d9249829477
13,6047f70b136a6d924982947a,p1495,,True,,,M,41.0,Caucasian,Covid19 - Shomuradova - moderate/severe,Covid19,Shomuradova,TCR,False,,tsv,6047f70b136a6d924982947a,6047f70b136a6d924982947a,6047f70b136a6d924982947a
14,6047f70c136a6d924982947d,p1531,,True,,,M,23.0,Caucasian,Covid19 - Shomuradova - mild,Covid19,Shomuradova,TCR,False,,tsv,6047f70c136a6d924982947d,6047f70c136a6d924982947d,6047f70c136a6d924982947d
15,6047f70c136a6d924982947f,p1545,,True,,,F,44.0,Caucasian,Covid19 - Shomuradova - moderate/severe,Covid19,Shomuradova,TCR,False,,tsv,6047f70c136a6d924982947f,6047f70c136a6d924982947f,6047f70c136a6d924982947f


In [27]:
# add study prefixes to make these labels unique to study:
# (sample_name is deliberately not in this list, so it keeps its unprefixed value.)
columns_needing_study_prefix = [
    "participant_label",
    "specimen_label",
    "amplification_label",
    "replicate_label",
]
study_prefix = dfs_external["study_name"] + "_"
for col in columns_needing_study_prefix:
    dfs_external[col] = study_prefix + dfs_external[col].astype(str)

In [28]:
dfs_external

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus,expect_a_read_count_column,ethnicity,file_extension,amplification_label,replicate_label,sample_name
1,Kim_A_d17,Kim_A,17.0,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,Kim_A_d17,Kim_A_d17,A_d17
4,Kim_B_d19,Kim_B,19.0,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,Kim_B_d19,Kim_B_d19,B_d19
6,Kim_C_d15,Kim_C,15.0,True,PRJNA648677,C,F,53.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,Kim_C_d15,Kim_C_d15,C_d15
8,Kim_D_d28,Kim_D,28.0,True,PRJNA648677,D,M,24.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,Kim_D_d28,Kim_D_d28,D_d28
9,Kim_E_d23,Kim_E,23.0,True,PRJNA648677,E,M,48.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,False,,tsv,Kim_E_d23,Kim_E_d23,E_d23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,Shomuradova_6047f70a136a6d9249829477,Shomuradova_p1494,,True,,,F,36.0,Caucasian,Covid19 - Shomuradova - mild,Covid19,Shomuradova,TCR,False,,tsv,Shomuradova_6047f70a136a6d9249829477,Shomuradova_6047f70a136a6d9249829477,6047f70a136a6d9249829477
13,Shomuradova_6047f70b136a6d924982947a,Shomuradova_p1495,,True,,,M,41.0,Caucasian,Covid19 - Shomuradova - moderate/severe,Covid19,Shomuradova,TCR,False,,tsv,Shomuradova_6047f70b136a6d924982947a,Shomuradova_6047f70b136a6d924982947a,6047f70b136a6d924982947a
14,Shomuradova_6047f70c136a6d924982947d,Shomuradova_p1531,,True,,,M,23.0,Caucasian,Covid19 - Shomuradova - mild,Covid19,Shomuradova,TCR,False,,tsv,Shomuradova_6047f70c136a6d924982947d,Shomuradova_6047f70c136a6d924982947d,6047f70c136a6d924982947d
15,Shomuradova_6047f70c136a6d924982947f,Shomuradova_p1545,,True,,,F,44.0,Caucasian,Covid19 - Shomuradova - moderate/severe,Covid19,Shomuradova,TCR,False,,tsv,Shomuradova_6047f70c136a6d924982947f,Shomuradova_6047f70c136a6d924982947f,6047f70c136a6d924982947f


In [29]:
# confirm one entry per replicate label per locus, at most!
# (specimens can have multiple replicates, e.g. cell type subsets that get merged.)
# (participants can have multiple specimens, e.g. separate time points)
rows_per_locus_and_replicate = dfs_external.groupby(
    ["gene_locus", "replicate_label"]
).size()
assert rows_per_locus_and_replicate.eq(1).all()

In [30]:
dfs_external["participant_label"].unique()

array(['Kim_A', 'Kim_B', 'Kim_C', 'Kim_D', 'Kim_E', 'Kim_F', 'Kim_G',
       'Briney_D103', 'Briney_326780', 'Briney_326650', 'Briney_326737',
       'Briney_327059', 'Briney_326907', 'Briney_316188', 'Briney_326797',
       'Britanova_p1', 'Britanova_p2', 'Britanova_p3', 'Britanova_p4',
       'Britanova_p5', 'Britanova_p15', 'Britanova_p16', 'Britanova_p17',
       'Britanova_p18', 'Britanova_p19', 'Britanova_p20', 'Britanova_p21',
       'Britanova_p22', 'Britanova_p23', 'Britanova_p26', 'Britanova_p27',
       'Britanova_p29', 'Britanova_p32', 'Britanova_p33', 'Britanova_p34',
       'Britanova_p35', 'Britanova_p36', 'Britanova_p37', 'Britanova_p38',
       'Britanova_p39', 'Britanova_p40', 'Britanova_p41', 'Britanova_p43',
       'Britanova_p52', 'Britanova_p53', 'Britanova_p54', 'Britanova_p56',
       'Britanova_p70', 'Britanova_p71', 'Britanova_p72', 'Britanova_p76',
       'Shomuradova_p1437', 'Shomuradova_p1445', 'Shomuradova_p1473',
       'Shomuradova_p1489', 'Shomuradova_p

In [31]:
# Label every row in the combined table with the same sequencing type, "cDNA".
dfs_external["sequencing_type"] = "cDNA"

In [32]:
# Number of distinct participants per (sequencing_type, gene_locus, disease),
# sorted ascending by that count.
dfs_external.groupby(["sequencing_type", "gene_locus", "disease"], observed=True)[
    "participant_label"
].nunique().to_frame().sort_values("participant_label")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,participant_label
sequencing_type,gene_locus,disease,Unnamed: 3_level_1
cDNA,BCR,Covid19,7
cDNA,BCR,Healthy/Background,8
cDNA,TCR,Covid19,17
cDNA,TCR,Healthy/Background,36


In [33]:
dfs_external["disease_subtype"].isna().any()

False

In [34]:
dfs_external["disease"].isna().any()

False

In [35]:
# Fall back to the plain disease label wherever disease_subtype is missing.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form does not update the parent frame under
# pandas copy-on-write.
dfs_external["disease_subtype"] = dfs_external["disease_subtype"].fillna(
    dfs_external["disease"]
)

In [36]:
dfs_external.isna().any()[dfs_external.isna().any()]

timepoint                  True
study_id                   True
patient_id_within_study    True
ethnicity_condensed        True
ethnicity                  True
dtype: bool

In [37]:
dfs_external["disease_subtype"].value_counts()

Healthy/Background - Britanova             39
Covid19 - Shomuradova - mild               10
Healthy/Background - Briney                 8
Covid19 - Kim                               7
Covid19 - Shomuradova - moderate/severe     7
Name: disease_subtype, dtype: int64

In [38]:
dfs_external[dfs_external["disease_subtype"] == healthy_label][
    "study_name"
].value_counts()

Series([], Name: study_name, dtype: int64)

In [39]:
# Number of distinct participants per (gene_locus, disease, disease_subtype).
dfs_external.groupby(["gene_locus", "disease", "disease_subtype"], observed=True)[
    "participant_label"
].nunique().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,participant_label
gene_locus,disease,disease_subtype,Unnamed: 3_level_1
BCR,Covid19,Covid19 - Kim,7
BCR,Healthy/Background,Healthy/Background - Briney,8
TCR,Covid19,Covid19 - Shomuradova - mild,10
TCR,Covid19,Covid19 - Shomuradova - moderate/severe,7
TCR,Healthy/Background,Healthy/Background - Britanova,36


In [40]:
# Same participant count as above, additionally broken down by study_name.
dfs_external.groupby(
    ["gene_locus", "disease", "disease_subtype", "study_name"], observed=True
)["participant_label"].nunique().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,participant_label
gene_locus,disease,disease_subtype,study_name,Unnamed: 4_level_1
BCR,Covid19,Covid19 - Kim,Kim,7
BCR,Healthy/Background,Healthy/Background - Briney,Briney,8
TCR,Covid19,Covid19 - Shomuradova - mild,Shomuradova,10
TCR,Covid19,Covid19 - Shomuradova - moderate/severe,Shomuradova,7
TCR,Healthy/Background,Healthy/Background - Britanova,Britanova,36


In [41]:
dfs_external.groupby("disease")["participant_label"].nunique().sort_values()

disease
Covid19               24
Healthy/Background    44
Name: participant_label, dtype: int64

In [42]:
# Review which replicates are getting combined into which specimens
# dfs_external[dfs_external['replicate_label'] != dfs_external['specimen_label']].groupby('specimen_label')['replicate_label'].unique().tolist()
dfs_external[dfs_external["replicate_label"] != dfs_external["specimen_label"]][
    ["specimen_label", "replicate_label"]
]

Unnamed: 0,specimen_label,replicate_label


In [43]:
# # Review which replicates are getting combined into which specimens
# replicates_being_merged_into_same_specimen = (
#     dfs_external[dfs_external["replicate_label"] != dfs_external["specimen_label"]]
#     .groupby("specimen_label")["replicate_label"]
#     .unique()
#     .apply(pd.Series)
# )
# # remove rows where single replicate (but just happened to have different label) - no merging happening
# replicates_being_merged_into_same_specimen = replicates_being_merged_into_same_specimen[
#     replicates_being_merged_into_same_specimen.notna().sum(axis=1) > 1
# ]
# replicates_being_merged_into_same_specimen

In [44]:
# all available columns, in case-insensitive sorted order
dfs_external.columns.sort_values(key=lambda idx: idx.str.lower())

Index(['age', 'amplification_label', 'disease', 'disease_subtype', 'ethnicity',
       'ethnicity_condensed', 'expect_a_read_count_column', 'file_extension',
       'gene_locus', 'is_peak', 'participant_label', 'patient_id_within_study',
       'replicate_label', 'sample_name', 'sequencing_type', 'sex',
       'specimen_label', 'study_id', 'study_name', 'timepoint'],
      dtype='object')

In [45]:
# specimen description can come in several fields:
specimen_description_fields = ["timepoint"]

# They are either all NA or one is set. Never have multiple of these set:
# (Compare the per-row count of non-null fields against 1 explicitly. Asserting
# on the bare max only checked truthiness, so it passed for any count >= 1 and
# failed when every row was all-NA, which is an allowed state.)
num_description_fields_set = dfs_external[specimen_description_fields].notna().sum(axis=1)
assert (num_description_fields_set <= 1).all()

# So we can just take first non-null value (if any) per row from these columns (https://stackoverflow.com/a/37938780/130164):
# Back-fill across columns, then read the first column. DataFrame.bfill is the
# non-deprecated spelling of fillna(method="bfill").
dfs_external["specimen_description"] = (
    dfs_external[specimen_description_fields].bfill(axis=1).iloc[:, 0]
)
dfs_external["specimen_description"]

1     17.0
4     19.0
6     15.0
8     28.0
9     23.0
      ... 
12     NaN
13     NaN
14     NaN
15     NaN
16     NaN
Name: specimen_description, Length: 71, dtype: float64

In [46]:
# Set has_BCR, has_TCR
# Each flag is True exactly where gene_locus names that locus.
dfs_external["has_BCR"] = dfs_external["gene_locus"] == "BCR"
dfs_external["has_TCR"] = dfs_external["gene_locus"] == "TCR"

# should always be one or the other:
# (exclusive-or: exactly one of the two flags is set on every row)
assert (dfs_external["has_BCR"] ^ dfs_external["has_TCR"]).all()

for locus_flag_col in ["has_BCR", "has_TCR"]:
    print(dfs_external[locus_flag_col].value_counts())

False    56
True     15
Name: has_BCR, dtype: int64
True     56
False    15
Name: has_TCR, dtype: int64


In [47]:
# Subset to these surviving columns
surviving_columns = [
    "study_name",
    "sample_name",
    "gene_locus",
    "disease",
    "sequencing_type",
    "disease_subtype",
    "participant_label",
    "specimen_label",
    "amplification_label",
    "replicate_label",
    "sex",
    "age",
    "ethnicity",
    "ethnicity_condensed",
    "specimen_description",
    "has_BCR",
    "has_TCR",
    # specials:
    "expect_a_read_count_column",
    "file_extension",
]
# Copy so the trimmed table is independent of the wider frame it was cut from.
dfs_external = dfs_external[surviving_columns].copy()
dfs_external

Unnamed: 0,study_name,sample_name,gene_locus,disease,sequencing_type,disease_subtype,participant_label,specimen_label,amplification_label,replicate_label,sex,age,ethnicity,ethnicity_condensed,specimen_description,has_BCR,has_TCR,expect_a_read_count_column,file_extension
1,Kim,A_d17,BCR,Covid19,cDNA,Covid19 - Kim,Kim_A,Kim_A_d17,Kim_A_d17,Kim_A_d17,M,55.0,,Asian,17.0,True,False,False,tsv
4,Kim,B_d19,BCR,Covid19,cDNA,Covid19 - Kim,Kim_B,Kim_B_d19,Kim_B_d19,Kim_B_d19,M,55.0,,Asian,19.0,True,False,False,tsv
6,Kim,C_d15,BCR,Covid19,cDNA,Covid19 - Kim,Kim_C,Kim_C_d15,Kim_C_d15,Kim_C_d15,F,53.0,,Asian,15.0,True,False,False,tsv
8,Kim,D_d28,BCR,Covid19,cDNA,Covid19 - Kim,Kim_D,Kim_D_d28,Kim_D_d28,Kim_D_d28,M,24.0,,Asian,28.0,True,False,False,tsv
9,Kim,E_d23,BCR,Covid19,cDNA,Covid19 - Kim,Kim_E,Kim_E_d23,Kim_E_d23,Kim_E_d23,M,48.0,,Asian,23.0,True,False,False,tsv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,Shomuradova,6047f70a136a6d9249829477,TCR,Covid19,cDNA,Covid19 - Shomuradova - mild,Shomuradova_p1494,Shomuradova_6047f70a136a6d9249829477,Shomuradova_6047f70a136a6d9249829477,Shomuradova_6047f70a136a6d9249829477,F,36.0,,Caucasian,,False,True,False,tsv
13,Shomuradova,6047f70b136a6d924982947a,TCR,Covid19,cDNA,Covid19 - Shomuradova - moderate/severe,Shomuradova_p1495,Shomuradova_6047f70b136a6d924982947a,Shomuradova_6047f70b136a6d924982947a,Shomuradova_6047f70b136a6d924982947a,M,41.0,,Caucasian,,False,True,False,tsv
14,Shomuradova,6047f70c136a6d924982947d,TCR,Covid19,cDNA,Covid19 - Shomuradova - mild,Shomuradova_p1531,Shomuradova_6047f70c136a6d924982947d,Shomuradova_6047f70c136a6d924982947d,Shomuradova_6047f70c136a6d924982947d,M,23.0,,Caucasian,,False,True,False,tsv
15,Shomuradova,6047f70c136a6d924982947f,TCR,Covid19,cDNA,Covid19 - Shomuradova - moderate/severe,Shomuradova_p1545,Shomuradova_6047f70c136a6d924982947f,Shomuradova_6047f70c136a6d924982947f,Shomuradova_6047f70c136a6d924982947f,F,44.0,,Caucasian,,False,True,False,tsv


In [48]:
# Alias, not a copy: all_specimens and dfs_external refer to the same DataFrame
# object, so columns added through one name are visible through the other.
all_specimens = dfs_external

# Make metadata columns consistent with standard Boydlab pipeline

In [49]:
all_specimens["sex"].value_counts()

M    36
F    35
Name: sex, dtype: int64

In [50]:
all_specimens["ethnicity"].isna().value_counts()

True     63
False     8
Name: ethnicity, dtype: int64

In [51]:
# Here's who is missing ethnicity:
all_specimens[all_specimens["ethnicity"].isna()]["disease"].value_counts()

Healthy/Background    39
Covid19               24
Name: disease, dtype: int64

In [52]:
# Here's who is missing ethnicity:
all_specimens[all_specimens["ethnicity"].isna()]["study_name"].value_counts()

Britanova      39
Shomuradova    17
Kim             7
Name: study_name, dtype: int64

In [53]:
all_specimens["ethnicity_condensed"].value_counts()

Caucasian    61
Asian         7
African       2
Name: ethnicity_condensed, dtype: int64

In [54]:
all_specimens["ethnicity_condensed"].isna().value_counts()

False    70
True      1
Name: ethnicity_condensed, dtype: int64

In [55]:
# Here's who is missing ethnicity_condensed:
all_specimens[all_specimens["ethnicity_condensed"].isna()]["disease"].value_counts()

Healthy/Background    1
Name: disease, dtype: int64

In [56]:
# Here's who is missing ethnicity_condensed:
# *Important*: If we see entries here that can be resolved, update the ethnicity_condensed rules above.
all_specimens[all_specimens["ethnicity_condensed"].isna()]["ethnicity"].value_counts()

African American / Caucasian    1
Name: ethnicity, dtype: int64

In [57]:
# Here's who is missing ethnicity_condensed:
all_specimens[all_specimens["ethnicity_condensed"].isna()]["study_name"].value_counts()

Briney    1
Name: study_name, dtype: int64

In [58]:
# Versus total counts per disease across all rows (not only those missing demographics)
all_specimens["disease"].value_counts()

Healthy/Background    47
Covid19               24
Name: disease, dtype: int64

In [59]:
# Row counts for each (ethnicity_condensed, disease) combination.
# Rows whose ethnicity_condensed is missing are not counted here, since groupby
# drops NaN group keys by default.
all_specimens.groupby(["ethnicity_condensed", "disease"]).size()

ethnicity_condensed  disease           
African              Healthy/Background     2
Asian                Covid19                7
Caucasian            Covid19               17
                     Healthy/Background    44
dtype: int64

In [60]:
# Ages with missing values removed; the reported Length is the number of rows that have an age.
all_specimens["age"].dropna()

1     55.0
4     55.0
6     53.0
8     24.0
9     48.0
      ... 
12    36.0
13    41.0
14    23.0
15    44.0
16    24.0
Name: age, Length: 71, dtype: float64

In [61]:
# We set an age_group column as well (in the next cell), just as in assemble_etl_metadata.
# First, summarize the age distribution that the bins need to cover.
all_specimens["age"].describe()

count    71.000000
mean     40.985915
std      15.215117
min      18.000000
25%      28.500000
50%      39.000000
75%      51.000000
max      75.000000
Name: age, dtype: float64

In [62]:
# Bin ages into labelled groups.
# right=False makes each bin include its lower edge and exclude its upper edge,
# e.g. age 20 lands in "20-30" rather than "<20".
# Ages outside [0, 100) match no bin and get a missing age_group.
age_group_edges = [0, 20, 30, 40, 50, 60, 70, 80, 100]
age_group_names = ["<20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80+"]
all_specimens["age_group"] = pd.cut(
    all_specimens["age"], bins=age_group_edges, labels=age_group_names, right=False
)
all_specimens["age_group"].value_counts()

20-30    20
30-40    15
50-60    13
40-50    10
60-70     7
70-80     4
<20       2
80+       0
Name: age_group, dtype: int64

In [63]:
# List the age_group category labels, in order.
all_specimens["age_group"].cat.categories

Index(['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'], dtype='object')

In [64]:
# How many rows have a missing (True) vs. present (False) age?
all_specimens["age"].isna().value_counts()

False    71
Name: age, dtype: int64

In [65]:
# How many rows have a missing (True) vs. present (False) age_group?
# Compare against the missing-age counts from the previous cell.
all_specimens["age_group"].isna().value_counts()

False    71
Name: age_group, dtype: int64

In [66]:
# Sanity-check the binning: print the youngest and oldest age found in each age_group.
# age_group is categorical, so pass observed=False explicitly. This keeps empty
# categories (e.g. "80+", printed as nan nan) in the listing and avoids relying on
# the implicit `observed` default, which newer pandas versions deprecate and change.
for age_group, grp in all_specimens.groupby("age_group", observed=False):
    print(age_group, grp["age"].min(), grp["age"].max())

<20 18.0 19.0
20-30 20.0 29.0
30-40 30.0 39.0
40-50 40.0 48.0
50-60 50.0 59.0
60-70 61.0 66.0
70-80 71.0 75.0
80+ nan nan


In [67]:
# Just as in assemble_etl_metadata:
# Blank out the "age_group" column for extreme ages, where the sample size is small.

# The specimens themselves are kept in the table;
# a NaN age_group only disables their use for demographics-controlling models.

# Snapshot (row count, number of missing age_group entries) before modifying anything.
orig_shapes = (len(all_specimens), all_specimens["age_group"].isna().sum())
mask = all_specimens["age_group"].isin(["80+"])
if mask.any():
    all_specimens.loc[mask, "age_group"] = np.nan
    new_shapes = (len(all_specimens), all_specimens["age_group"].isna().sum())

    # sanity checks:
    newly_nulled = new_shapes[1] - orig_shapes[1]
    # - the row count is untouched, i.e. no specimens were dropped
    assert new_shapes[0] == orig_shapes[0]
    # - the number of missing age_group entries went up
    assert newly_nulled > 0
    # - and it went up by exactly the number of masked rows
    assert newly_nulled == mask.sum()

In [68]:
# # Fillna for cohorts that are single-locus
# if "specimen_label_by_locus" not in all_specimens:
#     # in case we had no BCR+TCR combined cohorts that set this field already
#     all_specimens["specimen_label_by_locus"] = all_specimens["specimen_label"]
# else:
#     all_specimens["specimen_label_by_locus"].fillna(
#         all_specimens["specimen_label"], inplace=True
#     )

In [69]:
# # make sure input fnames exist
# assert all_specimens["fname"].apply(os.path.exists).all()

In [70]:
# (number of rows, number of columns) after adding age_group.
all_specimens.shape

(71, 20)

In [71]:
# # confirm all specimen labels are unique within each locus (may have one BCR and one TCR line per specimen)
# # TODO: in the future, allow for replicates of each specimen
# assert not all_specimens["specimen_label_by_locus"].duplicated().any()
# for locus, grp in all_specimens.groupby("gene_locus"):
#     assert not grp["specimen_label"].duplicated().any()

In [72]:
# # Which specimens are in multiple loci?
# all_specimens[all_specimens["specimen_label"].duplicated(keep=False)]

In [73]:
# Row counts per study.
all_specimens["study_name"].value_counts()

Britanova      39
Shomuradova    17
Briney          8
Kim             7
Name: study_name, dtype: int64

In [74]:
# Row counts per disease.
all_specimens["disease"].value_counts()

Healthy/Background    47
Covid19               24
Name: disease, dtype: int64

In [75]:
# Row counts per gene locus.
all_specimens["gene_locus"].value_counts()

TCR    56
BCR    15
Name: gene_locus, dtype: int64

In [76]:
# Row counts per disease subtype.
all_specimens["disease_subtype"].value_counts()

Healthy/Background - Britanova             39
Covid19 - Shomuradova - mild               10
Healthy/Background - Briney                 8
Covid19 - Kim                               7
Covid19 - Shomuradova - moderate/severe     7
Name: disease_subtype, dtype: int64

In [77]:
# For each disease, list how many rows carry each disease_subtype.
# Each group prints as: disease name, subtype counts, then a blank separator line.
for key, grp in all_specimens.groupby("disease"):
    print(key, grp["disease_subtype"].value_counts(), "", sep="\n")

Covid19
Covid19 - Shomuradova - mild               10
Covid19 - Kim                               7
Covid19 - Shomuradova - moderate/severe     7
Name: disease_subtype, dtype: int64

Healthy/Background
Healthy/Background - Britanova    39
Healthy/Background - Briney        8
Name: disease_subtype, dtype: int64



In [78]:
# For each disease, list how many rows carry each specimen_description value.
# Each group prints as: disease name, value counts, then a blank separator line.
for key, grp in all_specimens.groupby("disease"):
    print(key, grp["specimen_description"].value_counts(), "", sep="\n")

Covid19
17.0    1
19.0    1
15.0    1
28.0    1
23.0    1
14.0    1
22.0    1
Name: specimen_description, dtype: int64

Healthy/Background
0.0    8
Name: specimen_description, dtype: int64



In [79]:
# Final overview of each demographics column: the value distribution, followed by
# how many entries are missing (True) vs. present (False), then a blank separator line.
for demographics_column in ("age", "age_group", "sex", "ethnicity_condensed"):
    column_values = all_specimens[demographics_column]
    print(
        demographics_column,
        column_values.value_counts(),
        column_values.isna().value_counts(),
        "",
        sep="\n",
    )

age
51.0    6
36.0    4
29.0    4
43.0    4
61.0    4
55.0    3
24.0    3
30.0    3
25.0    3
39.0    2
50.0    2
37.0    2
27.0    2
28.0    2
34.0    2
21.0    2
23.0    1
32.0    1
22.0    1
47.0    1
31.0    1
19.0    1
41.0    1
20.0    1
46.0    1
71.0    1
62.0    1
64.0    1
66.0    1
53.0    1
73.0    1
75.0    1
74.0    1
26.0    1
18.0    1
59.0    1
40.0    1
48.0    1
44.0    1
Name: age, dtype: int64
False    71
Name: age, dtype: int64

age_group
20-30    20
30-40    15
50-60    13
40-50    10
60-70     7
70-80     4
<20       2
80+       0
Name: age_group, dtype: int64
False    71
Name: age_group, dtype: int64

sex
M    36
F    35
Name: sex, dtype: int64
False    71
Name: sex, dtype: int64

ethnicity_condensed
Caucasian    61
Asian         7
African       2
Name: ethnicity_condensed, dtype: int64
False    70
True      1
Name: ethnicity_condensed, dtype: int64



In [80]:
# Write the combined specimen metadata to a tab-separated file in the metadata directory.
# The raw "ethnicity" column is left out of the export; drop() returns a new frame,
# so all_specimens itself still has that column afterwards.
# index=False omits the row index from the file. It replaces index=None, which only
# worked because None is falsy; the parameter is documented as a bool.
all_specimens.drop(columns=["ethnicity"]).to_csv(
    config.paths.metadata_dir / "generated.external_cohorts.tsv",
    sep="\t",
    index=False,
)

In [81]:
# Display the final table. It still includes "ethnicity", which was dropped only from the exported TSV.
all_specimens

Unnamed: 0,study_name,sample_name,gene_locus,disease,sequencing_type,disease_subtype,participant_label,specimen_label,amplification_label,replicate_label,sex,age,ethnicity,ethnicity_condensed,specimen_description,has_BCR,has_TCR,expect_a_read_count_column,file_extension,age_group
1,Kim,A_d17,BCR,Covid19,cDNA,Covid19 - Kim,Kim_A,Kim_A_d17,Kim_A_d17,Kim_A_d17,M,55.0,,Asian,17.0,True,False,False,tsv,50-60
4,Kim,B_d19,BCR,Covid19,cDNA,Covid19 - Kim,Kim_B,Kim_B_d19,Kim_B_d19,Kim_B_d19,M,55.0,,Asian,19.0,True,False,False,tsv,50-60
6,Kim,C_d15,BCR,Covid19,cDNA,Covid19 - Kim,Kim_C,Kim_C_d15,Kim_C_d15,Kim_C_d15,F,53.0,,Asian,15.0,True,False,False,tsv,50-60
8,Kim,D_d28,BCR,Covid19,cDNA,Covid19 - Kim,Kim_D,Kim_D_d28,Kim_D_d28,Kim_D_d28,M,24.0,,Asian,28.0,True,False,False,tsv,20-30
9,Kim,E_d23,BCR,Covid19,cDNA,Covid19 - Kim,Kim_E,Kim_E_d23,Kim_E_d23,Kim_E_d23,M,48.0,,Asian,23.0,True,False,False,tsv,40-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,Shomuradova,6047f70a136a6d9249829477,TCR,Covid19,cDNA,Covid19 - Shomuradova - mild,Shomuradova_p1494,Shomuradova_6047f70a136a6d9249829477,Shomuradova_6047f70a136a6d9249829477,Shomuradova_6047f70a136a6d9249829477,F,36.0,,Caucasian,,False,True,False,tsv,30-40
13,Shomuradova,6047f70b136a6d924982947a,TCR,Covid19,cDNA,Covid19 - Shomuradova - moderate/severe,Shomuradova_p1495,Shomuradova_6047f70b136a6d924982947a,Shomuradova_6047f70b136a6d924982947a,Shomuradova_6047f70b136a6d924982947a,M,41.0,,Caucasian,,False,True,False,tsv,40-50
14,Shomuradova,6047f70c136a6d924982947d,TCR,Covid19,cDNA,Covid19 - Shomuradova - mild,Shomuradova_p1531,Shomuradova_6047f70c136a6d924982947d,Shomuradova_6047f70c136a6d924982947d,Shomuradova_6047f70c136a6d924982947d,M,23.0,,Caucasian,,False,True,False,tsv,20-30
15,Shomuradova,6047f70c136a6d924982947f,TCR,Covid19,cDNA,Covid19 - Shomuradova - moderate/severe,Shomuradova_p1545,Shomuradova_6047f70c136a6d924982947f,Shomuradova_6047f70c136a6d924982947f,Shomuradova_6047f70c136a6d924982947f,F,44.0,,Caucasian,,False,True,False,tsv,40-50


In [82]:
# Final row counts per study. Per the notebook introduction, these study names
# must match the data locations on disk.
all_specimens["study_name"].value_counts()

Britanova      39
Shomuradova    17
Briney          8
Kim             7
Name: study_name, dtype: int64