# Setup

Edit the file paths and/or options in the first chunk if needed.

Note that to use the REDCap functions, you'll need to create a file in this format:

{
	"Cho Lab Single Cell Sample Metadatabase": {
		"url": "https://redcap.mountsinai.org/redcap/api/",
		"token": "<TOKEN HERE (see API tab on RC)>",
		"content": "project",
		"format": "json",
		"returnFormat": "json"
	}

}


## Imports & Options

In [618]:
import requests
import json
import os
import re
import pandas as pd
import numpy as np
from functions import (get_google_sheet, get_redcap_metadata, try_float,
                       investigate_fields, extract_categories, search_fields)

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 500)

# path_file = None
path_file = "sc-metadata-cleanup.csv"
path_config = os.path.join(os.path.expanduser("~"), ".ssh/config_redcap.json")
path_secret = os.path.join(os.path.expanduser("~"),
                           ".ssh/client_secret_google_sheets.json")
dd_dict_csv = os.path.join(
    os.path.expanduser("~"), str("Downloads/ChoLabSingleCellSampleMetadata_"
                                 "DataDictionary_2023-12-01.csv"))
rc_project = "Cho Lab Single Cell Sample Metadatabase"
redcap_event_name = "forms_arm_1"
id_sheet = "1PV2vPHjBWxj3Hn0od1B78q98Q6yoyEbj6ku9obzYr2I"
unique_id = "lib_id"
rm_cols_collapsed = ["project_owner_id"]  # collapsing into "POID_1,POID_2", etc.
cols_should_be_unique = ["standard_sample_id"]
cols_subject = ["record_id", "record_id1", "grid", "patient_id"]
# cols_subject = ["record_id", "grid"]
# fields_cat = ["disease", "disease_status", "organism", "animal_line", 
#               "x_chem_version_sc", "inflam_status",
#               "tissue_origin", "index_kit",  "instrument"]
missing_ok = []  # TODO: add missing_ok columns

## Data

In [619]:
# Load Google Sheets Database
key_cols = list(pd.unique(cols_should_be_unique + cols_subject))
if path_file is None:
    dff = get_google_sheet(id_sheet, path_secret)
    if any(dff.duplicated(unique_id)):
        raise ValueError(f"{unique_id} is repeated and can""t be used as the true unique id column.")
    else:
        if unique_id in cols_should_be_unique:
            cols_should_be_unique.remove(unique_id)
else:
    dff = pd.read_csv(path_file)
dff = dff.set_index(unique_id)
print(dff)

# Load REDCap Configuration File
with open(path_config, "r") as json_file:
    config = json.load(json_file)

              record_id         record_id1   organism               project  \
lib_id                                                                        
HH0001       HH_Hu_MO13  Sabic_Project_020      Human      CRISPR Screening   
J00002        JM_Ze_Lck         Sabic_7719  Zebrafish  Zebrafish Felix_Josh   
J00003        JM_Ze_Lck         Sabic_7719  Zebrafish  Zebrafish Felix_Josh   
J00004        JM_Ze_Lck         Sabic_7719  Zebrafish  Zebrafish Felix_Josh   
J00005        JM_Ze_Lck         Sabic_7719  Zebrafish  Zebrafish Felix_Josh   
LC0001        FC_Hu_sCD  Sabic_Project_022      Human                  PBMC   
LC0002        FC_Hu_sCD  Sabic_Project_021      Human                  PBMC   
LC0003        FC_Hu_sUC  Sabic_Project_022      Human                  PBMC   
LC0004        FC_Hu_sUC  Sabic_Project_021      Human                  PBMC   
LC0005         FC_Hu_sH  Sabic_Project_022      Human                  PBMC   
LC0006         FC_Hu_sH  Sabic_Project_021      Huma

## REDCap Information

In [620]:
pd.set_option("display.max_colwidth", 50)
project = "Cho Lab Single Cell Sample Metadatabase"
api_url, token = config[project]["url"], config[project]["token"]
drc = get_redcap_metadata(project, api_url, token)
print(drc.keys())
data_dict = pd.concat([pd.Series(x, name=x["field_name"]) 
                       for x in drc["data_dictionary"]], axis=1).T
print(data_dict.head())
pd.set_option("display.max_colwidth", 500)

HTTP Status: 200
HTTP Status: 200
dict_keys(['field_names', 'record_ids', 'data_dictionary'])
          field_name                          form_name section_header  \
record_id  record_id  sample_information_3_gex_multiome                  
lib_id_1    lib_id_1  sample_information_3_gex_multiome                  
lib_id_2    lib_id_2  sample_information_3_gex_multiome                  
lib_id_3    lib_id_3  sample_information_3_gex_multiome                  
lib_id_4    lib_id_4  sample_information_3_gex_multiome                  

          field_type                                        field_label  \
record_id       text                                          Record ID   
lib_id_1        text  ID UNIQUE TO THE SAMPLE AND MEASUREMENT. This ...   
lib_id_2        text                                                      
lib_id_3        text                                                      
lib_id_4        text                                                      

          

In [621]:
fields_cat = list(data_dict[data_dict.field_type.isin(["radio", "dropdown"])].index)
fields_cat = list(set(pd.unique([re.sub("_[0-9]+", "", x) 
                                 for x in fields_cat])).intersection(dff.columns))
for x in fields_cat:
    investigate_fields(x, data_rc=data_dict, data_meta=dff, pattern=False)

False
project




project



field_name                                  project
form_name         sample_information_3_gex_multiome
section_header                                     
field_type                                 dropdown
field_label                                 Project
Name: project, dtype: object

********************************************************************************
Branching Logic:

 

********************************************************************************
Categories:

 1, Ileal CD | 2, UC | 3, Zebrafish | 4, Mouse Organoid | 5, PBMC | 6, Perianal CD | 7, Ileal CD/PTGER4 | 8, PSC-IBD | 9, CRISPR Screening of Monocytes | 10, Ileal CD gp130 | 888, Other, please specify

********************************************************************************
Unique Metadatabase Values:

 ['CRISPR Screening' 'Zebrafish Felix_Josh' 'PBMC' 'Perianal CD'
 'Zebrafish Shikha' 'Ileal CD gp130' 'UC_ZC_sc' 'Mouse Organoid' 'UC'
 'Ileal CD' 'Ileal CD/PTGER4']
False


# Database Information

## Repeated Measures

### Improperly Repeated Values

In [622]:
dups = pd.concat([dff.duplicated(subset=x, keep=False) for x in cols_should_be_unique], 
                 keys=cols_should_be_unique, axis=1)  # detect improper duplicates
dups = dups[dups.T.any().T]  # only keep rows where at least 1 improper duplicate
dups = dups.apply(lambda x: x.replace(True, x.name).replace(False, np.nan)).apply(
    lambda y: str(y.dropna().iloc[0]) if any(pd.isnull(y)) else ", ".join(list(y)), 
    axis=1)  # series with text saying which columns duplicated for each row
dff_dup = dff.loc[dups.index].join(dups.to_frame("duplicates"))
print(dff_dup[["duplicates"] + key_cols])

                   duplicates standard_sample_id     record_id  \
lib_id                                                           
LC0001     standard_sample_id        FC_Hu_sCD_1     FC_Hu_sCD   
LC0002     standard_sample_id        FC_Hu_sCD_1     FC_Hu_sCD   
LC0003     standard_sample_id        FC_Hu_sUC_1     FC_Hu_sUC   
LC0004     standard_sample_id        FC_Hu_sUC_1     FC_Hu_sUC   
LC0005     standard_sample_id         FC_Hu_sH_1      FC_Hu_sH   
LC0006     standard_sample_id         FC_Hu_sH_1      FC_Hu_sH   
RL0001     standard_sample_id      RL_Hu_AA2_Inf     RL_Hu_AA2   
RL0001_re  standard_sample_id      RL_Hu_AA2_Inf     RL_Hu_AA2   
RL0002     standard_sample_id      RL_Hu_AA2_Non     RL_Hu_AA2   
RL0002_re  standard_sample_id      RL_Hu_AA2_Non     RL_Hu_AA2   
RL0005     standard_sample_id      RL_Hu_AA3_Inf     RL_Hu_AA3   
RL0006     standard_sample_id      RL_Hu_AA3_Non     RL_Hu_AA3   
RL0007     standard_sample_id      RL_Hu_AA3_Inf     RL_Hu_AA3   
RL0008    

#### To Expand/Already Expanded REDCap Fields

In [623]:
expansion = dff.groupby(cols_subject[0]).apply(
    lambda x: list(pd.Series([c if len(pd.unique(x[c])) > 1 else np.nan 
                         for c in dff.columns]).dropna())).apply(
                             lambda y: np.nan if len(y) == 0 else y).dropna()
cols_to_expand = list(pd.unique(expansion.explode()))
for x in rm_cols_collapsed:
    if x in cols_to_expand:
        cols_to_expand.remove(x)
already_expanded = dict(zip(cols_to_expand, [] * len(cols_to_expand)))
for i in cols_to_expand:
    already_expanded[i] = list(pd.Series([
        d if i in d and i != d else np.nan for d in data_dict.index]).dropna())
_ = [print(f"{k}: {already_expanded[k]}") for k in already_expanded]
already_expanded_yes = list(pd.Series([k if len(
    already_expanded[k]) > 0 else np.nan for k in already_expanded]).dropna())
# print(expansion)
cols_need = list(set(cols_to_expand).difference(
    set(already_expanded_yes)).intersection(
        data_dict.index))  # expand to accommodate repeated measures
print(f"\n\n\n{'=' * 80}\n\nCreate RM columns in REDCap for: {', '.join(cols_need)}")

standard_sample_id: ['standard_sample_id_1', 'standard_sample_id_2', 'standard_sample_id_3', 'standard_sample_id_4', 'standard_sample_id_5', 'standard_sample_id_6', 'standard_sample_id_7', 'standard_sample_id_8']
inflam_status: ['inflam_status_1', 'other_inflam_status_1', 'inflam_status_chronicity_1', 'inflam_status_2', 'other_inflam_status_2', 'inflam_status_chronicity_2', 'inflam_status_3', 'other_inflam_status_3', 'inflam_status_chronicity_3', 'inflam_status_4', 'other_inflam_status_4', 'inflam_status_5', 'other_inflam_status_5', 'inflam_status_6', 'other_inflam_status_6', 'inflam_status_7', 'other_inflam_status_7', 'inflam_status_8', 'other_inflam_status_8']
no_live_cells: ['no_live_cells_1', 'no_live_cells_2', 'no_live_cells_3', 'no_live_cells_4', 'no_live_cells_5', 'no_live_cells_6', 'no_live_cells_7', 'no_live_cells_8']
cell_viability_percentage: ['cell_viability_percentage_1', 'cell_viability_percentage_2', 'cell_viability_percentage_3', 'cell_viability_percentage_4', 'cell_via

In [624]:
if "patient_id" in cols_need:
    cols_need.remove("patient_id")

#### (In)Varying Values

In [625]:
unique_val_ct = dff.groupby(cols_subject[0]).apply(
    lambda x: pd.Series([np.nan]) if len(x) == 1 else pd.Series(
        [len(x[c].unique())for c in already_expanded], 
        index=cols_to_expand)).dropna()
invarying_vals = unique_val_ct[unique_val_ct <= 1].dropna().groupby(cols_subject[0]).apply(
    lambda x: ", ".join(list([str(i) for i in x.reset_index(0).index.values])))
varying_vals = unique_val_ct[unique_val_ct > 1].dropna().groupby(cols_subject[0]).apply(
    lambda x: ", ".join(list([str(i) for i in x.reset_index(0).index.values])))
print(f"\n\n{'=' * 80}\n\nInvarying\n\n{'=' * 80}\n{invarying_vals}")
print(f"\n\n{'=' * 80}\n\nVarying\n\n{'=' * 80}\n{varying_vals}")




Invarying

record_id
FC_Hu_IL1     no_live_cells, cell_viability_percentage, index_kit, record_id1, type_of_experiment, no_live_nuclei, no_nuclei, date_sent, instrument, ref_genome, tissue_origin, data_link, sc_process_date, inflam_treatment, inflam_treatment_type, pre_amp_date, targ_cell, x_chem_version_sc, repeat_data_release, repeat_seq_platform, inflam_treatment_concentration_DSS, inflam_treatment_quantity_DSS, inflam_treatment_quantity_BZA
FC_Hu_IL2                                                          record_id1, type_of_experiment, no_live_nuclei, no_nuclei, date_sent, instrument, ref_genome, tissue_origin, data_link, sc_process_date, inflam_treatment, inflam_treatment_type, pre_amp_date, targ_cell, x_chem_version_sc, repeat_data_release, repeat_seq_platform, inflam_treatment_concentration_DSS, inflam_treatment_quantity_DSS, inflam_treatment_quantity_BZA
FC_Hu_IL3                                                          record_id1, type_of_experiment, no_live_nuclei, no_nu

# New Data Dictionary

## Repeated Measures Fields

### Create

In [626]:
data_dict_old = data_dict.copy()
max_rm = max([max([try_float(i.split("_")[-1]) for i in already_expanded[x]]) 
              if already_expanded[x] else np.nan 
              for x in already_expanded])  # maximum # repeated measures
for x in cols_need:
    for i in range(1, int(max_rm) + 1):
        data_dict.loc[x + f"_{i}", ] = data_dict.loc[x, ]
    data_dict = data_dict.drop(x)
# data_dict

could not convert string to float: 'scm'


### Move

In [627]:
# TODO

# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO
# TODOTODOTODOTODOTODOTODOTODOTODOTODO

## Categories

In [628]:
cat_fields = list(set(pd.Series([f if data_dict.loc[f].field_type in [
    "radio", "dropdown"] else np.nan for f in data_dict.index]).dropna(
        )).intersection(set(list(dff.columns))))  # categorical fields
cat_dict = dict(zip(cat_fields, [
    dict(pd.DataFrame([v.split(", ") for v in data_dict.loc[
        f].loc["select_choices_or_calculations"].split(" | ")]).set_index(
            1)[0]) for f in cat_fields]))  # category options
cat_rm = list(set(pd.Series(already_expanded).explode().dropna().apply(
    lambda x: "_".join(list(np.array(x.split("_"))[:-1]))).unique(
        )))  # RM categorical fields (stripped of _#)
cat_rm = list(pd.Series([x if data_dict.loc[x + "_1"].field_type in [
    "radio", "dropdown"] else np.nan for x in cat_rm]).dropna())
cat_dict = {**cat_dict, **dict(zip(cat_rm, [
    dict(pd.DataFrame([v.split(", ") for v in data_dict.loc[
        f + "_1"].loc["select_choices_or_calculations"].split(" | ")]).set_index(
            1)[0]) for f in cat_rm]))}  # category options (RM)
print("\n".join([f"{f}: {cat_dict[f]}" for f in cat_dict]))

project: {'Ileal CD': '1', 'UC': '2', 'Zebrafish': '3', 'Mouse Organoid': '4', 'PBMC': '5', 'Perianal CD': '6', 'Ileal CD/PTGER4': '7', 'PSC-IBD': '8', 'CRISPR Screening of Monocytes': '9', 'Ileal CD gp130': '10', 'Other': '888'}
disease: {"Crohn's Disease": '1', 'Ulcerative Colitis': '2', 'Healthy Control': '3', 'Other': '888'}
disease_status: {'Active': '1', 'Inactive': '2', 'In remission': '3', 'Other': '888', 'N/A': '999'}
organism: {'Human': '1', 'Zebrafish': '2', 'Mouse': '3'}
inflam_status: {'Inflamed': '1', 'Non-inflamed': '2', 'Healthy Control': '3', 'Other': '4'}
tissue_origin: {'Terminal Ileum': '1', 'Ascending colon': '2', 'Transverse colon': '3', 'Descending colon': '4', 'Sigmoid colon': '5', 'Sigmoid-rectum': '6', 'Rectum': '7', 'Other': '888', 'N/A': '999'}
inflam_status_chronicity: {'Acute': '1', 'Chronic': '2'}
index_kit: {'Dual Index TT': '1', 'Dual Index TS': '2'}
instrument: {'NovaSeq 6000': '1', 'NextSeq 2000': '2', 'Not Sequenced': '3'}
x_chem_version_sc: {'v1': '

# Major Changes

## Standardization

In [629]:
# Setup
if any(("_old" in c for c in dff.columns)):
    raise NotImplementedError("Change code!!! '_old' is already in >= 1 original variable")
data = dff.copy()
labels_uninflamed = ["uninflamed", "noninflamed", "non-infl", "non-inflamed", "un-inflamed"]
labels_inflamed = ["inflamed"]

# Dates
for x in ["sc_process_date", "pre_amp_date", "date_sent"]:
    data.loc[:, f"{x}_old"] = data.loc[:, x].copy()
    data.loc[:, x] = data.loc[:, x].apply(
        lambda x: x if pd.isnull(x) else "/".join(x.split("-")))

# Project
data.loc[:, "project_old"] = data.loc[:, "project"].copy()
data.loc[:, "project"] = data.loc[:, "project"].apply(
    lambda x: "Zebrafish" if isinstance(
        x, str) and "zebrafish" in x.lower() else x)
data.loc[:, "project"] = data.loc[:, "project"].apply(
    lambda x: "CRISPR Screening of Monocytes" if isinstance(
        x, str) and "crispr" in x.lower() else x)
data.loc[:, "project"] = data.loc[:, "project"].apply(
    lambda x: "UC" if isinstance(x, str) and "UC_ZC_sc" in x else x)

# Project Owner ID
# concatenation of libid_ssids separated by comma, e.g., "CD01_FC01, CD02_FC02"
data = data.assign(libbbbbbbb=data.index.values)  # rename old column
data = data.join(data.apply(lambda r: f"{r['project_owner_id']}_{r['libbbbbbbb']}", 
                         axis=1).to_frame("project_owner_id_lib_id")).drop(
                             "libbbbbbbb", axis=1)  # owner_libid
data = data.join(data.groupby(
    cols_subject[0]).apply(lambda x: ",".join(
        x["project_owner_id_lib_id"])).to_frame("project_owner_id"), 
    lsuffix="_old", on=cols_subject[0])[list(data.drop(
        "project_owner_id_lib_id", axis=1).columns) + [
            "project_owner_id_old"]]  # ownder1_libid1,owner2_libid2, etc.
data = data.drop("project_owner_id_old", axis=1)

# Disease & Disease Status
data.loc[:, "disease_status_old"] = data.loc[:, "disease_status"].copy()
data.loc[:, "disease_old"] = data.loc[:, "disease"].copy()
data = data.replace({"disease": {
    "CD": "Crohn's Disease", "UC": "Ulcerative Colitis", 
    "Healthy Mice": "Healthy Control", 
    "Healthy": "Healthy Control"}})
# cats_disease_status = extract_categories("disease_status", data_dict)
# for c in cats_disease_status:
data.loc[:, "disease_status"] = data["disease_status"].apply(
    lambda x: "In remission" if "rem" in str(
        x).lower() else x)  # standardize "in remission" variants
data.loc[:, "disease_status"] = data["disease_status"].apply(
    lambda x: "Active" if "active" in str(
        x).lower() else x)  # standardize "in remission" variants
# data.loc[:, "disease"] = data.apply(lambda x:  if (pd.isnull(
#     x["disease"])) & (x["organism"] != "Human") else x)  # "other" for fish
data.loc[:, "disease"] = data.disease.apply(lambda x: "Healthy Control" if any(
    ("healthy" in str(x).lower() for i in ["healthy", "hc", "control"])) else x)

# Inflammation Status
data.loc[:, "inflam_status_old"] = data.loc[:, "inflam_status"].copy()
data.loc[:, "inflam_status_new"] = data["inflam_status"].apply(
    lambda x: np.nan if pd.isnull(x) else str("Non-inflamed" if any(
        (p in str(x).strip("-").lower() for p in labels_uninflamed)) else str(
            "Inflamed" if any(
        (q in str(x).strip("-").lower() for q in labels_inflamed)) else np.nan)))
data.loc[:, "inflam_status_chronicity"] = data["inflam_status"].apply(
    lambda x: np.nan if pd.isnull(x) else str("Acute" if "acute" in x.lower(
        ) else str("Chronic" if "chronic" in x.lower() else np.nan)))
data.loc[:, "inflam_status"] = data.loc[:, "inflam_status_new"]
data = data.drop("inflam_status_new", axis=1)
data = data.drop("inflam_status_old", axis=1)
data.loc[(pd.isnull(data.inflam_status)) & (  # HCs NAs for inflam_status
    data.disease == "Healthy Control"), "inflam_status"] = "Healthy Control"

# Chemistry Version
data.loc[:, "x_chem_version_sc_old"] = data["x_chem_version_sc"].copy()
data.loc[:, "x_chem_version_sc"] = data["x_chem_version_sc"].apply(
    lambda x: "NovaSeq 6000" if "nova" in str(x).lower() else x)
# print(data[["x_chem_version_sc_old", "x_chem_version_sc"]])

# Index Kit
data.loc[:, "index_kit_old"] = data["index_kit"].copy()
data.loc[:, "index_kit"] = data["index_kit"].apply(
    lambda x: "Dual Index TT" if "-tt-" in str(
        x).lower() else np.nan if pd.isnull(x) else "Dual Index TS")
# print(data[["index_kit_old", "index_kit"]])

# Instrument
data.loc[:, "instrument_old"] = data["instrument"].copy()
data.loc[:, "instrument"] = data.instrument.apply(
    lambda x: np.nan if pd.isnull(x) else 
    "NovaSeq 6000" if "novaseq" in x.lower() else x)
data.loc[:, "novaseq"] = data.instrument_old.apply(
    lambda x: np.nan if pd.isnull(x) or "novaseq" not in x.lower() else 
    "SP" if "sp" in x.lower() else "S4" if "s4" in x.lower() else 
    "S2" if "s2" in x.lower() else "S1" if "s1" in x.lower() else np.nan)
data.loc[:, "nextseq"] = data.instrument_old.apply(
    lambda x: np.nan if pd.isnull(x) or "novaseq" in x.lower() else 
    "P1" if "p1" in x.lower() else "p2" if "p2" in x.lower() else 
    "p3" if "p3" in x.lower() else np.nan)
# Later rename to novaseq_6000 and nextseq_2000
# Otherwise, weirdness in trying to extract nextseq_2000_2, etc.

# Tissue Origin
data.loc[:, "tissue_origin_old"] = data["tissue_origin"].copy()
tiss_orig = data["tissue_origin"]
data = data.replace({"tissue_origin": {
    "Terminal Ileal Resection": "terminal ileum resection"}})
# cats_tissue = extract_categories("tissue_origin_1", data_dict)
cats_tissue = {"Terminal Ileum": ["ileum", "ileal"],
               "Ascending colon": ["ascending colon"], 
               "Transverse colon": ["transverse colon", "transverse"], 
               "Descending colon": ["descending colon"], 
               "Sigmoid colon": ["sigmoid colon", "sigmoid"], 
               "Sigmoid-rectum": ["sigmoid-rectum"], 
               "Rectum": ["rectum", "rectal"], 
               "Other": ["fistula"]}
data.loc[:, "tissue_origin"] = data.tissue_origin.apply(
    lambda x: np.nan if pd.isnull(x) else list(
        pd.Series([c if any([u.lower() in x.lower() 
                             for u in cats_tissue[c]]) else np.nan 
                   for c in cats_tissue]).dropna())).apply(
                       lambda y: y if not isinstance(y, list) else np.nan if len(
                           y) == 0 else "WARNING" if len(y) > 1 else y[0])
if any(data.loc[:, "tissue_origin"].isin(["WARNING"])):
    raise ValueError("WARNING in tissue_origin")
print(data[["tissue_origin_old", "tissue_origin"]])

# Chemistry Version
data.loc[:, "x_chem_version_sc_old"] = data["x_chem_version_sc"].copy()
data["x_chem_version_sc"] = data["x_chem_version_sc"].apply(
    lambda x: np.nan if pd.isnull(x) else 
    f"v{x}" if "v" not in str(x) else x)  # put "v" in front of version #
data["x_chem_version_sc"] = data["x_chem_version_sc"].apply(
    lambda x: np.nan if pd.isnull(x) else 
    re.sub("[.]0", "", x))  # remove ".0" from version #s


# Print Conversions
print(f"\n\n\n{'=' * 80}\n\nConversions\n\n{'=' * 80}\n\n")
changed_variables = list(np.array(data.columns)[
    np.where(["_old" in c for c in data.columns])[0]])
for y in [re.sub("_old", "", c) for c in changed_variables]:
    print(f"\n\n*** {y}")
    conv = data.apply(lambda x: str(x[f"{y}_old"]) + " -> " + str(x[y]) if str(x[
        y]).lower() != str(x[f"{y}_old"]).lower() else np.nan, axis=1).dropna().unique()
    print("\n".join(list(conv)))

# Dropped Variables
drop_variables = []
for i in dff.columns:
    fields = search_fields("^" + i, data_dict, header=False, print_output=False)
    if len(fields) == 0:
        drop_variables += [i]
print(f"\n\n\nDROP VARIABLES (not in REDCap):\n\n{drop_variables}")
data = data.drop(drop_variables, axis=1).drop(changed_variables, axis=1)

                                                         tissue_origin_old  \
lib_id                                                                       
HH0001                                                                PBMC   
J00002     Larval intestinal dissection; Tyto sorting for GFP+ lymphocytes   
J00003     Larval intestinal dissection; Tyto sorting for GFP+ lymphocytes   
J00004     Larval intestinal dissection; Tyto sorting for GFP+ lymphocytes   
J00005     Larval intestinal dissection; Tyto sorting for GFP+ lymphocytes   
LC0001                                                                PBMC   
LC0002                                                                PBMC   
LC0003                                                                PBMC   
LC0004                                                                PBMC   
LC0005                                                                PBMC   
LC0006                                                          

In [630]:
data[data.loc[:, "tissue_origin"].isin(["WARNING"])]

Unnamed: 0_level_0,record_id,organism,project,disease,disease_status,grid,patient_id,project_owner_id,animal_line,sc_process_date,type_of_experiment,x_chem_version_sc,standard_sample_id,inflam_status,tissue_origin,no_live_cells,cell_viability_percentage,targ_cell,no_live_nuclei,no_nuclei,index_kit,pre_amp_date,date_sent,instrument,inflam_status_chronicity,novaseq,nextseq
lib_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1


## Categories

Display remaining entries that don't conform to available REDCap categories for applicable fields.

In [631]:
# cat_mistakes = data[cat_fields].apply(lambda x: x.apply(
#     lambda y: y if y not in cat_dict[
#         x.name].keys() else np.nan).dropna()).stack().to_frame("Entry").join(
#             data[cat_fields].apply(lambda x: x.apply(
#                 lambda y: ", ".join(cat_dict[x.name].keys()) if y not in cat_dict[
#                     x.name].keys() else np.nan).dropna()).stack().to_frame(
#                         "Categories")).reorder_levels([1, 0]).sort_index()
# cat_mistakes = cat_mistakes.rename_axis(["Field", cat_mistakes.index.names[1]])
# # cat_mistakes = cat_mistakes.reset_index(1, drop=True).drop_duplicates().set_index(
# #     "Entry", append=True).join(cat_mistakes.reset_index().groupby(
# #         ["Field", "Entry"]).apply(
# #             lambda x: ", ".join(x[cat_mistakes.index.names[1]])).to_frame(
# #                 cat_mistakes.index.names[1]), rsuffix="_r")  # concatenate lib_ids
# cat_mistakes = cat_mistakes.reset_index(1, drop=True).drop_duplicates().set_index(
#     "Entry", append=True).join(cat_mistakes.reset_index().groupby(
#         ["Field", "Entry"]).apply(
#             lambda x: ", ".join(x[cat_mistakes.index.names[1]])),
#         rsuffix="_r")  # concatenate lib_ids
# cat_mistakes

## Conclusion

In [632]:
ixs = [np.nan, np.nan]
for x, i in enumerate(["instrument", "index_kit"]):
    nab = data[pd.isnull(data[i])][[i]]  # all NAs
    ixs[x] = pd.Series(nab.index.values).to_frame(
        nab.index.names[0]).assign(missing=i).set_index(
            nab.index.names[0]).missing
ixs = pd.concat(ixs, keys=["instrument", "index_kit"], axis=1)
print(ixs)
print(ixs.index.values)

        instrument  index_kit
lib_id                       
SN0003  instrument  index_kit
SN0004  instrument  index_kit
NH0001  instrument  index_kit
NH0002  instrument  index_kit
NH0003  instrument  index_kit
NH0004  instrument  index_kit
NH0005  instrument  index_kit
NH0006  instrument  index_kit
NH0007  instrument  index_kit
NH0008  instrument  index_kit
NH0009  instrument  index_kit
NH0010  instrument  index_kit
NH0011  instrument  index_kit
NH0012  instrument  index_kit
NH0013  instrument  index_kit
NH0014  instrument  index_kit
NH0015  instrument  index_kit
NH0016  instrument  index_kit
QQ0001  instrument  index_kit
QQ0002  instrument  index_kit
QQ0003  instrument  index_kit
QQ0004  instrument  index_kit
QQ0005  instrument  index_kit
QQ0006  instrument  index_kit
QQ0007  instrument  index_kit
QQ0008  instrument  index_kit
QQ0009  instrument  index_kit
QQ0010  instrument  index_kit
QQ0011  instrument  index_kit
QQ0012  instrument  index_kit
QQ0013  instrument  index_kit
QQ0014  in

In [633]:
# Missingness
missing_bad = {"grid": data.organism == "Human",
               "patient_id": data.organism == "Human",
               "animal_line": data.organism != "Human",
               "disease": data.organism == "Human",
               "disease_status": data.apply(
                   lambda x: (x["organism"].lower() == "human") and (
                       "healthy" not in x["disease"].lower()), axis=1)}
missingness = data.apply(lambda x: sum(pd.isnull(x)))
nawah = dict()
for i in missingness[missingness > 0].index.values:
    if i in missing_bad:
        nab = data[pd.isnull(data[i]) & (missing_bad[i])][list(
                missing_bad.keys())]  # only if NAs unexpected
    else:
        nab = data[pd.isnull(data[i])][[i]]  # all NAs
    nawah.update({i: nab})
    if nab.shape[0] > 0:
        print(f"\n\n{i}\n\n--\n\n{', '.join(nab.index.values)}\n\n")
        print(investigate_fields(i, data_rc=data_dict, 
                                data_meta=dff, pattern=True))
na_counts = "\n".join([f"{i}: {nawah[i].shape[0]}" for i in nawah])
print(f"\n\n{'=' * 80}\n\nMissingness\n\n{'=' * 80}\n\n{na_counts}")



tissue_origin

--

HH0001, J00002, J00003, J00004, J00005, LC0001, LC0002, LC0003, LC0004, LC0005, LC0006, SN0001, SN0002, SN0003, SN0004, SN0005, SN0006, SN0007, SN0008, SN0009, SN0010, SN0011, SN0012, SN0014, NH0001, NH0002, NH0003, NH0004, NH0005, NH0006, NH0007, NH0008, QQ0003, QQ0006, QQ0009, QQ0014, QQ0021, QQ0023, QQ0027, QQ0030, QQ0033, QQ0034, QQ0035, QQ0036, QQ0037, QQ0038, QQ0039, QQ0040, QQ0041, QQ0042, QQ0043, QQ0044, QQ0045, QQ0046, QQ0047, QQ0048, QQ0049, QQ0050, QQ0051, QQ0052, QQ0053, QQ0054, QQ0055, QQ0056, QQ0057, QQ0058, QQ0059


False
tissue_origin




tissue_origin



Changing column tissue_origin to partial match: tissue_origin_1
field_name                          tissue_origin_1
form_name         sample_information_3_gex_multiome
section_header                                     
field_type                                 dropdown
field_label           Tissue Origin (if applicable)
Name: tissue_origin_1, dtype: object

***************************************

## Construction

Repeated Measures Transformation: Google Data to Wide Format

In [634]:
# Dataframe for Transformation
col_rm = "sample"
data_new = data.groupby(cols_subject[0], group_keys=True).apply(
    lambda x: x.assign(sample=np.arange(1, len(x) + 1))).reset_index(
        data.index.names).set_index(
        "sample", append=True)  # set uniform 1:n "measurement" column for RMs
if cols_subject[0] in data_new.columns:
    data_new = data_new.drop(cols_subject[0], axis=1)
data_new.head()

# Fields Overlapping between Google & REDCap
fields_overlap = [data_dict.index.values[np.where([
    re.search("^" + i + "(_[0-9]+)", x) 
    for x in data_dict.index.values])[0]] for i in data_new.columns]
fields_overlap = pd.Series(fields_overlap, index=pd.Index(
    data_new.columns, name="Google")).explode().dropna().to_frame("REDCap")
# print(fields_overlap)

# Repeated Measures Fields
fields_rm_num = fields_overlap.groupby("Google").apply(
    lambda x: pd.Series(list(x["REDCap"].str.strip(
        f"{x.name}_").astype(int).reset_index(0, drop=True)), 
                        index=pd.Index(x["REDCap"], name="REDCap")).to_frame(
            "Number") if len(x["REDCap"]) > 1 else np.nan)  # measurement #
google_rm_num = data_new.groupby(cols_subject[0]).apply(lambda x: x.apply(
    lambda y: len(y.unique()))).stack()  # Google sheet: # unique values / subject
google_rm_num = google_rm_num[google_rm_num > 1]  # within-subject-varying only
google_rm_num.index.names = [cols_subject[0], "Field"]
cols_needed = google_rm_num.reset_index(0, drop=True).groupby("Field").apply(
    max)  # maximum # unique values / Google column = # REDCap RM categories needed
fields_rm_num.Number.groupby("Google").max().min()
rm_num = fields_rm_num.Number.groupby("Google").max().to_frame("REDCap").join(
    cols_needed.to_frame("Google"))  # # of REDCap RM categories vs. # needed
if any(rm_num.Google > rm_num.REDCap):
    print(rm_num[rm_num.Google < rm_num.REDCap])
    raise ValueError("Code to add more REDCap repeated measures fields required!")
# cols_needs_fields = list(set(pd.unique(
#     google_rm_num.reset_index(0).index.values)).intersection(
#         data_dict.field_name))  # Google RM columns that need fields in RC
# print(f"{'=' * 80}\n\n\nNeed new RC RM fields:\n\n{cols_needs_fields}\n\n")

# Long to Wide Format
data_wide = data_new.reset_index()[pd.unique([cols_subject[0], col_rm] + list(
    rm_num.index.values))].reset_index()
# data_wide["column_name"] = data_wide[col_rm].astype(str)
data_wide = data_wide.astype(object).pivot(
    index=cols_subject[0], columns=col_rm, values=rm_num.index.values)
data_wide.columns = [f"{c[0]}_{c[1]}" for c in data_wide.columns]
repeat_fin = data_new.drop(rm_num.index.values, axis=1).groupby(
    cols_subject[0]).apply(
    lambda s: s.apply(lambda x: len(x.unique()) > 1)).stack()
if repeat_fin.any():
    print(repeat_fin[repeat_fin].reset_index(1)["level_1"].unique())
    raise ValueError("Columns varying w/i-subject retained in data_new!")
data_new = data_wide.join(data_new.drop(rm_num.index.values, axis=1).groupby(
    cols_subject[0]).apply(lambda s: s.apply(
        lambda x: x.unique()[0])))  # join bt- & wi-subject columns (wide)
data_new = data_new.replace({"nan": np.nan})
data_new = data_new.assign(redcap_event_name=redcap_event_name)
for i in data_dict.index.difference(data_new.columns):
    data_new[i] = np.nan
data_new = data_new[["redcap_event_name"] + list(
    set(data_dict.index.values[1:]))]

# type_of_experiment
data_new = data_new.join(data_new[[f"type_of_experiment_{x}" for x in range(
    1, int(max_rm) + 1)]].apply(lambda x: False if all(x.isnull()) else any(
        (not pd.isnull(i) and str(i).lower() != "scrna" for i in x)
        ), axis=1).replace(False, 0).replace(True, 1).to_frame(
            "type_of_experiment___atac"))
data_new = data_new.join(data_new[[f"type_of_experiment_{x}" for x in range(
    1, int(max_rm) + 1)]].apply(lambda x: False if all(x.isnull()) else any(
        (not pd.isnull(i) and str(i).lower() == "scrna" for i in x)
        ), axis=1).replace(False, 0).replace(True, 1).to_frame(
            "type_of_experiment___3prime"))

# Renaming
data_new = data_new.rename(dict(zip([f"type_of_experiment_{i}" for i in range(1, int(max_rm) + 1)], [f"experiment_{i}" for i in range(1, int(max_rm) + 1)])), axis=1)
data_new = data_new.rename(dict([(f"nextseq_{i}", f"nextseq_2000_{i}") for i in range(1, int(max_rm) + 1)]), axis=1)
data_new = data_new.rename(dict([(f"novaseq_{i}", f"novaseq_6000_{i}") for i in range(1, int(max_rm) + 1)]), axis=1)
data_new = data_new.rename(dict([("nextseq_{i}", f"nextseq_2000_{i}") for i in range(1, int(max_rm) + 1)]), axis=1)

# Numbers of Cells/Nuclei
for x in ["no_live_cells", "no_live_nuclei", "no_nuclei", "targ_cell"]:
    for i in range(1, int(max_rm) + 1):
        # data_new = data_new.astype({f"{x}_{i}": "Int64"})
        data_new[f"{x}_{i}"] = data_new[f"{x}_{i}"].astype(object)
        if any(("Est" in str(i) for i in data_new.loc[:, f"{x}_{i}"])):
            for q in data_new.loc[:, f"{x}_{i}"].index.values:
                if "Est" in str(data_new.loc[q, f"{x}_{i}"]):
                    thou = "K" in str(data_new.loc[q, f"{x}_{i}"])
                    print(data_new.loc[q, f"{x}_{i}"])
                    data_new.loc[q, f"{x}_{i}"] = re.sub("Est: ", "", re.sub(
                        "K", "", data_new.loc[q, f"{x}_{i}"]))
                    if thou:
                        data_new.loc[q, f"{x}_{i}"] = float(data_new.loc[q, f"{x}_{i}"]) * 1000
                    print(data_new.loc[q, f"{x}_{i}"], "\n\n")
        data_new.loc[:, f"{x}_{i}"] = data_new.loc[:, f"{x}_{i}"].replace("No load", 0)
        data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
            "", np.nan).astype(float).astype("Int64")
        # data_new.loc[:, f"{x}_{i}"] = data_new.loc[:, f"{x}_{i}"].replace(
        #     np.nan, "").apply(lambda x: x if x == "" else re.sub(
        #         ".0", "", str(x))).astype(str)
    
data_new = data_new.dropna(how="all", axis=1)

# Save New Data & Dictionary
# data_dict.to_csv("data_dictionary_new.csv", na_rep="")
data_new.to_csv("data_new.csv", na_rep="")
data_new.head()

  data_wide = data_wide.astype(object).pivot(
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[i] = np.nan
  data_new[

Est: 497.5K
497500.0 


Est: 213.5K
213500.0 


Est: 497.5K
497500.0 


Est: 213.5K
213500.0 




  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_new.loc[:, f"{x}_{i}"] = data_new[f"{x}_{i}"].replace(
  data_n

Unnamed: 0_level_0,redcap_event_name,pre_amp_date_5,no_nuclei_3,index_kit_3,instrument_3,instrument_5,standard_sample_id_6,targ_cell_2,targ_cell_3,experiment_6,x_chem_version_sc_5,date_sent_1,sc_process_date_1,inflam_status_6,standard_sample_id_3,lib_id_2,standard_sample_id_4,experiment_2,targ_cell_5,no_live_nuclei_1,inflam_status_3,no_live_nuclei_2,disease,sc_process_date_6,inflam_status_chronicity_2,...,cell_viability_percentage_1,cell_viability_percentage_4,sc_process_date_2,pre_amp_date_2,inflam_status_2,pre_amp_date_1,lib_id_1,sc_process_date_3,no_live_cells_2,tissue_origin_3,experiment_5,tissue_origin_2,cell_viability_percentage_5,cell_viability_percentage_6,patient_id,lib_id_4,x_chem_version_sc_1,date_sent_3,no_live_cells_6,no_live_cells_5,standard_sample_id_1,instrument_6,no_nuclei_4,type_of_experiment___atac,type_of_experiment___3prime
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
FC_Hu_AA,forms_arm_1,,,,,,,,,,,,08/01/2020,,,,,,,,,,Healthy Control,,,...,,,,,,,QQ0033,,,,,,,,AA,,v3.1,,,,FC_Hu_AA_PBMC,,,1,0
FC_Hu_EA,forms_arm_1,,,,,,,,,,,,08/01/2020,,,,,,,,,,Healthy Control,,,...,,,,,,,QQ0034,,,,,,,,EA,,v3.1,,,,FC_Hu_EA_PBMC,,,1,0
FC_Hu_IL1,forms_arm_1,,,,,,,,,,,,10/01/2020,,,QQ0032,,CITE-seq,,,,,Crohn's Disease,,,...,,,10/01/2020,,Non-inflamed,,QQ0031,,,,,Terminal Ileum,,,IL1,,v3.1,,,,FC_Hu_IL1_Inf,,,1,0
FC_Hu_IL2,forms_arm_1,,,,,,,5000.0,,,,04/08/2021,03/01/2021,,,CD0002,,CITE-seq,,,,,Crohn's Disease,,,...,65.2,,03/01/2021,04/01/2021,Non-inflamed,04/01/2021,CD0001,,100000.0,,,Terminal Ileum,,,IL2,,v3.1,,,,FC_Hu_IL2_Inf,,,1,0
FC_Hu_IL3,forms_arm_1,,,,,,,5000.0,,,,04/08/2021,03/01/2021,,,CD0004,,CITE-seq,,,,,Crohn's Disease,,,...,69.2,,03/01/2021,04/01/2021,Non-inflamed,04/01/2021,CD0003,,296000.0,,,Terminal Ileum,,,IL3,,v3.1,,,,FC_Hu_IL3_Inf,,,1,0


### Data Dictionary

In [635]:
new_field_stems = ["type_of_experiment", "date_sent", "pre_amp_date", 
                   "instrument", "sc_process_date", "sample", 
                   "novaseq_6000", "nextseq_2000"
                   #, 
                #    "inflam_status_chronicity", "x_chem_version_sc", 
                #    "lib_id", "index_kit"
                   ]
# new_field_stems = pd.unique(["_".join(f.split("_")[:-1]) for f in fields])
drc = pd.read_csv(dd_dict_csv, index_col=0).rename_axis(
    "Variable / Field Name")
for x in ["instrument", "novaseq_6000", "nextseq_2000"]: 
    max_choice = int(drc.loc[x]["Choices, Calculations, OR Slider Labels"].split(" | ")[-1].split(", ")[0]) + 1
    drc.loc[x, "Choices, Calculations, OR Slider Labels"] = drc.loc[x]["Choices, Calculations, OR Slider Labels"] + f" | {max_choice}, Not Sequenced"

for i in range(1, int(max_rm) + 1):
    drc.loc[f"cell_viability_percentage_{i}", "Text Validation Type OR Show Slider Number"] = "number"
    drc.loc[f"cell_viability_percentage_{i}", "Text Validation Max"] = 100
    drc.loc[f"cell_viability_percentage_{i}", "Text Validation Min"] = 0
    drc.loc[f"cell_viability_percentage_{i}", "Field Type"] = "text"

rows = []
for i in drc.index.values:
    if i in new_field_stems:
        r_s = [drc.loc[[i]]] if i in ["type_of_experiment", "instrument"] else []
        keys = [i] if i in ["type_of_experiment"] else []
        
        # RMs
        for q in range(1, int(max_rm) + 1):
            ddd = drc.loc[[i]]
            if any((s in str(ddd["Branching Logic (Show field only if...)"]) for s in new_field_stems)):
                for s in set(new_field_stems).difference(["type_of_experiment"]):
                    if s in str(ddd["Branching Logic (Show field only if...)"]):
                        ddd.loc[i, "Branching Logic (Show field only if...)"] = re.sub(s, f"{s}_{q}", str(ddd["Branching Logic (Show field only if...)"].loc[i]))
            ddd.loc[i, "Field Label"] = f"{ddd.loc[i, 'Field Label']} {q}"
            if i == "type_of_experiment": 
                ddd.loc[i, "Choices, Calculations, OR Slider Labels"] = "1, scRNA | 2, Multiome - Gene Expression | 3, Multiome - ATAC | 5, CITE-seq | 888, Other, please specify"
                ddd.loc[i, "Field Type"] = "dropdown"
                keys += [f"experiment_{q}"]
            else:
                keys += [f"{i}_{q}"]
            r_s += [ddd.drop(drc.index.names[0], axis=1) if drc.index.names[0] in ddd else ddd]
            
        # Concatenate RMs & Add to List of Rows
        r_s = pd.concat(r_s, keys=keys, names=list(
            drc.index.names)).reset_index(1, drop=True)
        rows += [r_s.loc[[i]] for i in r_s.index.values]
    else:
        if "index_kit" in i: 
            print(drc.loc[[i]])
        rows += [drc.loc[[i]]]
new = pd.concat(rows, names=list(drc.index.names)).replace(
    "nan", "").replace(np.nan, "").apply(lambda y: y.apply(
        lambda x: re.sub(".0", "", str(x)
                         ) if "Validation M" in y.name and x != "" else x))
if drc.index.names[0] in new.columns:
    new = new.drop(drc.index.names[0], axis=1)
# new = new.rename(dict(zip([f"type_of_experiment_{i}" for i in range(1, int(max_rm) + 1)], [f"experiment_{i}" for i in range(1, int(max_rm) + 1)])), axis=1)
new = new.reset_index().drop_duplicates().set_index(new.index.names)

new.to_csv("data_dictionary_new.csv", na_rep="", index=True, 
           index_label=drc.index.names[0])
new.head()

                                         Form Name Section Header Field Type  \
Variable / Field Name                                                          
index_kit_1            library_prep_3_gex_multiome            NaN      radio   

                      Field Label Choices, Calculations, OR Slider Labels  \
Variable / Field Name                                                       
index_kit_1             Index Kit     1, Dual Index TT | 2, Dual Index TS   

                      Field Note Text Validation Type OR Show Slider Number  \
Variable / Field Name                                                         
index_kit_1                  NaN                                        NaN   

                       Text Validation Min  Text Validation Max  Identifier?  \
Variable / Field Name                                                          
index_kit_1                            NaN                  NaN          NaN   

                      Branching Logic (Show fiel

                                         Form Name Section Header Field Type  \
Variable / Field Name                                                          
index_kit_scm          library_prep_3_gex_multiome            NaN      radio   

                      Field Label  \
Variable / Field Name               
index_kit_scm           Index Kit   

                                             Choices, Calculations, OR Slider Labels  \
Variable / Field Name                                                                  
index_kit_scm          1, Dual Index TT | 2, Dual Index TS | 3, Single Index Plate N   

                      Field Note Text Validation Type OR Show Slider Number  \
Variable / Field Name                                                         
index_kit_scm                NaN                                        NaN   

                       Text Validation Min  Text Validation Max  Identifier?  \
Variable / Field Name                                           

Unnamed: 0_level_0,Form Name,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Field Note,Text Validation Type OR Show Slider Number,Text Validation Min,Text Validation Max,Identifier?,Branching Logic (Show field only if...),Required Field?,Custom Alignment,Question Number (surveys only),Matrix Group Name,Matrix Ranking?,Field Annotation
Variable / Field Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
record_id,sample_information_3_gex_multiome,,text,Record ID,,Standardized subject ID (or Sequencing Batch ID if applicable). Please follow the convention: SN_Ze_KO1,,,,,,,,,,,
lib_id_1,sample_information_3_gex_multiome,,text,"ID UNIQUE TO THE SAMPLE AND MEASUREMENT. This should not ever be repeated, even for the same subject, or a different run of the same sample.",,,,,,,,y,,,,,
lib_id_2,sample_information_3_gex_multiome,,text,,,,,,,,,,,,,,
lib_id_3,sample_information_3_gex_multiome,,text,,,,,,,,,,,,,,
lib_id_4,sample_information_3_gex_multiome,,text,,,,,,,,,,,,,,


# Notes

## Updates

### August 2

- Code to do the following has been completed and tested

      * Modify data dictionary to include new repeated measures fields (e.g., "tissue_origin_1," "tissue_origin_2,"...)
      * Pivot Google sheets data to wide format
      * Detect additional potential issues preventing transformation
      
- We'll need to allow NA categories for repeated measures fields b/c not everyone will have max (~8) samples

      * But for some of those, we won't want to allow missingness AS LONG AS any variable is recorded for that measurement occasion
      * How to enforce? Check for _# on every other variable or something?

### July 31

#### Missingness

- We still have pretty substantial missingness on several important categorical variables. (These numbers exclude missingness where entries should be NA (e.g., disease_status for healthy controls), at least as far as I remembered the applicable conditions.)
    * disease_status (55)
    * IDs (18 grid and 5 patient_id)
    * instrument (87)
- We also have missingness on date variables (pre_amp_date, date_sent, sc_process_date) and some non-categorical variables (no_live_cells, cell_viability_percentage, targ_cell, no_live_nuclei, no_nuclei); these missing values may or may not be appropriate (I haven’t investigate) and regardless won't necessarily break the transition since we can usually just put NAs for non-categorical fields, but I wanted to note it for completeness.

#### Inflammation Status Categories
I’ve sent Judy an email regarding the inflammation status entries that don’t fit under current inflamm_status REDCap categories (e.g., DSS-related ones).

## Meetings

### Chris

Take out leading 00 in libid except for QQ samples

Consult Judy about inflam_status "treatment" categories (e.g., DSS)

Others must fill in NAs for certain things

Allow NAs for disease_status since QQXXXX lib_ids won't ever have data for it?

Issues from Today’s Work @Chris Tastad 
We created a "Healthy mice" category for "disease," but I failed to notice branching logic only lets it fill in for humans. Change branching, or remove Healthy mice category?

Blanks (b/c respecting branching logic) vs. NAs -- some mixed up in Google sheet, so need to fix there or programmatically by using extracted branching logic?


### (Email) Ksenija

Once we clean the metadata, @Ksenija will make changes related to using drop-down boxes for past free-entry/”other” specifiers so people don’t repeat random variations of free-entry choices.

RE #1(a) below (“inflam_status”): 

Ksenija will standardize, e.g., collapse '2X DSS + 10uM BZA' and 2X DSS --> 10uM BZA'  into one category.

We should get Judy's input on what to keep (1X vs 2X).

Some info can be redirected to notes_sample_prime_1-8.

#1(c): Ksenija will talk to Felix about “inflam_status” and “disease_status.”

#3 (index_kit)

Ksenija will fill in missing values on the Google sheets metadatabase.

I will create an NA category. Also, Ksenija noticed a "Single Index Plate N" category, but only under Multiome-ATAC.

#6 (instrument): Ksenija will fill in missing values for Martin et al. data.


### Ksenija, Chris, and Elizabeth (07/14/2023)

Inflammation status ("inflam_status" REDCap field)
      (a) Didn't discuss because Ksenija answered by email, but I'm realizing I'm still not entirely certain how these should be mapped given the choices of 1, Inflamed | 2, Non-inflamed | 3, Healthy Control | 4, Other, please specify
   	Should these all be "other?"
   	- '350nM hGM-CSF'

'0.25% DSS 1X'

'0.25% DSS 1X + 350nM hGM-CSF'

'2X 0.075% DSS'

'2X DSS + 10uM BZA'

'2X DSS --> 10uM BZA'

'0.075% DSS 1X'

'0.075% DSS 2X'

'z'
(b) See #5
      (c) Felix needs to fix

Tissue type specification ("tissue_origin")
      (a) and (b) I will make larval intestinal dissection category
      (c) N/A for blood and other non-tissue

Kit ("index_kit")
      (a) XX-TT-XX = TT; XX-NA -> new category that I’ll create; TS is for fixed, but not included in metadatabase
      (b) Ksenija will fill and track people down; I will create category for NA

Ksenija will fill in entries of "pending"

Missing values for "pre_amp_date"
      (a) They should be blank because not multiome
      (b) Keep

Instrument specifications
      (a) "nova_seq_6000" field should be fed the flow cell info contained in "instrument"
      (b) Martin et al. we may not know, but Felix, Rachel, Shika, maybe others have missing and shouldn’t. Senija will track people down and/or fix

x_chem_version_sc: Some of these should be missing, but others shouldn’t, so Chris changed those — done 

Some people improperly treated project_owner_id as sample ID rather than subject ID — should be same across different samples within-subject

We need it to be the same across record IDs, or we’ll have to create project_owner_1, etc. fields

project_owner_id: concatenation of libid_ssids separated by comma, e.g., "CD0001_FC01, CD0002_FC02"

standard_sample_id: Replicates across samples fine for different types of experiments — I'll check programmatically that this is true

Rachel’s samples where libid = AA1inf and AA1non: Chris isn’t sure these even are the libids…don’t rename, just have to deal with it

Implement drop-down of past specifiers?

### (Email) Original

Inflammation status ("inflam_status" REDCap field)

      (a) For consistency, I'd like to be able to convert all entries to the available REDCap categories of "uninflamed" or "inflamed." There are other values entered in the meta-database, such as "untreated." Some appear to be for non-human models and/or manipulations: "Untreated," "no treatment," "TNF-a treatment," and "DSS treated." (See the post-script at the end of this email for more values.) It does seem that these happen entirely or almost entirely for mouse and zebrafish samples. Let me know how I should code these (e.g., does "DSS treated" mean DSS-induced colitis, and should that be coded as UC, other, or should we create animal model categories)?
      (b) Some entries also have other additional information (e.g., fistula presence) that can only be entered if the "other" option is selected for "status_inflam." Is there another variable under which people can store such additional key information if "status_inflam" is not "other?" Should we have a category for things like "fistula"?
      (c) Should "PBMC" entries be re-coded as NA?

Tissue type specification ("tissue_origin")
      (a) How should we specify the larval tissue type or the animal intestinal dissection for the samples?
      (b) Should we alter the "tissue_origin" variable to have a category for animal tissue? The relevant values in the meta-database are "larval intestinal dissection; Tyto sorting for GFP+ lymphocytes" and "larval intestinal dissection."
      (c) What about "whole blood?"
      
Kit ("index_kit")
      (a) Entries in the meta-database take the form XX-TT-XX or XX-NA-XX (with varying letters/numbers in place of XX). Can you confirm that entries with "TT" correspond to "Dual Index TT?" What about NA? Do they correspond with "Dual Index TS?"
      (b) There are also a lot of missing values.
      
"Pending" entries: Could you clarify the meaning and implications of entries of "pending" for a couple variables (one observation, I think)?

Disease status ("disease_status")
      (a) We have come across an entry labeled as "left side" under the "disease_status" variable. Does this indicate that the disease is active specifically on the left side? Please advise.
      (b) Similar to in (1), there is extra information included (e.g., proctosigmoiditis, fistula, stenosis). Should we just ignore the extra information, create extra fields, or find a pre-existing specifier field under which to include this information?
      (c) We also appear to have some missing values for these for non-healthy control human subjects.
      (d) Also, for the "disease" field, should we translate "healthy mice" into the "healthy control" or "other" category?

Missing values for "pre_amp_date"
      (a) Should all blank values be converted to NA?
      (b) Are we dropping this variable? It's a field in REDCap, but Chris mentioned we may not need it as it's documented elsewhere and perhaps not of prime significance.

Instrument specifications
      (a) Currently, we classify any instrument as "Nova-Seq" or "Next-Seq." We would like to confirm if you prefer to change the categorization from instrument type (Nova Seq vs. Next Seq) to the flow cell used.
      (b) Are the NA values in the database true missing values, or should these be re-coded as "Next-Seq?" There are no other entries that seem to correspond to "NextSeq" in the meta-database.
      

Other missing values:
      - x_chem_version_sc
      - index_kit
      
Thank you in advance for your help!

Best,
Elizabeth

More "inflam_status" values:

'350nM hGM-CSF'

'0.25% DSS 1X'

'0.25% DSS 1X + 350nM hGM-CSF'

'2X 0.075% DSS'

'2X DSS + 10uM BZA'

'2X DSS --> 10uM BZA'

'0.075% DSS 1X'

'0.075% DSS 2X'

'z'

# Transfer

## Prepare

In [636]:
# Options
unique_id = "lib_id"
path_file = "data_new.csv"
path_config = os.path.join(os.path.expanduser("~"), 
                           ".ssh/config_redcap.json")
cols_should_be_unique = ["standard_sample_id"]
project = "Cho Lab Single Cell Sample Metadatabase"
remove_leading_zeros_lib_id = True
overwriteBehavior = "normal"  # so blank doesn't overwrite filled

# Load Google Sheets Database
dff = data_new
# dff = pd.read_csv(path_file)
if remove_leading_zeros_lib_id:
    for i in range(1, 9):
        if f"lib_id_{i}" in dff.columns:
            dff[f"lib_id_{i}"] = dff[f"lib_id_{i}"].apply(
                lambda x: x if pd.isnull(x) else x.lstrip(
                    "00"))  # remove leading zeros from unique ID

# Load Data Dictionary
with open(path_config, "r") as json_file:
    config = json.load(json_file)
api_url, token = config[project]["url"], config[project]["token"]
drc = get_redcap_metadata(project, api_url, token)
# data_dict = pd.concat([pd.Series(x, name=x["field_name"]) 
#                        for x in drc["data_dictionary"]], axis=1).T


# # Convert Categorical Values to REDCap Codes
# for f in dff.columns:
#     print(f)
#     if f in data_dict.index.values and data_dict.loc[f].field_type in [
#         "radio", "dropdown"]:
#             cats = data_dict.loc[f].loc[
#                 "select_choices_or_calculations"].split(" | ")
#             dff = dff.drop(f, axis=1).join(
#                 dff[f].apply(lambda x: x if pd.isnull(x) else dict(
#                     pd.DataFrame([v.split(", ") for v in cats]).set_index(1)[0])[x]))

data_dict = new
col = "Choices, Calculations, OR Slider Labels"

# Convert Categorical Values to REDCap Codes
for f in dff.columns:
    print(f)
    if f in data_dict.index.values and data_dict.loc[f]["Field Type"] in [
        "radio", "dropdown"]:
            cats = data_dict.loc[f].loc[col].split(" | ")
            dff = dff.drop(f, axis=1).join(
                dff[f].apply(lambda x: x if pd.isnull(x) else dict(
                    pd.DataFrame([v.split(", ") for v in cats]).set_index(1)[0])[x]))

HTTP Status: 200
HTTP Status: 200
redcap_event_name
pre_amp_date_5
no_nuclei_3
index_kit_3
instrument_3
instrument_5
standard_sample_id_6
targ_cell_2
targ_cell_3
experiment_6
x_chem_version_sc_5
date_sent_1
sc_process_date_1
inflam_status_6
standard_sample_id_3
lib_id_2
standard_sample_id_4
experiment_2
targ_cell_5
no_live_nuclei_1
inflam_status_3
no_live_nuclei_2
disease
sc_process_date_6
inflam_status_chronicity_2
no_live_cells_1
inflam_status_chronicity_1
lib_id_6
experiment_3
no_live_cells_3
tissue_origin_1
index_kit_6
inflam_status_4
organism
x_chem_version_sc_4
lib_id_3
instrument_1
inflam_status_chronicity_3
no_nuclei_6
tissue_origin_5
cell_viability_percentage_3
project
standard_sample_id_2
no_live_nuclei_3
x_chem_version_sc_2
x_chem_version_sc_6
pre_amp_date_4
instrument_4
date_sent_5
index_kit_4
tissue_origin_4
targ_cell_6
sc_process_date_4
no_live_nuclei_4
lib_id_5
index_kit_1
disease_status
no_nuclei_5
targ_cell_4
grid
inflam_status_5
index_kit_2
animal_line
no_nuclei_2
sc_

## Categorized Data New

In [657]:
form_cols = []
arms_dict = {"sample_information_3_gex_multiome": "forms_arm_1", 
             "sequencing": "forms_arm_2", 
             "sequencing_qc": "forms_arm_3"}
for x in new["Form Name"].unique():
    d_n = dff[new[new["Form Name"] == x].index.intersection(dff.columns)]
    if x in arms_dict:
        d_n.loc[:, "redcap_event_name"] = arms_dict[x]
        d_n.to_csv(f"data_new_{arms_dict[x]}.csv", na_rep="", index_label=dff.index.names[0])
        form_cols += list(d_n.columns)
    # else:
        # print(f"{'=' * 80}\n\n{x}\n{d_n.columns}")
        
print(dff.columns.difference(form_cols))
d_n = dff[dff.columns.difference(form_cols)]
d_n.loc[:, "redcap_event_name"] = "forms_arm_1"
d_n.to_csv(f"data_new_other.csv", na_rep="", index_label=dff.index.names[0])

Index(['index_kit_1', 'index_kit_2', 'index_kit_3', 'index_kit_4',
       'index_kit_5', 'index_kit_6', 'pre_amp_date_1', 'pre_amp_date_2',
       'pre_amp_date_3', 'pre_amp_date_4', 'pre_amp_date_5', 'pre_amp_date_6',
       'type_of_experiment___3prime', 'type_of_experiment___atac'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_n.loc[:, "redcap_event_name"] = arms_dict[x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_n.loc[:, "redcap_event_name"] = arms_dict[x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_n.loc[:, "redcap_event_name"] = arms_dict[x]
A value is trying to be set on a copy of a slice from a DataFram

In [641]:
dff.to_csv("data_new.csv", index_label="record_id")