## title

**paper:** []() - 

**date, curator:** 2025-0, Sara Carsanaro

**resources**

**notes**

### annotation summary
run these when annotation is complete

In [None]:
# cell type annotations
display_df(cell_df_complete)

In [None]:
# organ annotations
display_df(organ_df_complete)

In [None]:
# stage annotations
display_df(stage_df_complete)

### set variables, import packages, define functions

In [19]:
import os

# id of the experiment being annotated; every per-experiment path below is derived from it
experiment_id = ""

## helper scripts and experiment type (passed to the scripts in the `! python3` cells)
path_to_create_exp_script = "/Users/scarsana/Desktop/git/scRNA-Seq/scripts/Create_ExpLib_tables.py"
path_to_barcodes_script = "/Users/scarsana/Desktop/git/scRNA-Seq/scripts/create_scRNASeq_barcode_v2.py"
experiment_type = "scRNA"

## per-experiment output folder inside the notebook directory
path_to_output_main = "/Users/scarsana/Desktop/git/expression-annotations/Notebooks/scRNA/"
path_to_output = "{}{}/".format(path_to_output_main, experiment_id)

## initial files (this folder is handed to the helper scripts)
path_to_initial_folder = "{}initial/".format(path_to_output)
# exist_ok=True replaces the check-then-create pattern and keeps the cell safe to re-run
os.makedirs(path_to_initial_folder, exist_ok=True)

library_path_from_script = "{}scRNASeqLibrary_{}.tsv".format(path_to_initial_folder, experiment_id)
experiment_path_from_script = "{}scRNASeqExperiment_{}.tsv".format(path_to_initial_folder, experiment_id)
barcode_path = "{}scRNASeq_barcode_{}.tsv".format(path_to_initial_folder, experiment_id)

## completed (annotated) files
path_to_complete_folder = "{}complete/".format(path_to_output)
os.makedirs(path_to_complete_folder, exist_ok=True)

barcode_to_add_path = "{}complete_scRNASeq_barcode_{}.tsv".format(path_to_complete_folder, experiment_id)
library_to_add_path = "{}complete_scRNASeqLibrary_{}.tsv".format(path_to_complete_folder, experiment_id)
# NOTE(review): this name lacks the "sc" prefix used by the other files
# ("complete_RNASeqExperiment_" vs "scRNASeqExperiment_") - confirm whether that is intended
experiment_to_add_path = "{}complete_RNASeqExperiment_{}.tsv".format(path_to_complete_folder, experiment_id)
script_file = "{}.ipynb".format(experiment_id)
# the inner double quotes are part of the value so the message stays one shell argument in `! git commit -m`
commit_message_exp = '"adding annotated scRNA experiment {}"'.format(experiment_id)
commit_message_py = '"adding annotation files for {} to notebook folder"'.format(experiment_id)

## to add to git
path_to_git_annotations = "/Users/scarsana/Desktop/git/expression-annotations/scRNA_Seq/"
git_library_path = "{}scRNASeqLibrary_merged.tsv".format(path_to_git_annotations)
git_experiment_path = "{}scRNASeqExperiment.tsv".format(path_to_git_annotations)
git_barcode_path = "{}scRNASeq_barcode_{}.tsv".format(path_to_git_annotations, experiment_id)

## columns (used to select/reorder columns before the library and barcode files are saved)
library_cols = ['#libraryId', 'experimentId', 'platform', 'SRSId', 'anatId', 'anatName', 'cellTypeId', 'cellTypeName', 'stageId', 'stageName', 'url_GSM', 'infoOrgan', 'infoCellType_abInitio', 'infoCellType_inferred', 'clusterId', 'clusterName', 'infoStage', 'anatAnnotationStatus', 'cellTypeAnnotationStatus', 'stageAnnotationStatus', 'sex', 'strain', 'genotype', 'speciesId', 'RNAseqTags', 'protocol', 'protocolType', 'lib_name', 'sampleName', 'comment', 'condition', 'annotatorId', 'lastModificationDate']
barcode_cols = ['barcode', 'cluster', 'library', 'experiment', 'tissue', 'cell_type', 'anatId_a_posteriori', 'anatName_a_posteriori', 'anat_a_posteriori_annotationStatus', 'cellTypeId', 'cellTypeName', 'cellTypeAnnotationStatus', 'name_Library', 'comments']

## annotation sheets (kept in their own folder)
annotation_files_path = "{}annotation_files/".format(path_to_output)
os.makedirs(annotation_files_path, exist_ok=True)

info_anat_path = "{}info_anat.tsv".format(annotation_files_path)
info_anat_complete_path = "{}info_anat_complete.tsv".format(annotation_files_path)
info_stage_path = "{}info_dev_stage.tsv".format(annotation_files_path)
info_stage_complete_path = "{}info_dev_stage_complete.tsv".format(annotation_files_path)
info_celltype_path = "{}info_celltype.tsv".format(annotation_files_path)
info_celltype_complete_path = "{}info_celltype_complete.tsv".format(annotation_files_path)

## barcode file & matching library names - 10X only
path_to_barcode_file = "{}barcode_{}.tsv".format(path_to_output, experiment_id)
lib_names_path = "{}matching_lib_names_{}.tsv".format(path_to_initial_folder, experiment_id)



In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import csv
import scanpy as sc

# displays df with the scrollbar next to the DataFrame
def display_df(df):
    """Render ``df`` as an HTML table inside a fixed-height, scrollable box.

    The row index is hidden. ``Styler.to_html`` has no ``index`` parameter
    (unknown keywords are only forwarded to the template and ignored), so the
    index is hidden on the Styler itself before rendering.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to show in the notebook output.
    """
    # these options are global and persist after the call, so later plain
    # DataFrame output/prints in the notebook are shown untruncated as well
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    table_html = df.style.hide(axis="index").to_html()
    display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
        table_html + "</div>"))

# function that compares two columns in a dataframe and tells you which ones are not equal (case insensitive)
def compare_columns(df, col1, col2, return_col):
    """Print the rows of ``df`` where ``col1`` and ``col2`` differ, ignoring case.

    Rows where both values are missing are treated as equal; a row with a
    value on one side and a missing value on the other is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the columns to compare.
    col1, col2 : str
        Names of the two string columns to compare.
    return_col : str
        Name of an identifying column printed next to each mismatch.
    """
    lowered_1 = df[col1].str.lower()
    lowered_2 = df[col2].str.lower()
    # NaN != NaN evaluates to True, so rows that are empty in both columns
    # have to be excluded explicitly or they would be reported as different
    both_missing = lowered_1.isna() & lowered_2.isna()
    differs = (lowered_1 != lowered_2) & ~both_missing
    if not differs.any():
        print("The two columns are equal (case insensitive)")
    else:
        print("The following rows are not equal: ")
        print(df.loc[differs, [return_col, col1, col2]])

# fixes formatting of file to match libreoffice settings/historic file format
def update_format(path):
    """Rewrite the file at ``path`` in place, turning quoted empty fields into bare empty fields.

    Every occurrence of a tab followed by two double quotes is replaced by
    the tab alone; nothing else in the file is changed.

    Parameters
    ----------
    path : str
        Path of the tab-separated file to rewrite.
    """
    # explicit UTF-8 (the default encoding pandas uses for to_csv) so the
    # round trip does not depend on the platform's locale encoding
    with open(path, 'r', encoding='utf-8') as file:
        filedata = file.read()
    # Replace the target string
    filedata = filedata.replace("\t\"\"", "\t")
    # Write the file out again
    with open(path, 'w', encoding='utf-8') as file:
        file.write(filedata)

# checks for duplicate values in a specific column and prints those values + the corresponding library id
def dup_check(df, column):
    """Print every row of ``df`` whose value in ``column`` occurs more than once.

    For columns other than ``'#libraryId'`` the output shows ``'#libraryId'``
    next to the duplicated value, sorted by that value. For ``'#libraryId'``
    itself only the duplicated ids are printed. A short message is printed
    when there are no duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to check; must contain a ``'#libraryId'`` column.
    column : str
        Name of the column to check for repeated values.
    """
    # keep=False marks every member of a duplicated group, not only the repeats.
    # The mask is left in df's row order so it lines up with df without reindexing.
    is_duplicated = df.duplicated(subset=[column], keep=False)
    if not is_duplicated.any():
        print("no duplicate values in " + column)
        return
    if column == '#libraryId':
        # selecting ['#libraryId', column] here would repeat the same column
        print(df.loc[is_duplicated, ['#libraryId']])
    else:
        print(df.loc[is_duplicated, ['#libraryId', column]].sort_values(by=column))

# prints all unique values in a specific column
def unique_sorted(df, column):
    """Print the distinct values of ``df[column]`` in sorted order.

    Missing values, if any, are listed once at the end. Sorting is done by
    pandas so that a column mixing strings and NaN does not raise a
    ``TypeError`` the way an in-place sort of the raw array does.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column.
    column : str
        Name of the column whose unique values are printed.
    """
    unique = df[column].drop_duplicates().sort_values(na_position="last").to_numpy()
    print(unique)

### 10X - barcodes and annotations
Before annotating a 10X experiment, make sure that the barcodes and cell type annotations are available. Places to check include GEO, the supplemental materials of the paper, and the project's GitHub repository; if they are not available there, email the authors.

below is code for h5ad, h5, and csv files. rds files should be accessed in R with readRDS 

In [None]:
# h5ad file metadata access
adata = sc.read("")
metadata = adata.obs

In [None]:
# h5 file metadata access
# h5py is not imported anywhere else in this notebook, so import it in this cell
import h5py

h5 = h5py.File("",'r')
# most likely place for metadata is attrs
# also can use keys() instead of items to just get the names
print(list(h5.attrs.items()))
print(list(h5.items()))

In [None]:
meta = h5.attrs.get('')
meta = h5.get('')

In [None]:
# csv file metadata access

### script

In [None]:
! python3 $path_to_create_exp_script $experiment_id $path_to_initial_folder $experiment_type

### library annotations

In [None]:
library = pd.read_csv(library_path_from_script, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
display_df(library)

#### anatomical entity
* [OLS Uberon](https://www.ebi.ac.uk/ols4/ontologies/uberon)

In [None]:
infoOrgan = library['infoOrgan'].drop_duplicates().sort_values()
organ_df = pd.DataFrame(infoOrgan)
organ_df = organ_df.reindex(columns=[*organ_df.columns.tolist(), 'notes_anat', 'anatId_', 'anatName_', 'anatAnnotationStatus_', 'infoOrgan_new', 'comment_anat'], fill_value="")
organ_df.to_csv(info_anat_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)
if not os.path.exists(info_anat_complete_path): 
    organ_df.to_csv(info_anat_complete_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)

# options for anatAnnotationStatus: perfect match, missing child term, other

In [None]:
organ_df_complete = pd.read_csv(info_anat_complete_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
display_df(organ_df_complete)

In [None]:
# how='left' keeps every library row even if its infoOrgan has no entry in the
# annotation sheet (the default inner join would silently drop those rows);
# validate raises if the sheet lists an infoOrgan twice, which would duplicate library rows
library = library.merge(organ_df_complete, how='left', left_on='infoOrgan', right_on='infoOrgan', validate='many_to_one')

In [None]:
library['anatId'] = library['anatId_'].values
library['anatName'] = library['anatName_'].values
library['anatAnnotationStatus'] = library['anatAnnotationStatus_'].values

In [None]:
## optional - update infoOrgan, add anat-specific comments 
#library['infoOrgan'] = library['infoOrgan_new'].values
#library['comment'] = library[['comment', 'comment_anat']].agg(' '.join, axis=1)

In [None]:
# reformat, save progress
library = library[library_cols]
display_df(library)
library.to_csv(library_to_add_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)

#### stage
- [species specific developmental ontologies](https://github.com/obophenotype/developmental-stage-ontologies/tree/master/src/ontology/components)

In [None]:
infoStage = library['infoStage'].drop_duplicates().sort_values()
stage_df = pd.DataFrame(infoStage)
stage_df = stage_df.reindex(columns=[*stage_df.columns.tolist(), 'notes_stage', 'stageId_', 'stageName_', 'stageAnnotationStatus_', 'infoStage_new','comment_stage'], fill_value="")
stage_df.to_csv(info_stage_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)
if not os.path.exists(info_stage_complete_path): 
    stage_df.to_csv(info_stage_complete_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)

# options for stageAnnotationStatus: perfect match, missing child term, other

In [None]:
stage_df_complete = pd.read_csv(info_stage_complete_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
display_df(stage_df_complete)

In [None]:
# how='left' keeps every library row even if its infoStage has no entry in the
# annotation sheet (the default inner join would silently drop those rows);
# validate raises if the sheet lists an infoStage twice, which would duplicate library rows
library = library.merge(stage_df_complete, how='left', left_on='infoStage', right_on='infoStage', validate='many_to_one')

In [None]:
library['stageId'] = library['stageId_'].values
library['stageName'] = library['stageName_'].values
library['stageAnnotationStatus'] = library['stageAnnotationStatus_'].values

In [None]:
## optional - update infoStage, add stage-specific comments
#library['infoStage'] = library['infoStage_new'].values
#library['comment'] = library[['comment', 'comment_stage']].agg(' '.join, axis=1)

In [None]:
# reformat, save progress
library = library[library_cols]
display_df(library)
library.to_csv(library_to_add_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)

#### cell type - smart-seq
* [OLS Cell Ontology](https://www.ebi.ac.uk/ols4/ontologies/cl)

In [None]:
cell_type = library['infoCellType_abInitio'].drop_duplicates().sort_values()
cell_df = pd.DataFrame(cell_type)
cell_df = cell_df.reindex(columns=[*cell_df.columns.tolist(), 'notes_cell', 'cellTypeId_', 'cellTypeName_', 'cellTypeAnnotationStatus_', 'cell_type_new','comment_cell'], fill_value="")
cell_df.to_csv(info_celltype_path, sep="\t", index=False)
if not os.path.exists(info_celltype_complete_path): 
    cell_df.to_csv(info_celltype_complete_path, sep="\t", index=False)

In [None]:
cell_df_complete = pd.read_csv(info_celltype_complete_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
display_df(cell_df_complete)

In [None]:
# how='left' keeps every library row even if its infoCellType_abInitio has no entry in the
# annotation sheet (the default inner join would silently drop those rows);
# validate raises if the sheet lists a cell type twice, which would duplicate library rows
library = library.merge(cell_df_complete, how='left', left_on='infoCellType_abInitio', right_on='infoCellType_abInitio', validate='many_to_one')

In [None]:
library['cellTypeId'] = library['cellTypeId_'].values
library['cellTypeName'] = library['cellTypeName_'].values
library['cellTypeAnnotationStatus'] = library['cellTypeAnnotationStatus_'].values

In [None]:
## optional - update infoCellType_inferred, add cell type-specific comments
#library['infoCellType_inferred'] = library['cell_type_new'].values
#library['comment'] = library[['comment', 'comment_cell']].agg(' '.join, axis=1)

In [None]:
# reformat, save progress
library = library[library_cols]
display_df(library)
library.to_csv(library_to_add_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)

#### sex, strain, genotype, speciesId
- uniprot [strain list](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/strains)
- uniprot [species list](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist)
- bgee [strain mapping](https://gitlab.sib.swiss/Bgee/expression-annotations/-/tree/develop/Strains?ref_type=heads)
- sex options: M (male), F (female), NA (not available, unknown), mixed (both male and female)

In [None]:
#library.loc[library["sex"] == "male", "sex"] = "M"
#library.loc[library["sex"] == "female", "sex"] = "F"

#library.loc[:,'strain'] = ''

#library.loc[:,'genotype'] = ''

#library.loc[:,'speciesId'] = ''

# view
display_df(library)

#### protocol & RNAseqTags


In [None]:
# making these variables because we use them again in the experiment file
# usually 10X Genomics V1/V2/V3/V3.1, Smart-seq, Smart-seq2
my_protocol = ''
# Full-length or 3'end or 5'end
my_protocolType = ""

library.loc[:,'protocol'] = my_protocol
library.loc[:,'protocolType'] = my_protocolType
# scRNA-seq, Sn-scRNA-seq
library.loc[:,'RNAseqTags'] = ''

# view
display_df(library)

#### condition

In [None]:
# ex. control, diet, light, reproductive capacity, time post mortem, time post feeding, 
# exercise details, menstruation, personality, litter size 
#library.loc[library["condition"] == "old", "condition"] = "new"

# view
display_df(library)

#### annotator id, last modification date

In [None]:
library.loc[:,'annotatorId'] = 'SAC'
library.loc[:,'lastModificationDate'] = '2025-0'

# view
display_df(library)

#### comments

In [None]:
#library.loc[:,'comment'] = ''

#### save complete file with correct columns

In [None]:
library_file_complete = library[library_cols]

library_file_complete.to_csv(library_to_add_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)

# view
display_df(library_file_complete)

### script - barcode (10X only)

In [None]:
barcode_column = "barcode"
library_column = "library_name"
cell_type_column = "clusters"

In [None]:
! python3 $path_to_barcodes_script $path_to_initial_folder --scRNASeqLibrary $library_to_add_path --barcode_file $path_to_barcode_file --barcode_col $barcode_column --libName_col $library_column --cellType_col $cell_type_column --ignore_check --no_additional_file

#### check library names match

In [None]:
lib_names = pd.read_csv(lib_names_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
display_df(lib_names)

### cell type annotation - 10X version
* [OLS Cell Ontology](https://www.ebi.ac.uk/ols4/ontologies/cl)

In [None]:
barcode_file = pd.read_csv(barcode_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)

In [None]:
cell_type = barcode_file['cell_type'].drop_duplicates().sort_values()
cell_df = pd.DataFrame(cell_type)
cell_df = cell_df.reindex(columns=[*cell_df.columns.tolist(), 'notes_cell', 'cellTypeId_', 'cellTypeName_', 'cellTypeAnnotationStatus_', 'cell_type_new','comment_cell'], fill_value="")
cell_df.to_csv(info_celltype_path, sep="\t", index=False)
if not os.path.exists(info_celltype_complete_path): 
    cell_df.to_csv(info_celltype_complete_path, sep="\t", index=False)

# options for cellTypeAnnotationStatus: perfect match, missing child term, other

In [None]:
cell_df_complete = pd.read_csv(info_celltype_complete_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
display_df(cell_df_complete)

In [None]:
# how='left' keeps every barcode row even if its cell_type has no entry in the
# annotation sheet (the default inner join would silently drop those rows);
# validate raises if the sheet lists a cell_type twice, which would duplicate barcode rows
barcode_file = barcode_file.merge(cell_df_complete, how='left', left_on='cell_type', right_on='cell_type', validate='many_to_one')

In [None]:
barcode_file['cellTypeId'] = barcode_file['cellTypeId_'].values
barcode_file['cellTypeName'] = barcode_file['cellTypeName_'].values
barcode_file['cellTypeAnnotationStatus'] = barcode_file['cellTypeAnnotationStatus_'].values

In [None]:
# optional - update cell type name and add comments
#barcode_file['cell_type'] = barcode_file['cell_type_new'].values
#barcode_file['comments'] = barcode_file[['comment_cell']]

In [None]:
barcode_file_complete = barcode_file[barcode_cols]
display_df(barcode_file_complete)
barcode_file_complete.to_csv(barcode_to_add_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)

### experiment annotations

In [None]:
experiment = pd.read_csv(experiment_path_from_script, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
experiment.head()

#### experiment and protocol details

In [None]:
library_to_add = pd.read_csv(library_to_add_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)

# this will give you the number of rows in the complete library file 
# this should be the number of annotated libraries
ann_lib = len(library_to_add.index)
len(library_to_add.index)

In [None]:
# partial or total
experiment.loc[:,'experimentStatus'] = ''
# scRNA-seq, Sn-scRNA-seq for both we put: 'Sn-scRNA-seq, scRNA-seq'
experiment.loc[:,'RNAseqTags'] = ''


# see above cell, also can add as free text
experiment.loc[:,'numberOfAnnotatedLibraries'] = ann_lib

# these variables should already exist from above but if not can just add as free text
experiment.loc[:,'protocol'] = my_protocol
experiment.loc[:,'protocolType'] = my_protocolType

experiment.head()

#### paper and xrefs

In [None]:
#experiment.loc[:,'GSE'] = ''
#experiment.loc[:,'Bioproject'] = '' 
experiment.loc[:,'PMID'] = ''
experiment.loc[:,'reference_url'] = ''
experiment.loc[:,'DOI'] = ''
#experiment.loc[:,'xrefs'] = ''

experiment.head()

#### comments

In [None]:
#experiment.loc[:,'comment'] = ''

experiment.head()

#### save complete file

In [None]:
experiment.to_csv(experiment_to_add_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)

### QA time

In [None]:
library_to_add = pd.read_csv(library_to_add_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
experiment_to_add = pd.read_csv(experiment_to_add_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)

#### checks to add here
* add check that experimentDescription in experiment file isn't blank or NA
* add check that all anat, celltype, stage columns are filled out
    * celltype can be blank if 10X experiment but should have a flag for that
* add check that all annotation status columns are filled out

#### check columns match

In [None]:
# pull from git and pull in library/experiment file
! git pull
git_library = pd.read_csv(git_library_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
git_experiment = pd.read_csv(git_experiment_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)

# library file, then experiment file: compare the column names of the new file
# with the file in git (order is ignored) and, on a mismatch, list the columns
# that are only on one side so the problem can be fixed without a manual diff
for file_label, new_df, git_df in (('library', library_to_add, git_library),
                                   ('experiment', experiment_to_add, git_experiment)):
    new_columns = set(new_df.columns)
    git_columns = set(git_df.columns)
    if new_columns == git_columns:
        print('The columns in the {} file match'.format(file_label))
    else:
        print('The columns in the {} file DO NOT MATCH'.format(file_label))
        print('    only in the new file: {}'.format(sorted(new_columns - git_columns)))
        print('    only in the git file: {}'.format(sorted(git_columns - new_columns)))


# TODO: maybe shorten these messages to something like "COLUMNS GOOD - LIBRARY" and "COLUMNS BAD - EXPERIMENT"

#### view files

In [None]:
library_git_plus_new = pd.concat([git_library, library_to_add], ignore_index = True, sort = False)
old_length = git_library.shape[0]
start = old_length - 2
end = old_length + 5
view_lib = library_git_plus_new.iloc[start:end]
view_lib

In [None]:
experiment_git_plus_new = pd.concat([git_experiment, experiment_to_add], ignore_index = True, sort = False)
experiment_git_plus_new.tail(n=3)

In [None]:
# 10X only
barcode_file_for_git = pd.read_csv(barcode_to_add_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
barcode_file_for_git.head(10)

### add annotations to git

In [None]:
! git pull

In [None]:
library_git_plus_new.to_csv(git_library_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)
update_format(git_library_path)
experiment_git_plus_new.to_csv(git_experiment_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)
update_format(git_experiment_path)

In [None]:
# 10X only
barcode_file_for_git.to_csv(git_barcode_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)
update_format(git_barcode_path)

In [None]:
! git status

In [None]:
# 10X (barcode file)
! git add $git_experiment_path $git_library_path $git_barcode_path 

In [None]:
# smart-seq (no barcode file)
! git add $git_experiment_path $git_library_path  

In [None]:
! git status

In [None]:
! git commit -m $commit_message_exp

In [None]:
! git push

### add annotation folder and script to git

first - run annotation summary cells, save ipynb, export to html 

In [None]:
! git status

In [None]:
! git add $path_to_output

In [None]:
! git commit -m $commit_message_py

In [None]:
! git push