## title

**paper:** []() - 

**date, curator:** 2024-0, Sara Carsanaro

**resources**

**notes**

### set variables, import packages, define functions

In [20]:
# Accession of the experiment being annotated; it is embedded in the output
# directory name below.
experiment_id = "SRP404481"
experiment_type = "scRNA"

# Helper scripts launched later in this notebook through shell commands.
path_to_create_exp_script = "/Users/anneniknejad/Bgee/scripts/scripts/Create_ExpLib_tables.py"
path_to_barcodes_script = "/Users/anneniknejad/Bgee/scripts/scripts/create_scRNASeq_barcode_v2.py"

# Per-experiment working directory (note the trailing slash: file names are
# appended to it directly).
path_to_output = f"/Users/anneniknejad/SCRNA/{experiment_id}/"
import os
# Create the per-experiment output directory. exist_ok=True keeps the cell safe
# to re-run and removes the separate exists() check (no check-then-create gap);
# it still raises if the path exists but is not a directory.
os.makedirs(path_to_output, exist_ok=True)
# Files inside the per-experiment output directory, all named after the experiment id.
library_path_from_script = f"{path_to_output}scRNASeqLibrary_{experiment_id}.tsv"
experiment_path_from_script = f"{path_to_output}scRNASeqExperiment_{experiment_id}.tsv"
barcode_path = f"{path_to_output}scRNASeq_barcode_{experiment_id}.tsv"
# The message carries its own double quotes so that it stays a single argument
# when it is interpolated into the `git commit -m` shell command.
commit_message_exp = f'"adding annotated scRNA experiment {experiment_id}"'



## destinations inside the git annotations checkout
path_to_git_annotations = "/Users/anneniknejad/expression-annotations/scRNA_Seq/"
# Shared library and experiment tables that new rows are appended to.
git_library_path = f"{path_to_git_annotations}scRNASeqLibrary_merged.tsv"
git_experiment_path = f"{path_to_git_annotations}scRNASeqExperiment.tsv"
# The barcode table is stored per experiment.
git_barcode_path = f"{path_to_git_annotations}scRNASeq_barcode_{experiment_id}.tsv"

## columns
# Columns selected (in this order) when the completed library file is saved.
library_cols = [
    '#libraryId',
    'experimentId',
    'platform',
    'SRSId',
    'anatId',
    'anatName',
    'cellTypeId',
    'cellTypeName',
    'stageId',
    'stageName',
    'url_GSM',
    'infoOrgan',
    'infoCellType_abInitio',
    'infoCellType_inferred',
    'clusterId',
    'clusterName',
    'infoStage',
    'anatAnnotationStatus',
    'cellTypeAnnotationStatus',
    'stageAnnotationStatus',
    'sex',
    'strain',
    'genotype',
    'speciesId',
    'RNAseqTags',
    'protocol',
    'protocolType',
    'lib_name',
    'sampleName',
    'comment',
    'condition',
    'annotatorId',
    'lastModificationDate',
]
# Column names of the barcode table.
barcode_cols = [
    'barcode',
    'cluster',
    'library',
    'experiment',
    'tissue',
    'cell_type',
    'anatId_a_posteriori',
    'anatName_a_posteriori',
    'anat_a_posteriori_annotationStatus',
    'cellTypeId',
    'cellTypeName',
    'cellTypeAnnotationStatus',
    'name_Library',
    'comments',
]


## 10X only: barcode metadata file and the library-name matching table
# path_to_barcode_file is assigned in the barcode-script section of this notebook
# rather than here; the generic template is kept for reference:
#path_to_barcode_file = "{}barcode_{}.tsv".format(path_to_output, experiment_id)
lib_names_path = f"{path_to_output}matching_lib_names_{experiment_id}.tsv"



In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import csv
import scanpy as sc

def display_df(df):
    """Render ``df`` as an HTML table inside a 200px-high scrollable box.

    Side effect: sets the global pandas options ``display.max_rows`` and
    ``display.max_columns`` to ``None``.
    """
    for option_name in ("display.max_rows", "display.max_columns"):
        pd.set_option(option_name, None)
    # NOTE(review): `index=False` is forwarded to Styler.to_html as an extra
    # keyword -- confirm that it actually hides the index in the rendered table.
    table_html = df.style.to_html(index=False)
    scroll_box_open = "<div style='height: 200px; overflow: auto; width: fit-content'>"
    display(HTML(scroll_box_open + table_html + "</div>"))

def compare_columns(df, col1, col2, return_col):
    """Report rows where two text columns differ, ignoring case.

    Args:
        df: DataFrame holding the columns.
        col1, col2: names of the two string columns to compare.
        return_col: column whose values are printed for the rows that differ.

    Prints a confirmation when every row matches, otherwise the ``return_col``
    values of the mismatching rows. Rows where *both* values are missing count
    as equal (NaN never compares equal to itself, so a plain ``!=`` would flag
    them); a value missing on only one side is still reported.
    """
    left = df[col1].str.lower()
    right = df[col2].str.lower()
    both_missing = left.isna() & right.isna()
    mismatch = (left != right) & ~both_missing
    if not mismatch.any():
        print("The two columns are equal (case insensitive)")
    else:
        print("The following rows are not equal: ")
        print(df.loc[mismatch, return_col])

# fixes formatting of file to match libreoffice settings/historic file format
def update_format(path):
    with open(path, 'r') as file:
        filedata = file.read()
    # Replace the target string
    filedata = filedata.replace("\t\"\"", "\t")
    # Write the file out again
    with open(path, 'w') as file:
        file.write(filedata)

def dup_check(df, column):
    """Print the rows whose value in ``column`` occurs more than once.

    Args:
        df: DataFrame to inspect; it must contain a ``#libraryId`` column
            whenever duplicates are found.
        column: name of the column to check.

    Prints a "no duplicate values" message when every value is unique.
    Otherwise prints the ``#libraryId`` of each duplicated row, together with
    the duplicated value (sorted by it) when ``column`` is not ``#libraryId``.
    """
    # Keep the mask in the frame's own row order: sorting it would force pandas
    # to re-align it on every lookup and emit a reindexing UserWarning.
    is_duplicated = df.duplicated(subset=[column], keep=False)
    if not is_duplicated.any():
        print("no duplicate values in " + column)
    elif column != '#libraryId':
        dups = df.loc[is_duplicated, ['#libraryId', column]]
        print(dups.sort_values(by=column))
    else:
        print(df.loc[is_duplicated, ['#libraryId']])

def unique_sorted(df, column):
    """Print the distinct values of ``df[column]`` in ascending order."""
    distinct_values = df[column].unique()
    # Sorts the array returned by unique() in place.
    distinct_values.sort()
    print(distinct_values)

### script

In [None]:
# Run the experiment/library table script for this experiment.
# Positional arguments, in order: experiment id, output directory, experiment type.
# Presumably this writes the scRNASeqLibrary_/scRNASeqExperiment_ TSVs that the
# following cells read from the output directory -- confirm against the script.
! python3 $path_to_create_exp_script $experiment_id $path_to_output $experiment_type

### manual annotation time

### links for annotation
- [species specific developmental ontologies](https://github.com/obophenotype/developmental-stage-ontologies/tree/master/src)
- uniprot [strain list](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/strains)
- uniprot [species list](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist)
- bgee [strain mapping](https://gitlab.sib.swiss/Bgee/expression-annotations/-/tree/develop/Strains?ref_type=heads)
- [bulk_kits.csv file](https://gitlab.sib.swiss/Bgee/scRNA-Seq/-/blob/main/scripts/bulk_kits.csv)
- [taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy)
- sex options: M (male), F (female), NA (not available, unknown), mixed (both male and female)
- protocolType options: full_length, 3'

#### save complete file with correct columns

In [13]:
# Reload the script-generated library table as text: every column is read as
# object, and only the listed tokens are treated as missing values.
library = pd.read_csv(library_path_from_script, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)

# Keep exactly the expected columns, in their canonical order. The explicit copy
# gives an independent frame, so the assignments below do not act on a slice of
# `library` (which can raise SettingWithCopyWarning).
library_file_complete = library[library_cols].copy()

# Stamp every row with the annotator and the modification date.
library_file_complete.loc[:,'annotatorId'] = 'ANN'
library_file_complete.loc[:,'lastModificationDate'] = '2025-01-07'
# Overwrites the script output in place with the completed, fully-quoted table.
library_file_complete.to_csv(library_path_from_script, sep="\t", index=False, quoting=csv.QUOTE_ALL)


### script - barcode (10X only)

In [14]:
# Inputs for the barcode script (10X only): the metadata file and the names of
# the columns to read from it.
cell_type_column = "SubclusterAnnotation"
library_column = "orig.ident"
# NOTE(review): the barcode column is set to the same metadata column as the
# library column -- confirm this is intended for this dataset.
barcode_column = "orig.ident"
path_to_barcode_file = f"{path_to_output}GSE216542_RNA_Metadata_Final.tsv"

In [None]:
# Run the barcode script (10X only) on the output directory, passing the saved
# library file, the barcode metadata file and the three column names set above.
# NOTE(review): --no_additional_file is passed, yet the next cell reads
# matching_lib_names_<experiment>.tsv -- confirm that file is still produced.
! python3 $path_to_barcodes_script $path_to_output --scRNASeqLibrary $library_path_from_script  --barcode_file $path_to_barcode_file --barcode_col $barcode_column --libName_col $library_column --cellType_col $cell_type_column --ignore_check --no_additional_file

#### check library names match

In [None]:
# Load the library-name matching table as text and show it in a scrollable box.
lib_names = pd.read_csv(
    lib_names_path,
    sep='\t',
    index_col=False,
    keep_default_na=False,
    na_values=['NULL','null', 'nan','NaN', " "],
    dtype=object,
)
display_df(lib_names)

### QA time

In [17]:
# Annotated library table for this experiment, read as text.
library_to_add = pd.read_csv(
    library_path_from_script,
    sep='\t',
    index_col=False,
    keep_default_na=False,
    na_values=['NULL','null', 'nan','NaN', " "],
    dtype=object,
)
# Matching experiment table, read with the same settings.
experiment_to_add = pd.read_csv(
    experiment_path_from_script,
    sep='\t',
    index_col=False,
    keep_default_na=False,
    na_values=['NULL','null', 'nan','NaN', " "],
    dtype=object,
)

#### to add things here
* add check that experimentDescription in experiment file isn't blank or NA
* add check that all anat, celltype, stage columns are filled out
    * celltype can be blank if 10X experiment but should have a flag for that
* add check that all annotation status columns are filled out

#### check columns match

In [None]:
# pull from git and pull in library/experiment file
! git pull
git_library = pd.read_csv(git_library_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)
git_experiment = pd.read_csv(git_experiment_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN', " "], dtype=object)

# library file
if set(library_to_add.columns) == set(git_library.columns):
    print('The columns in the library file match')
else:
    print('The columns in the library file DO NOT MATCH')

# experiment file
if set(experiment_to_add.columns) == set(git_experiment.columns):
    print('The columns in the experiment file match')
else:
    print('The columns in the experiment file DO NOT MATCH')


# maybe to make this something more like "COLUMNS GOOD - LIBRARY" and "COLUMNS BAD - EXPERIMENT"

#### view files

In [None]:
# Append the new library rows after the existing git rows (fresh 0..n-1 index).
library_git_plus_new = pd.concat([git_library, library_to_add], ignore_index = True, sort = False)
old_length = git_library.shape[0]
# Show the last two pre-existing rows plus the first five new ones. The start is
# clamped at 0 so a git table with fewer than two rows does not produce a
# negative index, which iloc would count from the end of the frame.
start = max(old_length - 2, 0)
end = old_length + 5
view_lib = library_git_plus_new.iloc[start:end]
view_lib

In [None]:
# Append the new experiment rows after the existing git rows, then show the
# last five rows of the merged table.
experiment_git_plus_new = pd.concat(
    [git_experiment, experiment_to_add],
    ignore_index=True,
    sort=False,
)
experiment_git_plus_new.tail(5)

In [None]:
# 10X only: load the barcode table as text and preview its first 20 rows
barcode_file_for_git = pd.read_csv(
    barcode_path,
    sep='\t',
    index_col=False,
    keep_default_na=False,
    na_values=['NULL','null', 'nan','NaN', " "],
    dtype=object,
)
barcode_file_for_git.head(20)

### add annotations to git

In [None]:
# Pull again right before the shared annotation files are overwritten below.
! git pull

In [None]:
# Write each merged table over its git copy (every field quoted), then unquote
# the empty fields: library table first, experiment table second.
for merged_table, destination_path in (
    (library_git_plus_new, git_library_path),
    (experiment_git_plus_new, git_experiment_path),
):
    merged_table.to_csv(destination_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)
    update_format(destination_path)

In [None]:
# 10X only: write the barcode table into the git checkout (every field quoted),
# then unquote its empty fields
barcode_file_for_git.to_csv(
    git_barcode_path,
    sep="\t",
    index=False,
    quoting=csv.QUOTE_ALL,
)
update_format(git_barcode_path)

In [None]:
# Review the working-tree changes before staging anything.
! git status

In [None]:
# 10X (barcode file): stage the experiment, library and barcode files.
# The cell below is the smart-seq variant without the barcode file.
! git add $git_experiment_path $git_library_path $git_barcode_path 

In [None]:
# smart-seq (no barcode file): stage only the experiment and library files.
# The cell above is the 10X variant that also stages the barcode file.
! git add $git_experiment_path $git_library_path  

In [None]:
# Check what is staged before committing.
! git status

In [None]:
# commit_message_exp already contains its own double quotes, so the whole
# message reaches git as a single -m argument.
! git commit -m $commit_message_exp

In [None]:
# Publish the commit to the remote.
! git push