## Basic Bulk Template - manual annotation

This is a very basic template for manually annotating a bulk RNA-Seq experiment.

In [60]:
import os

## set by hand for every experiment
experiment_id = "SRP158448"

# the output folder always lives under BULK/<experiment_id>/ and is created if it does not exist yet
path_to_output = f"/Users/anneniknejad/BULK/{experiment_id}/"
if not os.path.exists(path_to_output):
    os.makedirs(path_to_output)

## script that creates the exp/lib tables, and the git checkout holding the main annotation files
path_to_create_exp_script = "/Users/anneniknejad/Bgee/scripts/scripts/Create_ExpLib_tables.py"  ## could also make this path to scripts but kinda unnecessary for bulk
path_to_git_annotations = "/Users/anneniknejad/expression-annotations/RNA_Seq/"

## values derived from the settings above
experiment_type = "bulk"
library_path = f"{path_to_output}RNASeqLibrary_{experiment_id}.tsv"
experiment_path = f"{path_to_output}RNASeqExperiment_{experiment_id}.tsv"
git_library_path = f"{path_to_git_annotations}RNASeqLibrary.tsv"
git_experiment_path = f"{path_to_git_annotations}RNASeqExperiment.tsv"

# the value carries its own double quotes
commit_message_exp = f'"adding annotated bulk experiment {experiment_id}"'

# columns of the per-experiment library file that are carried over to the main library file, in order
library_cols = [
    '#libraryId', 'experimentId', 'platform', 'SRSId',
    'anatId', 'anatName', 'stageId', 'stageName',
    'url_GSM', 'infoOrgan', 'infoStage',
    'anatAnnotationStatus', 'anatBiologicalStatus', 'stageAnnotationStatus',
    'sex', 'strain', 'genotype', 'speciesId',
    'protocol', 'protocolType', 'RNASelection', 'globin_reduction',
    'replicate', 'lib_name', 'sampleName',
    'sampleAge_value', 'sampleAge_unit',
    'PATOid', 'PATOname', 'comment', 'condition', 'physiologicalStatus',
    'annotatorId', 'lastModificationDate',
]
# alternative output folder (disabled):
#path_to_output = "/Users/anneniknejad/BULK/testhack/" 

In [61]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import os
import csv
pd.set_option('display.max_columns', 500)

# displays df with the scrollbar next to the DataFrame
def display_df(df):
    """Show ``df`` as an HTML table inside a 300px-high scrollable box.

    Side effect: sets the pandas options ``display.max_rows`` and
    ``display.max_columns`` to ``None`` (no limit); they are not restored
    afterwards.

    NOTE(review): ``index=False`` is passed to ``Styler.to_html``; confirm it
    actually hides the index, since the rendered tables still appear to show
    an index column.
    """
    for option_name in ("display.max_rows", "display.max_columns"):
        pd.set_option(option_name, None)

    table_markup = df.style.to_html(index=False)
    scroll_box = ("<div style='height: 300px; overflow: auto; width: fit-content'>" +
                  table_markup + "</div>")
    display(HTML(scroll_box))

# fixes formatting of file to match libreoffice settings/historic file format
def update_format(path):
    """Rewrite the tab-separated file at ``path`` so empty fields are unquoted.

    The annotation tables are written with ``csv.QUOTE_ALL``, which stores an
    empty cell as ``""``. This function turns every such cell that follows a
    tab into a bare empty cell, matching the historic file format. Non-empty
    fields keep their quotes. As before, an empty FIRST field of a line is
    not touched (it is not preceded by a tab).

    Parameters
    ----------
    path : str
        File to rewrite in place.
    """
    import re

    with open(path, 'r') as file:
        filedata = file.read()

    # Only drop `""` when it is the WHOLE field, i.e. directly followed by the
    # next tab, a line break or the end of the file. A plain
    # str.replace('\t""', '\t') would also eat the opening of a field whose
    # value starts with a literal quote (written as `"""text"""`) and corrupt it.
    filedata = re.sub(r'\t""(?=\t|\r|\n|$)', '\t', filedata)

    with open(path, 'w') as file:
        file.write(filedata)

# checks for duplicate values in a specific column and prints those values + the corresponding library id
def dup_check(df, column):
    """Print the rows of ``df`` whose value in ``column`` occurs more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect; must contain a ``'#libraryId'`` column and ``column``.
    column : str
        Name of the column to check for repeated values.

    Prints ``"no duplicate values in <column>"`` when every value is unique.
    Otherwise prints the ``'#libraryId'`` and ``column`` values of all rows
    involved in a duplication, sorted by ``column``. When ``column`` is
    ``'#libraryId'`` itself, only that column is printed, in table order.
    Returns ``None``.
    """
    # keep=False marks every member of a duplicated group, not just the repeats.
    # The mask is left in table order so it lines up with df row for row
    # (sorting it would force pandas to re-align it and emit a warning).
    is_duplicated = df.duplicated(subset=[column], keep=False)

    if not is_duplicated.any():
        print("no duplicate values in " + column)
        return

    if column == '#libraryId':
        # selecting ['#libraryId', column] here would repeat the same column twice
        print(df.loc[is_duplicated, ['#libraryId']])
    else:
        print(df.loc[is_duplicated, ['#libraryId', column]].sort_values(by=column))

### script - create experiment/library files 

In [62]:
# Run the table-creation script in a shell; it is passed the experiment id, the output folder and the experiment type, in that order
! python3 $path_to_create_exp_script $experiment_id $path_to_output $experiment_type

Be patient, it may take a few minutes.
06-Nov-2024 11:45:05 DEBUG utils - Directory ./ already exists. Skipping.
06-Nov-2024 11:45:05 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE157nnn/GSE157044/soft/GSE157044_family.soft.gz to ./GSE157044_family.soft.gz
100%|██████████████████████████████████████| 5.03k/5.03k [00:00<00:00, 9.23kB/s]
06-Nov-2024 11:45:07 DEBUG downloader - Size validation passed
06-Nov-2024 11:45:07 DEBUG downloader - Moving /var/folders/b5/fvxprl_95kd1hckf52s4qhlr0000gp/T/tmph_mmddyc to /Users/anneniknejad/expression-annotations/Notebooks/bulk/GSE157044_family.soft.gz
06-Nov-2024 11:45:07 DEBUG downloader - Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE157nnn/GSE157044/soft/GSE157044_family.soft.gz
06-Nov-2024 11:45:07 INFO GEOparse - Parsing ./GSE157044_family.soft.gz: 
06-Nov-2024 11:45:07 DEBUG GEOparse - DATABASE: GeoMiame
06-Nov-2024 11:45:07 DEBUG GEOparse - SERIES: GSE157044
06-Nov-2024 11:45:07 DEBUG GEOparse - PLATF

### can add automation to annotation here

This is where code could be added to automate parts of the annotation; the details still need to be discussed.

### links for annotation
- [species specific developmental ontologies](https://github.com/obophenotype/developmental-stage-ontologies/tree/master/src)
- uniprot [strain list](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/strains)
- uniprot [species list](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist)
- bgee [strain mapping](https://gitlab.sib.swiss/Bgee/expression-annotations/-/tree/develop/Strains?ref_type=heads)
- [bulk_kits.csv file](https://gitlab.sib.swiss/Bgee/scRNA-Seq/-/blob/main/scripts/bulk_kits.csv)
- [taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy)

### library file CV
- sex options: M (male), F (female), NA (not available, unknown), mixed (both male and female)
- protocolType options: full_length, 3'
- RNASelection options: polyA, ribo-minus, miRNA, circRNA


#### check for duplicate SRSId values

In [74]:
# report SRSId values that appear on more than one library row (replicates)
library = pd.read_csv(
    library_path,
    sep='\t',
    index_col=False,
    keep_default_na=False,                    # do not apply pandas' default list of NA strings
    na_values=['NULL','null', 'nan','NaN'],   # only these spellings are read as missing
    dtype=object,
)
dup_check(library, "SRSId")

no duplicate values in SRSId


### STOP - manual annotation time

### annotation complete - check files and QA

In [63]:
# Reload the per-experiment library and experiment tables from disk.
# Only the four listed spellings are read as missing values; every column is kept as object dtype.
library_file = pd.read_csv(library_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN'], dtype=object)
experiment_to_add = pd.read_csv(experiment_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN'], dtype=object)
# choose columns from library file that are needed in main file
# .copy() makes this an independent frame, so the later .loc assignments to
# annotatorId / lastModificationDate do not trigger SettingWithCopyWarning
library_to_add = library_file[library_cols].copy()

In [64]:
display_df(library_to_add)

Unnamed: 0,#libraryId,experimentId,platform,SRSId,anatId,anatName,stageId,stageName,url_GSM,infoOrgan,infoStage,anatAnnotationStatus,anatBiologicalStatus,stageAnnotationStatus,sex,strain,genotype,speciesId,protocol,protocolType,RNASelection,globin_reduction,replicate,lib_name,sampleName,sampleAge_value,sampleAge_unit,PATOid,PATOname,comment,condition,physiologicalStatus,annotatorId,lastModificationDate
0,SRX4581413,SRP158448,HiSeq X Ten,SRS3695739,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,M,Landrace,,9823,,full_length,ribo-minus,,,LD0_1,"SAMN09840123,GSM4751460",0,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
1,SRX4581416,SRP158448,HiSeq X Ten,SRS3695744,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,F,Landrace,,9823,,full_length,ribo-minus,,,LD0_2,"SAMN09840124,GSM4751461",0,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
2,SRX4581415,SRP158448,HiSeq X Ten,SRS3695742,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,M,Landrace,,9823,,full_length,ribo-minus,,,LD0_3,"SAMN09840125,GSM4751462",0,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
3,SRX4581364,SRP158448,HiSeq X Ten,SRS3695691,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000046,14-week-old stage,,"Skeletal muscle, longissimus dorsi",D100,perfect match,not documented,missing child term,F,Landrace,,9823,,full_length,ribo-minus,,,LD100_1,"SAMN09840144,GSM4751481",100,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
4,SRX4581401,SRP158448,HiSeq X Ten,SRS3695728,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000046,14-week-old stage,,"Skeletal muscle, longissimus dorsi",D100,perfect match,not documented,missing child term,F,Landrace,,9823,,full_length,ribo-minus,,,LD100_2,"SAMN09840145,GSM4751482",100,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
5,SRX4581400,SRP158448,HiSeq X Ten,SRS3695727,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000046,14-week-old stage,,"Skeletal muscle, longissimus dorsi",D100,perfect match,not documented,missing child term,F,Landrace,,9823,,full_length,ribo-minus,,,LD100_3,"SAMN09840146,GSM4751483",100,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
6,SRX4581399,SRP158448,HiSeq X Ten,SRS3695726,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000050,17-week-old stage,,"Skeletal muscle, longissimus dorsi",D120,perfect match,not documented,missing child term,M,Landrace,,9823,,full_length,ribo-minus,,,LD120_1,"SAMN09840147,GSM4751484",120,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
7,SRX4581365,SRP158448,HiSeq X Ten,SRS3695692,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000050,17-week-old stage,,"Skeletal muscle, longissimus dorsi",D120,perfect match,not documented,missing child term,M,Landrace,,9823,,full_length,ribo-minus,,,LD120_2,"SAMN09840148,GSM4751485",120,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
8,SRX4581366,SRP158448,HiSeq X Ten,SRS3695693,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000050,17-week-old stage,,"Skeletal muscle, longissimus dorsi",D120,perfect match,not documented,missing child term,M,Landrace,,9823,,full_length,ribo-minus,,,LD120_3,"SAMN09840149,GSM4751486",120,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6
9,SRX4581367,SRP158448,HiSeq X Ten,SRS3695694,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000053,20-week-old stage,,"Skeletal muscle, longissimus dorsi",D140,perfect match,not documented,missing child term,M,Landrace,,9823,,full_length,ribo-minus,,,LD140_1,"SAMN09840150,GSM4751487",140,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,,2024-11-6


In [68]:
# stamp every row with the annotator id and the modification date (both typed by hand)
library_to_add.loc[:, 'annotatorId'] = 'ANN'
library_to_add.loc[:, 'lastModificationDate'] = '2024-11-06'

# overwrite the per-experiment library file with the selected columns, quoting every field
library_to_add.to_csv(
    library_path,
    sep="\t",
    index=False,
    quoting=csv.QUOTE_ALL,
)

In [70]:
display_df(library_to_add)

Unnamed: 0,#libraryId,experimentId,platform,SRSId,anatId,anatName,stageId,stageName,url_GSM,infoOrgan,infoStage,anatAnnotationStatus,anatBiologicalStatus,stageAnnotationStatus,sex,strain,genotype,speciesId,protocol,protocolType,RNASelection,globin_reduction,replicate,lib_name,sampleName,sampleAge_value,sampleAge_unit,PATOid,PATOname,comment,condition,physiologicalStatus,annotatorId,lastModificationDate
0,SRX4581413,SRP158448,HiSeq X Ten,SRS3695739,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,M,Landrace,,9823,,full_length,ribo-minus,,,LD0_1,"SAMN09840123,GSM4751460",0,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
1,SRX4581416,SRP158448,HiSeq X Ten,SRS3695744,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,F,Landrace,,9823,,full_length,ribo-minus,,,LD0_2,"SAMN09840124,GSM4751461",0,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
2,SRX4581415,SRP158448,HiSeq X Ten,SRS3695742,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,M,Landrace,,9823,,full_length,ribo-minus,,,LD0_3,"SAMN09840125,GSM4751462",0,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
3,SRX4581364,SRP158448,HiSeq X Ten,SRS3695691,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000046,14-week-old stage,,"Skeletal muscle, longissimus dorsi",D100,perfect match,not documented,missing child term,F,Landrace,,9823,,full_length,ribo-minus,,,LD100_1,"SAMN09840144,GSM4751481",100,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
4,SRX4581401,SRP158448,HiSeq X Ten,SRS3695728,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000046,14-week-old stage,,"Skeletal muscle, longissimus dorsi",D100,perfect match,not documented,missing child term,F,Landrace,,9823,,full_length,ribo-minus,,,LD100_2,"SAMN09840145,GSM4751482",100,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
5,SRX4581400,SRP158448,HiSeq X Ten,SRS3695727,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000046,14-week-old stage,,"Skeletal muscle, longissimus dorsi",D100,perfect match,not documented,missing child term,F,Landrace,,9823,,full_length,ribo-minus,,,LD100_3,"SAMN09840146,GSM4751483",100,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
6,SRX4581399,SRP158448,HiSeq X Ten,SRS3695726,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000050,17-week-old stage,,"Skeletal muscle, longissimus dorsi",D120,perfect match,not documented,missing child term,M,Landrace,,9823,,full_length,ribo-minus,,,LD120_1,"SAMN09840147,GSM4751484",120,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
7,SRX4581365,SRP158448,HiSeq X Ten,SRS3695692,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000050,17-week-old stage,,"Skeletal muscle, longissimus dorsi",D120,perfect match,not documented,missing child term,M,Landrace,,9823,,full_length,ribo-minus,,,LD120_2,"SAMN09840148,GSM4751485",120,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
8,SRX4581366,SRP158448,HiSeq X Ten,SRS3695693,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000050,17-week-old stage,,"Skeletal muscle, longissimus dorsi",D120,perfect match,not documented,missing child term,M,Landrace,,9823,,full_length,ribo-minus,,,LD120_3,"SAMN09840149,GSM4751486",120,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06
9,SRX4581367,SRP158448,HiSeq X Ten,SRS3695694,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000053,20-week-old stage,,"Skeletal muscle, longissimus dorsi",D140,perfect match,not documented,missing child term,M,Landrace,,9823,,full_length,ribo-minus,,,LD140_1,"SAMN09840150,GSM4751487",140,day,,,"PMID:33434283, The skeletal muscle (longissimus dorsi) samples were collected from Landrace pigs at 27 developmental stages, including embryonic days 33, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100 and 105 (abbreviated as E33, E40, E45, E50, E55, E60, E65, E70, E75, E80, E85, E90, E95, E100 and E105) and postnatal days 0, 9, 20, 30, 40, 60, 80, 100, 120, 140, 160 and 180 (abbreviated as D0, D9, D20, D30, D40, D60, D80, D100, D120, D140, D160 and D180)",,,ANN,2024-11-06


#### more QA can be added here - this is minimum to check (columns match, view the file that will be created)

In [71]:
! git pull

Already up to date.


In [39]:
# main library table from the git checkout, read with the same options as the per-experiment files
git_library = pd.read_csv(
    git_library_path,
    sep='\t',
    index_col=False,
    keep_default_na=False,
    na_values=['NULL','null', 'nan','NaN'],
    dtype=object,
)
# main experiment table from the git checkout
git_experiment = pd.read_csv(
    git_experiment_path,
    sep='\t',
    index_col=False,
    keep_default_na=False,
    na_values=['NULL','null', 'nan','NaN'],
    dtype=object,
)

In [72]:
def _report_column_match(label, new_table, main_table):
    """Print whether ``new_table`` and ``main_table`` have the same column names.

    Column order is ignored (names are compared as sets). On a mismatch the
    columns missing on either side are listed so they can be fixed directly.
    ``label`` is the word used in the messages ('library' or 'experiment').
    """
    new_columns = set(new_table.columns)
    main_columns = set(main_table.columns)
    if new_columns == main_columns:
        print('The columns in the {0} file to append match the columns in the main {0} file'.format(label))
        return
    print('The columns in the {0} file to append DO NOT MATCH the columns in the main {0} file'.format(label))
    print('    missing from the {} file to append: {}'.format(label, sorted(main_columns - new_columns)))
    print('    not present in the main {} file: {}'.format(label, sorted(new_columns - main_columns)))


# library file
_report_column_match('library', library_to_add, git_library)

# experiment file
_report_column_match('experiment', experiment_to_add, git_experiment)

# TODO: maybe make the messages something more like "COLUMNS GOOD - LIBRARY" and "COLUMNS BAD - EXPERIMENT"

The columns in the library file to append match the columns in the main library file
The columns in the experiment file to append match the columns in the main experiment file


#### check files before adding

In [75]:
# Append the new library rows below the main library table (columns are matched by name).
library_git_plus_new = pd.concat([git_library, library_to_add], ignore_index = True, sort = False)

# Guard against appending libraries that are already in the main file,
# e.g. when the notebook is run a second time for the same experiment.
already_present = library_to_add.loc[library_to_add['#libraryId'].isin(git_library['#libraryId']), '#libraryId']
if not already_present.empty:
    print('WARNING - libraries already present in the main library file: ' + ', '.join(already_present.astype(str)))

# Preview the seam: the last two existing rows followed by the first five new ones.
old_length = git_library.shape[0]
start = max(old_length - 2, 0)  # a negative start would count from the end of the table
end = old_length + 5
view_lib = library_git_plus_new.iloc[start:end]
view_lib

Unnamed: 0,#libraryId,experimentId,platform,SRSId,anatId,anatName,stageId,stageName,url_GSM,infoOrgan,infoStage,anatAnnotationStatus,anatBiologicalStatus,stageAnnotationStatus,sex,strain,genotype,speciesId,protocol,protocolType,RNASelection,globin_reduction,replicate,lib_name,sampleName,sampleAge_value,sampleAge_unit,PATOid,PATOname,comment,condition,physiologicalStatus,annotatorId,lastModificationDate
42494,SRX8098088,SRP256022,Illumina HiSeq X Ten,SRS6464579,UBERON:0002190,subcutaneous adipose tissue,SscrDv:0000005,juvenile stage,,Subcutaneous Adipose Tissue,juvenile,perfect match,not documented,perfect match,F,Large White,,9823,NEBNext Ultra RNA library Prep Kit,full_length,polyA,,,Y30-2SA-Rep2,SAMN14401436,30,day,,,PMID: 33825319 [Bgee curator notes: not same i...,,,ANN,2024-11-05
42495,SRX8098089,SRP256022,Illumina HiSeq X Ten,SRS6464580,UBERON:0002190,subcutaneous adipose tissue,SscrDv:0000005,juvenile stage,,Subcutaneous Adipose Tissue,juvenile,perfect match,not documented,perfect match,F,Large White,,9823,NEBNext Ultra RNA library Prep Kit,full_length,polyA,,,Y30-3SA-Rep3,SAMN14401437,30,day,,,PMID: 33825319 [Bgee curator notes: not same i...,,,ANN,2024-11-05
42496,SRX4581413,SRP158448,HiSeq X Ten,SRS3695739,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,M,Landrace,,9823,,full_length,ribo-minus,,,LD0_1,"SAMN09840123,GSM4751460",0,day,,,"PMID:33434283, The skeletal muscle (longissimu...",,,ANN,2024-11-06
42497,SRX4581416,SRP158448,HiSeq X Ten,SRS3695744,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,F,Landrace,,9823,,full_length,ribo-minus,,,LD0_2,"SAMN09840124,GSM4751461",0,day,,,"PMID:33434283, The skeletal muscle (longissimu...",,,ANN,2024-11-06
42498,SRX4581415,SRP158448,HiSeq X Ten,SRS3695742,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000072,0-day-old stage,,"Skeletal muscle, longissimus dorsi",D0,perfect match,not documented,perfect match,M,Landrace,,9823,,full_length,ribo-minus,,,LD0_3,"SAMN09840125,GSM4751462",0,day,,,"PMID:33434283, The skeletal muscle (longissimu...",,,ANN,2024-11-06
42499,SRX4581364,SRP158448,HiSeq X Ten,SRS3695691,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000046,14-week-old stage,,"Skeletal muscle, longissimus dorsi",D100,perfect match,not documented,missing child term,F,Landrace,,9823,,full_length,ribo-minus,,,LD100_1,"SAMN09840144,GSM4751481",100,day,,,"PMID:33434283, The skeletal muscle (longissimu...",,,ANN,2024-11-06
42500,SRX4581401,SRP158448,HiSeq X Ten,SRS3695728,UBERON:0001401,longissimus thoracis muscle,SscrDv:0000046,14-week-old stage,,"Skeletal muscle, longissimus dorsi",D100,perfect match,not documented,missing child term,F,Landrace,,9823,,full_length,ribo-minus,,,LD100_2,"SAMN09840145,GSM4751482",100,day,,,"PMID:33434283, The skeletal muscle (longissimu...",,,ANN,2024-11-06


In [None]:
# append the new experiment table below the main experiment table and show the last three rows
experiment_git_plus_new = pd.concat(
    [git_experiment, experiment_to_add],
    ignore_index=True,
    sort=False,
)
experiment_git_plus_new.tail(3)

### add to git

#### stop here - if you make manual changes to your file, you need to start again at "annotation complete - check files and QA" - the next few steps will commit and push to the git remote

In [76]:
! git pull

Already up to date.


### DON'T FORGET EXPERIMENT FILE BEFORE RUNNING NEXT CELL

In [77]:
# Resolve both merged tables BEFORE writing anything: if one of the concat cells
# was skipped, the NameError is raised here and no main file has been touched yet.
pending_writes = [
    (library_git_plus_new, git_library_path),
    (experiment_git_plus_new, git_experiment_path),
]

# Overwrite each main file with every field quoted, then immediately let
# update_format strip the quotes from empty fields, so a file is never left
# half-processed if the other write fails.
for merged_table, main_file_path in pending_writes:
    merged_table.to_csv(main_file_path, sep="\t", index=False, quoting=csv.QUOTE_ALL)
    update_format(main_file_path)

In [83]:
! git status

On branch develop
Your branch is up to date with 'origin/develop'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mmodified:   ../../RNA_Seq/RNASeqExperiment.tsv[m
	[32mmodified:   ../../RNA_Seq/RNASeqLibrary.tsv[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mGSE157044_family.soft.gz[m



In [79]:
# Stage the annotation folder; this stages every change under that path, not only the two main TSV files
! git add $path_to_git_annotations

In [80]:
experiment_id

'SRP158448'

In [84]:
# The commit message is typed by hand here; commit_message_exp from the configuration cell is not used
! git commit -m "annotation of bulk pig experiment SRP158448"

[develop 8052c6c] annotation of bulk pig experiment SRP158448
 2 files changed, 82 insertions(+), 7 deletions(-)


In [85]:
! git push

Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 4 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 3.99 KiB | 8.00 KiB/s, done.
Total 5 (delta 4), reused 0 (delta 0)
remote: 
remote: 
remote:    Hello everyone, in order to improve security, we will permanently[K
remote:    enable the 2FA feature for all GitLab accounts starting in January[K
remote:       2025. We kindly ask you to enable 2FA on your GitLab account[K
remote:      yourself as soon as possible before we proceed with the global[K
remote:                        activation (instructions:[K
remote:   https://www.youtube.com/watch?v=YWeGgGLjhqk&t=142s). We rely on you[K
remote:        not to wait for the global activation. Thank you for your[K
remote:              cooperation. For any questions, please contact[K
remote:                          it-support@sib.swiss.[K
remote: 
remote: 
remote: 
remote: To create a merge request for develop, vis