## Basic Bulk Template - manual annotation

This is an incredibly basic bulk annotation template.

automated parts:
* running create experiment script
* adding new annotations (for library and experiment file) to master files in git
* pushing to git

manual parts:
* all annotation!!!

More automated parts can be added if you want, but I thought we could start here.

In [None]:
# Example experiment IDs: SRP427903 or SRP432863
## Fill these in for every experiment (path_to_output could be partly
## automated, e.g. a default folder with the experiment_id appended).
experiment_id = ""
path_to_output = ""
## Date format: YYYY-MM-DD
my_date = "2024-"


## Set these once.
## (path_to_create_exp_script could also be a path to the scripts folder,
## but that is rather unnecessary for bulk.)
path_to_create_exp_script = ""
path_to_git_annotations = ""

## Derived values - identical for every bulk experiment.
## The two path prefixes are concatenated as-is in front of the file names.
experiment_type = "bulk"
library_path = f"{path_to_output}RNASeqLibrary_{experiment_id}.tsv"
experiment_path = f"{path_to_output}RNASeqExperiment_{experiment_id}.tsv"
git_library_path = f"{path_to_git_annotations}RNASeqLibrary.tsv"
git_experiment_path = f"{path_to_git_annotations}RNASeqExperiment.tsv"

## The double quotes are part of the value: the message is later pasted
## into a shell command line.
commit_message_exp = f'"adding annotated bulk experiment {experiment_id}"'

## Columns selected from the per-experiment library file, in order.
library_cols = [
    '#libraryId',
    'experimentId',
    'platform',
    'SRSId',
    'anatId',
    'anatName',
    'stageId',
    'stageName',
    'url_GSM',
    'infoOrgan',
    'infoStage',
    'anatAnnotationStatus',
    'anatBiologicalStatus',
    'stageAnnotationStatus',
    'sex',
    'strain',
    'genotype',
    'speciesId',
    'protocol',
    'protocolType',
    'RNASelection',
    'globin_reduction',
    'replicate',
    'lib_name',
    'sampleName',
    'sampleAge_value',
    'sampleAge_unit',
    'PATOid',
    'PATOname',
    'comment',
    'condition',
    'physiologicalStatus',
    'annotatorId',
    'lastModificationDate',
]


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import os
import csv
pd.set_option('display.max_columns', 500)

# displays df with the scrollbar next to the DataFrame
def display_df(df):
    """Show *df* as an HTML table inside a 300px-high scrollable box.

    Side effect: sets the global pandas options ``display.max_rows`` and
    ``display.max_columns`` to ``None``.
    """
    for option_name in ("display.max_rows", "display.max_columns"):
        pd.set_option(option_name, None)
    opening_tag = "<div style='height: 300px; overflow: auto; width: fit-content'>"
    table_markup = df.style.to_html(index=False)
    display(HTML(opening_tag + table_markup + "</div>"))

# fixes formatting of file to match libreoffice settings/historic file format
def update_format(path):
    with open(path, 'r') as file:
        filedata = file.read()
    # Replace the target string
    filedata = filedata.replace("\t\"\"", "\t")
    # Write the file out again
    with open(path, 'w') as file:
        file.write(filedata)

# checks for duplicate values in a specific column and prints those values + the corresponding library id
def dup_check(df, column):
    """Print the rows of *df* whose value in *column* occurs more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. A ``#libraryId`` column is needed to report
        duplicates.
    column : str
        Name of the column to check for repeated values.

    Prints ``no duplicate values in <column>`` when every value is unique.
    Otherwise prints the ``#libraryId`` of every affected row, together with
    the repeated value (sorted by that value) unless *column* is
    ``#libraryId`` itself.
    """
    # keep=False flags every member of a duplicate group, not only the repeats.
    is_duplicated = df.duplicated(subset=[column], keep=False)
    if not is_duplicated.any():
        print("no duplicate values in " + column)
        return

    # The mask is left in the frame's own row order: sorting it would misalign
    # its index with df and make pandas re-align it (with a UserWarning).
    if column == '#libraryId':
        print(df.loc[is_duplicated, ['#libraryId']])
    else:
        print(df.loc[is_duplicated, ['#libraryId', column]].sort_values(by=column))

### script - create experiment/library files 

In [None]:
# Run the create-experiment script through the IPython shell escape; IPython
# substitutes the $-prefixed Python variables into the command line first.
# NOTE(review): the values are inserted unquoted, so an empty value or a path
# containing spaces changes the argument list the script receives.
! python3 $path_to_create_exp_script $experiment_id $path_to_output $experiment_type

### can add automation to annotation here

This is where I could write code to make things easier, but the details still need to be discussed.

In [None]:
# Fill the annotatorId column of the library file with the annotator's
# initials ('ANN' here) on every row, then write the file back in place.
library = pd.read_csv(
    library_path,
    sep='\t',
    index_col=False,
    keep_default_na=False,
    na_values=['NULL', 'null', 'nan', 'NaN'],
    dtype=object,
)
library.loc[:, 'annotatorId'] = 'ANN'
library.to_csv(
    library_path,
    sep="\t",
    index=False,
    quoting=csv.QUOTE_ALL,
)

### annotation complete - check files and QA

In [None]:
# Load the per-experiment library and experiment files from disk. Every column
# is read as text (dtype=object); only the listed NULL/NaN spellings are
# treated as missing values.
library_file = pd.read_csv(library_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN'], dtype=object)
experiment_to_add = pd.read_csv(experiment_path, sep='\t', index_col=False, keep_default_na=False, na_values=['NULL','null', 'nan','NaN'], dtype=object)
# Choose the columns from the library file that are needed in the main file.
# The explicit .copy() makes library_to_add an independent frame, so column
# assignments on it neither touch library_file nor trigger pandas'
# SettingWithCopyWarning.
library_to_add = library_file[library_cols].copy()

In [None]:
# Set lastModificationDate to my_date on every row that will be appended.
library_to_add.loc[:,'lastModificationDate'] = my_date

#### more QA can be added here - this is minimum to check (columns match, view the file that will be created)

#### check files before adding

In [None]:
# Check for duplicate SRSId values (replicates). The check runs on
# library_to_add - the rows re-read from disk that will actually be appended -
# rather than on the `library` frame that was loaded before annotation.
dup_check(library_to_add, "SRSId")

In [None]:
# Bring the local checkout up to date before the main annotation files are read.
# NOTE(review): git runs in the notebook's working directory - presumably that
# is inside the annotations repository; confirm.
! git pull

In [None]:
# Read the two main annotation files. Both use the same parsing rules:
# tab-separated, no index column, every value kept as text, and only the
# listed NULL/NaN spellings treated as missing.
main_file_read_options = dict(
    sep='\t',
    index_col=False,
    keep_default_na=False,
    na_values=['NULL', 'null', 'nan', 'NaN'],
    dtype=object,
)
git_library = pd.read_csv(git_library_path, **main_file_read_options)
git_experiment = pd.read_csv(git_experiment_path, **main_file_read_options)

In [None]:
# Column names are compared as sets, so column order is ignored.

# library file
library_columns_match = set(library_to_add.columns) == set(git_library.columns)
print(
    'The columns in the library file to append match the columns in the main library file'
    if library_columns_match
    else 'The columns in the library file to append DO NOT MATCH the columns in the main library file'
)

# experiment file
experiment_columns_match = set(experiment_to_add.columns) == set(git_experiment.columns)
print(
    'The columns in the experiment file to append match the columns in the main experiment file'
    if experiment_columns_match
    else 'The columns in the experiment file to append DO NOT MATCH the columns in the main experiment file'
)


# TODO: maybe shorten the messages to something like "COLUMNS GOOD - LIBRARY" and "COLUMNS BAD - EXPERIMENT"

In [None]:
# Append the new library rows below the rows of the main library file;
# ignore_index=True renumbers the combined rows from 0.
library_git_plus_new = pd.concat([git_library, library_to_add], ignore_index = True, sort = False)
## ideally set n to larger than the number of rows you are adding
# The last rows are shown as the cell output so the appended rows can be inspected.
library_git_plus_new.tail(n = 20)

In [None]:
# Append the new experiment row(s) below the rows of the main experiment file;
# ignore_index=True renumbers the combined rows from 0.
experiment_git_plus_new = pd.concat([git_experiment, experiment_to_add], ignore_index = True, sort = False)
# tail() with no argument shows the last 5 rows as the cell output.
experiment_git_plus_new.tail()

### add to git

#### Stop here - if you make manual changes to your file, you need to start again at the "annotation complete - check files and QA" section. The next few steps will add your annotations to GitHub.

In [None]:
# Pull again right before the main files are overwritten.
# NOTE(review): the combined frames were built from the main files as read
# earlier in the notebook. If this pull changes those files, re-run from the
# cell that reads them, otherwise the write below replaces the pulled changes.
! git pull

In [None]:
# Overwrite the two main annotation files with the combined tables (every
# field quoted), then pass each file through update_format.
combined_outputs = (
    (library_git_plus_new, git_library_path),
    (experiment_git_plus_new, git_experiment_path),
)
for combined_frame, destination in combined_outputs:
    combined_frame.to_csv(destination, sep="\t", index=False, quoting=csv.QUOTE_ALL)
for _, destination in combined_outputs:
    update_format(destination)

In [None]:
# Show the working-tree state so the changed files can be reviewed before staging.
! git status

In [None]:
# Stage only the two main annotation files.
# NOTE(review): the paths are inserted unquoted, so a path containing spaces
# would be split into several arguments.
! git add $git_experiment_path $git_library_path

In [None]:
# manual message
! git commit -m "add message here (could even automate this if you want)"
# automated message
! git commit -m $commit_message_exp

In [None]:
# Publish the new commit to the remote repository.
! git push