# JPIC Data Engineering

In this file Joshua changes a file made by cooper to standardize the time point and replicate naming from files made by cooper

In [1]:
import pandas as pd
import scanpy as sc
import numpy as np
import anndata as an
import re

In [2]:
coopers_data_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/rajapakse_lab_data.h5ad"
ad = an.read(coopers_data_path)



In [4]:
def extract_timepoint_replicate_2015(data_id):
    match = re.match(r"S(\d+)([ab])", data_id)
    if match:
        time = int(match.group(1))
        replicate = 1 if match.group(2) == 'a' else 2
        return time, replicate
    else:
        return None, None

def extract_timepoint_replicate_2018(data_id):
    match = re.match(r"(\d+)_T(\d+)R(\d+)", data_id)
    if match:
        return int(match.group(2)), int(match.group(3))
    else:
        return None, None


In [5]:
adDs5 = ad[ad.obs['dataset'] == 'chen_2015']
adDs8 = ad[ad.obs['dataset'] == 'liu_2018']

In [6]:
timepoint_replicate = adDs8.obs.index.to_series().apply(extract_timepoint_replicate_2018)
timepoint_replicate_df = timepoint_replicate.apply(pd.Series)
timepoint_replicate_df.columns = ['order', 'replicate']

# Add the new columns to the AnnData object
adDs8.obs = adDs8.obs.join(timepoint_replicate_df)
adDs8

AnnData object with n_obs × n_vars = 48 × 19393
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control', 'order', 'replicate'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [7]:
timepoint_replicate = adDs5.obs.index.to_series().apply(extract_timepoint_replicate_2015)
timepoint_replicate_df = timepoint_replicate.apply(pd.Series)
timepoint_replicate_df.columns = ['order', 'replicate']

# Add the new columns to the AnnData object
adDs5.obs = adDs5.obs.join(timepoint_replicate_df)
adDs5

AnnData object with n_obs × n_vars = 18 × 19393
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control', 'order', 'replicate'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [8]:
adDs5.obs

Unnamed: 0_level_0,dataset,sample_id,timepoint,hour,n_counts,control,order,replicate
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
S1a,chen_2015,S1a,0.0,0.0,7901832,True,1,1
S1b,chen_2015,S1b,0.0,0.0,8113329,True,1,2
S2a,chen_2015,S2a,0.0,0.0,9831046,False,2,1
S2b,chen_2015,S2b,0.0,0.0,10123271,False,2,2
S3a,chen_2015,S3a,1.0,8.0,10490839,False,3,1
S3b,chen_2015,S3b,1.0,8.0,10713844,False,3,2
S4a,chen_2015,S4a,2.0,16.0,9183324,False,4,1
S4b,chen_2015,S4b,2.0,16.0,9401913,False,4,2
S5a,chen_2015,S5a,3.0,24.0,9655719,False,5,1
S5b,chen_2015,S5b,3.0,24.0,9863515,False,5,2


In [9]:
adDs8.obs

Unnamed: 0_level_0,dataset,sample_id,timepoint,hour,n_counts,control,order,replicate
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
63246_T0R1,liu_2018,63246,1.0,-48.0,11940999,True,0,1
63252_T1R1,liu_2018,63252,2.0,0.0,18063509,False,1,1
63249_T2R1,liu_2018,63249,3.0,8.0,11031474,False,2,1
63261_T3R1,liu_2018,63261,1.0,16.0,16761043,False,3,1
63258_T4R1,liu_2018,63258,2.0,24.0,8244802,False,4,1
63255_T5R1,liu_2018,63255,3.0,32.0,10615057,False,5,1
63270_T6R1,liu_2018,63270,1.0,40.0,16486670,False,6,1
63267_T7R1,liu_2018,63267,2.0,48.0,10127547,False,7,1
63264_T8R1,liu_2018,63264,3.0,56.0,11231585,False,8,1
63279_T9R1,liu_2018,63279,1.0,64.0,10781978,False,9,1


In [10]:
adDs_combined = adDs5.concatenate(adDs8, join='outer', index_unique=None)

  adDs_combined = adDs5.concatenate(adDs8, join='outer', index_unique=None)


In [12]:
adDs_combined.obs

Unnamed: 0_level_0,dataset,sample_id,timepoint,hour,n_counts,control,order,replicate,batch
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
S1a,chen_2015,S1a,0.0,0.0,7901832,True,1,1,0
S1b,chen_2015,S1b,0.0,0.0,8113329,True,1,2,0
S2a,chen_2015,S2a,0.0,0.0,9831046,False,2,1,0
S2b,chen_2015,S2b,0.0,0.0,10123271,False,2,2,0
S3a,chen_2015,S3a,1.0,8.0,10490839,False,3,1,0
...,...,...,...,...,...,...,...,...,...
63275_T11R3,liu_2018,63275,3.0,80.0,13515971,False,11,3,1
63290_T12R3,liu_2018,63290,1.0,88.0,9522866,False,12,3,1
63287_T13R3,liu_2018,63287,2.0,96.0,12370157,False,13,3,1
63284_T14R3,liu_2018,63284,3.0,104.0,10970735,False,14,3,1


In [14]:
adDs_combined

AnnData object with n_obs × n_vars = 66 × 19393
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control', 'order', 'replicate', 'batch'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [13]:
# check that the size of the remerged ann data object is compatible with the size of Cooper's
ad

AnnData object with n_obs × n_vars = 66 × 19393
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [15]:
out_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/rajapakse_lab_data_jpic.h5ad"
adDs_combined.write(out_path)