# Mitotic Exit Dataset

See [Time-resolved single-cell sequencing identifies multiple waves of mRNA decay during the mitosis-to-G1 phase transition](https://elifesciences.org/articles/71356)

In [1]:
import anndata as ad
import pandas as pd
import numpy as np
import os
import pickle

## Load the data

In [2]:
rawdatapath = '/nfs/turbo/umms-indikar/shared/projects/public_data/time_series_RNA/mitoticExit/'
files = ['elife-71356-data1-v2.csv',
         'elife-71356-data3-v2.csv',
         'elife-71356-data2-v2.csv',
         'elife-71356-data4-v2.csv']

In [3]:
plateIds = ['plate_1', 'plate_2', 'plate_3']
counts = {}
for i, file in enumerate(files):
    df = pd.read_csv(os.path.join(rawdatapath, file))
    if i < 3:
        df = df.set_index('GENEID')
        counts[plateIds[i]] = df
    else:
        metadata = df
        metadata = metadata.set_index('Cell_Id')
        metadata = metadata.T

  df = pd.read_csv(os.path.join(rawdatapath, file))


In [4]:
metadata

Cell_Id,Plate_Id,Well_Id,Fucci_g1,Fucci_g2,Cell cycle phase,Cell cycle time
1,plate_1,68,74,23403,G2/M,0
2,plate_1,56,49,23709,G2/M,0
3,plate_1,175,955,74,G1,56.55108562
4,plate_1,53,3,20775,G2/M,0
5,plate_1,130,987,146,G1,57.79340672
...,...,...,...,...,...,...
837,plate_3,366,1504,-13,G1,309.2371724
838,plate_3,251,1216,-17,G1,267.2770725
839,plate_3,200,1177,-62,G1,261.4856624
840,plate_3,230,2483,-48,G1,454.8880935


In [5]:
counts['plate_1']

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,375,376,377,378,379,380,381,382,383,384
GENEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG-AS1__chr19,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
A1BG__chr19,0.00,0.00,0.00,0.00,0.0,0.00,1.00,1.00,0.00,0.0,...,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
A1CF__chr10,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
A2M-AS1__chr12,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
A2ML1__chr12,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A__chr1,0.00,0.00,0.00,0.00,0.0,3.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0
ZYG11B__chr1,0.00,3.00,1.00,3.00,0.0,1.00,1.00,2.00,4.00,0.0,...,4.00,5.00,0.0,0.0,0.00,0.00,0.0,1.0,0.0,0.0
ZYX__chr7,14.02,11.01,37.16,10.01,2.0,35.15,33.13,9.01,69.58,3.0,...,61.45,68.56,4.0,2.0,32.12,14.02,0.0,6.0,1.0,0.0
ZZEF1__chr17,1.00,0.00,3.00,1.00,0.0,1.00,1.00,0.00,1.00,0.0,...,1.00,1.00,0.0,0.0,0.00,1.00,0.0,1.0,0.0,0.0


## Merge the data

In [6]:
singleCells = pd.DataFrame()
for i in range(metadata.shape[0]):  # Iterate over rows, not columns
    plate = metadata['Plate_Id'][i]
    well  = int(metadata['Well_Id'][i]) - 1
    df = counts[plate]
    if singleCells.empty:  # Initialize singleCells with the indices of the first df
        singleCells = pd.DataFrame(index=df.index)
    
    # Reindex df to match the indices of singleCells
    df_reindexed = df.reindex(singleCells.index)
    
    # Add the new column to singleCells
    singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]



  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_reindexed.iloc[:, well]
  singleCells[f'{plate}_Well{well+1}'] = df_rein

## Pack as an Ann Data

In [7]:
metadata['Index'] = singleCells.columns
metadata = metadata.set_index('Index')

adata = ad.AnnData(X=singleCells.T, obs=metadata)

# Print AnnData object summary
print(adata)


AnnData object with n_obs × n_vars = 841 × 15957
    obs: 'Plate_Id', 'Well_Id', 'Fucci_g1', 'Fucci_g2', 'Cell cycle phase', 'Cell cycle time'


In [8]:
adata.obs

Cell_Id,Plate_Id,Well_Id,Fucci_g1,Fucci_g2,Cell cycle phase,Cell cycle time
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
plate_1_Well68,plate_1,68,74,23403,G2/M,0
plate_1_Well56,plate_1,56,49,23709,G2/M,0
plate_1_Well175,plate_1,175,955,74,G1,56.55108562
plate_1_Well53,plate_1,53,3,20775,G2/M,0
plate_1_Well130,plate_1,130,987,146,G1,57.79340672
...,...,...,...,...,...,...
plate_3_Well366,plate_3,366,1504,-13,G1,309.2371724
plate_3_Well251,plate_3,251,1216,-17,G1,267.2770725
plate_3_Well200,plate_3,200,1177,-62,G1,261.4856624
plate_3_Well230,plate_3,230,2483,-48,G1,454.8880935


In [9]:
adata.var.index

Index(['A1BG-AS1__chr19 ', 'A1BG__chr19 ', 'A1CF__chr10 ', 'A2M-AS1__chr12 ',
       'A2ML1__chr12 ', 'A2M__chr12 ', 'A4GALT__chr22 ', 'AAAS__chr12 ',
       'AACSP1__chr5 ', 'AACS__chr12 ',
       ...
       'ZWILCH__chr15 ', 'ZWINT__chr10 ', 'ZXDA__chrX ', 'ZXDB__chrX ',
       'ZXDC__chr3 ', 'ZYG11A__chr1 ', 'ZYG11B__chr1 ', 'ZYX__chr7 ',
       'ZZEF1__chr17 ', 'ZZZ3__chr1 '],
      dtype='object', name='GENEID', length=15957)

## Build Vars DataFrame

In [10]:
pickle_file = '/nfs/turbo/umms-indikar/shared/projects/geneformer/gene_names.pkl'

# Load the data from the pickle file
with open(pickle_file, 'rb') as f:
    gene_names = pickle.load(f)
genelist = list(adata.var.index.values)
for i, gene in enumerate(genelist):
    genename = gene.split('_')[0]
    genelist[i] = genename

In [11]:
ensembl_ids = []
for gene in list(adata.var.index.values):
    genename = gene.split('_')[0]
    if genename in gene_names:
        ensg = gene_names[genename]
    else:
        ensg = None
    ensembl_ids.append(ensg)
adata.var['ENSEMBLID'] = ensembl_ids

In [12]:
adata.var['gene_name_raw'] = adata.var.index
adata.var


Unnamed: 0_level_0,ENSEMBLID,gene_name_raw
GENEID,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG-AS1__chr19,,A1BG-AS1__chr19
A1BG__chr19,ENSG00000121410,A1BG__chr19
A1CF__chr10,ENSG00000148584,A1CF__chr10
A2M-AS1__chr12,,A2M-AS1__chr12
A2ML1__chr12,ENSG00000166535,A2ML1__chr12
...,...,...
ZYG11A__chr1,ENSG00000203995,ZYG11A__chr1
ZYG11B__chr1,ENSG00000162378,ZYG11B__chr1
ZYX__chr7,ENSG00000159840,ZYX__chr7
ZZEF1__chr17,ENSG00000074755,ZZEF1__chr17


In [13]:
none_count = ensembl_ids.count(None)

# Print the count
print(f"Percent of missing ensembl_ids: {none_count/len(ensembl_ids)}")


Percent of missing ensembl_ids: 0.1829917904367989


In [14]:
gene_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/token_mappings.csv"
genes = pd.read_csv(gene_path)
genes

Unnamed: 0,gene_id,token_id,gene_name,nonzero_median,gene_version,gene_biotype,Chromosome,Start,End,scenic_tf
0,<pad>,0,,,,,,,,False
1,<mask>,1,,,,,,,,False
2,ENSG00000000003,2,TSPAN6,2.001186,15.0,protein_coding,X,100627107.0,100639991.0,False
3,ENSG00000000005,3,TNMD,3.228213,6.0,protein_coding,X,100584935.0,100599885.0,False
4,ENSG00000000419,4,DPM1,2.218874,14.0,protein_coding,20,50934866.0,50959140.0,False
...,...,...,...,...,...,...,...,...,...,...
25421,ENSGR0000197976,25421,,2.229061,,,,,,False
25422,ENSGR0000198223,25422,,1.900695,,,,,,False
25423,ENSGR0000205755,25423,,1.460207,,,,,,False
25424,ENSGR0000214717,25424,,2.085950,,,,,,False


In [15]:
var = pd.merge(adata.var, genes, how='left',
               left_on='ENSEMBLID',
               right_on='gene_id',
              )
var = var.set_index('gene_name_raw')
var['ensembl_id'] = var['gene_id']
var = var.drop('ENSEMBLID', axis=1)
var['ref'] = 'Time-resolved single-cell sequencing identifies multiple waves of mRNA decay during the mitosis-to-G1 phase transition'
var

Unnamed: 0_level_0,gene_id,token_id,gene_name,nonzero_median,gene_version,gene_biotype,Chromosome,Start,End,scenic_tf,ensembl_id,ref
gene_name_raw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A1BG-AS1__chr19,,,,,,,,,,,,Time-resolved single-cell sequencing identifie...
A1BG__chr19,ENSG00000121410,5150.0,A1BG,1.901471,12.0,protein_coding,19,58345177.0,58353492.0,False,ENSG00000121410,Time-resolved single-cell sequencing identifie...
A1CF__chr10,ENSG00000148584,9064.0,A1CF,5.672083,15.0,protein_coding,10,50799408.0,50885675.0,False,ENSG00000148584,Time-resolved single-cell sequencing identifie...
A2M-AS1__chr12,,,,,,,,,,,,Time-resolved single-cell sequencing identifie...
A2ML1__chr12,ENSG00000166535,11812.0,A2ML1,3.648021,20.0,protein_coding,12,8822620.0,8887001.0,False,ENSG00000166535,Time-resolved single-cell sequencing identifie...
...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A__chr1,ENSG00000203995,17515.0,ZYG11A,2.035365,10.0,protein_coding,1,52842510.0,52894998.0,False,ENSG00000203995,Time-resolved single-cell sequencing identifie...
ZYG11B__chr1,ENSG00000162378,10655.0,ZYG11B,3.338822,13.0,protein_coding,1,52726452.0,52827336.0,False,ENSG00000162378,Time-resolved single-cell sequencing identifie...
ZYX__chr7,ENSG00000159840,10336.0,ZYX,2.862519,16.0,protein_coding,7,143381294.0,143391111.0,False,ENSG00000159840,Time-resolved single-cell sequencing identifie...
ZZEF1__chr17,ENSG00000074755,1277.0,ZZEF1,3.290736,15.0,protein_coding,17,4004444.0,4143030.0,False,ENSG00000074755,Time-resolved single-cell sequencing identifie...


In [16]:
adata.var = var

In [19]:
adata

AnnData object with n_obs × n_vars = 841 × 15957
    obs: 'Plate_Id', 'Well_Id', 'Fucci_g1', 'Fucci_g2', 'Cell cycle phase', 'Cell cycle time'
    var: 'gene_id', 'token_id', 'gene_name', 'nonzero_median', 'gene_version', 'gene_biotype', 'Chromosome', 'Start', 'End', 'scenic_tf', 'ensembl_id', 'ref'

In [43]:
file_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/mitoticExit.h5ad"

# Write the AnnData object to the specified file
adata.var = adata.var.astype(str)
adata.var = adata.var.applymap(str)
adata.obs = adata.obs.astype(str)
adata.obs = adata.obs.applymap(str)

adata.X = adata.X.astype(np.float32)
adata.X

adata.write(file_path)


In [42]:
adata.X = adata.X.astype(np.float32)
adata.X

array([[  0.  ,   0.  ,   0.  , ..., 116.63,   1.  ,   8.01],
       [  0.  ,   0.  ,   1.  , ..., 221.88,   1.  ,   5.  ],
       [  0.  ,   1.  ,   0.  , ...,  93.04,   0.  ,   6.  ],
       ...,
       [   nan,    nan,    nan, ...,    nan,    nan,    nan],
       [   nan,    nan,    nan, ...,    nan,    nan,    nan],
       [   nan,    nan,    nan, ...,    nan,    nan,    nan]],
      dtype=float32)

In [32]:
adata.X = np.nan_to_num(adata.X, nan=-1)

In [33]:
adata.X

array([[0.0, 0.0, 0.0, ..., 116.63, 1.0, 8.01],
       [0.0, 0.0, 1.0, ..., 221.88, 1.0, 5.0],
       [0.0, 1.0, 0.0, ..., 93.04, 0.0, 6.0],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)