<a href="https://colab.research.google.com/github/Aitslab/CellDeathCensus/blob/main/SupplementalData/SupplementalFile3_CellDeathCensus_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import pandas as pd
import datetime
import csv

# Date stamp (YYYYMMDD, local time) that is embedded in the name of every output file
today_date = datetime.date.today().strftime('%Y%m%d')

In [32]:
# Input files to merge: one path variable per source table.
# NOTE(review): assumes Google Drive is already mounted at /content/drive; the mount step is not part of this cell.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept because the paths are built from it.
dir = '/content/drive/MyDrive/'  # root folder that holds one sub-folder per database

path_apocan = dir + 'ApoCanD/ApoCanD_20240404.tsv'                                          # ApoCanD
path_db = dir + 'Deathbase/deathbase_20240404.tsv'                                          # Deathbase
path_ferr = dir + 'FerrDB_V2/FerrDB_V2_withduplicates_20240328.tsv'                         # FerrDb V2
path_ferr_ext = dir + 'FerrDB_V2/FerrDB_V2_extendeddata_20240328.tsv'                       # FerrDb V2 extended data
path_go = dir + 'GO/GO_withduplicates_20240404.tsv'                                         # GO
path_ipcd = dir + 'iPCD/iPCD_20240404.tsv'                                                  # iPCD
path_lnc = dir + 'LncPCD/LncPCD_withduplicates_20240404.tsv'                                # LncPCD
path_mcdb = dir + 'MCDB/MCDB_withduplicates_20240404.tsv'                                   # MCDB
path_ncrd = dir + 'ncRDeathDB_V2/ncRDeathDB_V2_withduplicates_20240328.tsv'                 # ncRDeathDB V2
path_ncrd_targets = dir + 'ncRDeathDB_V2/ncRDeathDB_V2_Targets_withduplicates_20240328.tsv' # ncRDeathDB V2 targets
path_rcd = dir + 'RCDMap/RCDMap_withduplicates_20240404.tsv'                                # RCD Map
path_up = dir + 'UniProt/UniProt_mergewithduplicates_20240404.tsv'                          # UniProt
path_yapo = dir + 'yApoptosis/yApoptosis_20240404.tsv'                                      # yApoptosis
path_xdeathdb = dir + 'XDeathDB/XDeathDB_withduplicates_20240404.tsv'                       # XDeathDB

# Database label for each input file, keyed by the file path
mappings = {
    path_apocan: 'ApoCanD',
    path_db: 'Deathbase',
    path_ferr: 'FerrDB_V2',
    path_ferr_ext: 'FerrDB_V2_extendeddata',
    path_go: 'GO (AmiGO 2)',
    path_ipcd: 'iPCD',
    path_lnc: 'LncPCD',
    path_mcdb: 'MCDB',
    path_ncrd: 'ncRDeathDB V2',
    path_ncrd_targets: 'ncRDeathDB V2 (Targets)',
    path_rcd: 'RCDMap',
    path_up: 'UniProt',
    path_yapo: 'yApoptosis',
    path_xdeathdb: 'XDeathDB',
}

# Files are processed in the insertion order of `mappings`
filelist = list(mappings)

# Filled with one dataframe per loaded file
df_list = []




# Read every input file, tag its rows with the source database and collect the frames
for path in filelist:
    db_name = mappings[path]

    # All columns are read as strings; the source label is stored in a 'Database' column
    df = pd.read_csv(path, sep='\t', dtype=str).assign(Database=db_name)

    # Preview: label, column index, column count, blank line, first five rows, divider
    print(
        db_name,
        df.columns,
        len(df.columns),
        '',
        df.head(5),
        '----------------------------------------------------------------------------------------------------------------------',
        sep='\n',
    )

    # Keep the frame for the merge step
    df_list.append(df)

# Stack the per-database frames into a single table with a fresh running index
merge = pd.concat(df_list, ignore_index=True)

# Show the merged column index and the number of columns
print('Merged Dataframe', merge.columns, len(merge.columns), sep='\n')

# Report how many rows the merged table holds
print('----------------------------------------------------------------------')
print('----------------------------------------------------------------------')
print('CellDeathCensus (all columns, with duplicates) DataFrame:')
print(f'Number of entries: {len(merge)}')
print('----------------------------------------------------------------------')

# Write the merged table to Google Drive as a tab-separated file, without the index column
file_path = f'/content/drive/MyDrive/CellDeathCensus/CellDeathCensus_allcolumns_withduplicates_{today_date}.tsv'  # Specify your desired file path
merge.to_csv(file_path, sep='\t', index=False)

ApoCanD
Index(['Symbol', 'UniProt_AC', 'Ensembl', 'PDB', 'Link_UniProt',
       'Link_Deathbase', 'Link_PDB', 'Species', 'Pathway', 'Database'],
      dtype='object')
10

   Symbol UniProt_AC          Ensembl                 PDB  \
0   AIFM1     O95831  ENSP00000287295                1M6I   
1     AK2     P54819  ENSP00000346921                2C9Y   
2  ANP32A     P39687  ENSP00000417864                2JE0   
3   APAF1     O14727  ENSP00000353059                1C15   
4    AVEN     Q9NQS1  ENSP00000306822  Modelled Structure   

                            Link_UniProt  \
0  http://www.uniprot.org/uniprot/O95831   
1  http://www.uniprot.org/uniprot/P54819   
2  http://www.uniprot.org/uniprot/P39687   
3  http://www.uniprot.org/uniprot/O14727   
4  http://www.uniprot.org/uniprot/Q9NQS1   

                                      Link_Deathbase  \
0  http://deathbase.org/protein_report.php?id=H_s...   
1  http://deathbase.org/protein_report.php?id=H_s...   
2  http://deathbase.org/prote

In [36]:
# Analyse content
# Reload the merged table from the TSV file so the analysis starts from what is on disk.
# NOTE(review): the file name embeds today_date, so the file is only found if the merge step
# wrote it under the same date stamp.
census_path = f'/content/drive/MyDrive/CellDeathCensus/CellDeathCensus_allcolumns_withduplicates_{today_date}.tsv'  # Specify your desired file path
df = pd.read_csv(census_path, sep='\t', dtype=str)

# DataFrame.info() prints its report itself and returns None, so it must not be wrapped in
# print() (that added a stray "None" line after the report)
df.info()


# Normalise species names: lower-case them and turn empty strings into missing values
df['Species'] = df['Species'].str.lower().replace('', pd.NA)

# Frequency of the 100 most common species names before any name unification
top_100_species = df['Species'].value_counts().head(100)
print("Top 100 Species count before name unification:")
# Shown in two positional slices: the first 51 rows, then the remainder
print(top_100_species.iloc[:51])
print(top_100_species.iloc[51:])
print('----------------------------------------------------------------------')

# "scientific name (alternate names)" variants of the most frequent species,
# mapped onto the plain scientific name
species = {
    'homo sapiens (human)': 'homo sapiens',
    'rattus norvegicus (rat)': 'rattus norvegicus',
    'mus musculus (mouse)': 'mus musculus',
    'bos taurus (bovine)': 'bos taurus',
    'danio rerio (zebrafish) (brachydanio rerio)': 'danio rerio',
    'pongo abelii (sumatran orangutan) (pongo pygmaeus abelii)': 'pongo abelii',
    'gallus gallus (chicken)': 'gallus gallus',
    'sus scrofa (pig)': 'sus scrofa',
    'drosophila melanogaster (fruit fly)': 'drosophila melanogaster',
    'xenopus laevis (african clawed frog)': 'xenopus laevis',
}

# Exact-match replacement; species names that are not keys of the dictionary stay as they are
df['Species'] = df['Species'].replace(species)

# Fill in missing Species and NCBI_TaxID values from pairings that exist elsewhere in the table.
# Only rows that carry both values are used to build the lookup tables.
mapping = df[['NCBI_TaxID', 'Species']].dropna(how='any')

# Lookup dictionaries in both directions
# (Note: if a key occurs with several partners, the last pairing in the table is used)
mapping_dict = mapping.set_index('Species')['NCBI_TaxID'].to_dict()
reverse_mapping_dict = mapping.set_index('NCBI_TaxID')['Species'].to_dict()

# Vectorised fill instead of a row-wise DataFrame.apply over the whole frame:
# map() yields the partner value (NaN when the key is unknown or missing) and fillna()
# only touches cells that are currently missing, so existing values are never overwritten.
df['NCBI_TaxID'] = df['NCBI_TaxID'].fillna(df['Species'].map(mapping_dict))  # missing NCBI_TaxID from known Species
df['Species'] = df['Species'].fillna(df['NCBI_TaxID'].map(reverse_mapping_dict))  # missing Species from known NCBI_TaxID

# Write the table with unified species names and back-filled Species/NCBI_TaxID values to Google Drive
file_path = f'/content/drive/MyDrive/CellDeathCensus/CellDeathCensus_allcolumns_withduplicates_afterspeciesmapping_{today_date}.tsv'  # Specify your desired file path
df.to_csv(file_path, sep='\t', index=False)

# Frequency of the 50 most common species after name unification and mapping
top_50_species = df['Species'].value_counts().head(50)
print(
    "Top 50 Species count after name unification:",
    top_50_species,
    '----------------------------------------------------------------------',
    sep='\n',
)

# Export the ranking as a two-column table (Species, Count)
file_path = f'/content/drive/MyDrive/CellDeathCensus/CellDeathCensus_SpeciesTop50_{today_date}.tsv'  # Specify your desired file path
top_50_species.to_csv(file_path, sep='\t', header=['Count'], index_label='Species')



# Count unique UniProt_ACs and save to tsv file
# Missing accessions are removed first: drop_duplicates() on its own keeps one NaN entry,
# which raised the reported count by one and wrote an empty row to the output file.
uniprot = df['UniProt_AC'].dropna().drop_duplicates()

print('CellDeathCensus (UniProtAC, without duplicates) DataFrame:')
print('Number of entries: ' + str(len(uniprot)))
print('----------------------------------------------------------------------')

# One accession per line under a 'UniProt_AC' header, without the index column
file_path = f'/content/drive/MyDrive/CellDeathCensus/CellDeathCensus_UniProtAC_{today_date}.tsv'  # Specify your desired file path
uniprot.to_csv(file_path, sep='\t', index=False)



# Distinct (Symbol, Species) combinations: the first row of each combination is kept.
# NOTE(review): rows where Symbol or Species is missing are kept and counted as pairs,
# because drop_duplicates treats missing values like any other value.
symbol_species = df.drop_duplicates(subset=['Symbol', 'Species'])[['Symbol', 'Species']]

print('CellDeathCensus (Symbol+Species pairs, without duplicates) DataFrame:')
print(f'Number of entries: {len(symbol_species)}')
print('----------------------------------------------------------------------')

# Save the pairs as a tab-separated file without the index column
file_path = f'/content/drive/MyDrive/CellDeathCensus/CellDeathCensus_Symbols_Species_{today_date}.tsv'  # Specify your desired file path
symbol_species.to_csv(file_path, sep='\t', index=False)



# Distinct (Symbol, NCBI_TaxID) combinations: the first row of each combination is kept.
# NOTE(review): rows where Symbol or NCBI_TaxID is missing are kept and counted as pairs,
# because drop_duplicates treats missing values like any other value.
symbol_taxid = df.drop_duplicates(subset=['Symbol', 'NCBI_TaxID'])[['Symbol', 'NCBI_TaxID']]

print('CellDeathCensus (Symbol+NCBI_TaxID pairs, without duplicates) DataFrame:')
print(f'Number of entries: {len(symbol_taxid)}')
print('----------------------------------------------------------------------')

# Save the pairs as a tab-separated file without the index column
file_path = f'/content/drive/MyDrive/CellDeathCensus/CellDeathCensus_Symbols_TaxID_{today_date}.tsv'  # Specify your desired file path
symbol_taxid.to_csv(file_path, sep='\t', index=False)



# Cell death types whose frequency in the Pathway column is counted.
# NOTE(review): count_term_occurrences matches these as case-insensitive substrings, so a short
# entry such as 'MPT' may also match unrelated text that happens to contain those letters.
terms = [
    'cuproptosis', 'disulfidptosis', 'pyroptosis', 'necroptosis',
    'ferroptosis', 'apoptosis', 'necrosis', 'mitotic',
    'autosis', 'MPT', 'parthanatos', 'lysosomal',
    'efferocytosis', 'anoikis', 'alkaliptosis', 'excitotoxicity',
    'entosis', 'immunogenic', 'NETosis', 'paraptosis',
    'cornification', 'Hypersensitive response', 'Wallerian degeneration', 'PANoptosis',
    'Phagoptosis', 'Oncosis', 'Phenoptosis', 'Heterokaryon incompatibility',
]

# One counter per term, all starting at zero and kept in the order of `terms`
term_counts = dict.fromkeys(terms, 0)

# Function to count occurrences of a term in 'Pathway' column
def count_term_occurrences(term, pathways=None):
    """Count the entries that contain `term` as a case-insensitive literal substring.

    Parameters
    ----------
    term : str
        Text to look for. It is matched literally (not as a regular expression).
    pathways : pandas.Series, optional
        Values to search. Defaults to the 'Pathway' column of the notebook-level
        dataframe `df`, which keeps the original one-argument call working.

    Returns
    -------
    int
        Number of entries whose string form contains `term`.
    """
    if pathways is None:
        # Original behaviour: search the Pathway column of the global dataframe
        pathways = df['Pathway']

    # Every value is converted to its string form first, so non-string entries cannot break the
    # search; missing values become text (or stay missing) and na=False counts them as no match
    lowered = pathways.astype(str).str.lower()
    matches = lowered.str.contains(term.lower(), regex=False, na=False)

    # Plain int instead of a numpy integer keeps printed results free of numpy type wrappers
    return int(matches.sum())

# Fill in the count for every term
term_counts.update({term: count_term_occurrences(term) for term in terms})

# Order the (term, count) pairs from most to least frequent; the sort is stable,
# so terms with equal counts keep their order from term_counts
sorted_term_counts = list(term_counts.items())
sorted_term_counts.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_term_counts)

# Write the ranking to Google Drive as a two-column tab-separated table
file_path = f'/content/drive/MyDrive/CellDeathCensus/CellDeathCensus_Pathways_{today_date}.tsv'  # Specify your desired file path

# newline='' leaves line-ending handling to the csv writer
with open(file_path, 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['Term', 'Count'])  # header row
    for term, count in sorted_term_counts:
        writer.writerow([term, count])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49544 entries, 0 to 49543
Data columns (total 86 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   Symbol                                 48690 non-null  object
 1   UniProt_AC                             23594 non-null  object
 2   Ensembl                                13143 non-null  object
 3   PDB                                    82 non-null     object
 4   Link_UniProt                           82 non-null     object
 5   Link_Deathbase                         82 non-null     object
 6   Link_PDB                               82 non-null     object
 7   Species                                48278 non-null  object
 8   Pathway                                31328 non-null  object
 9   Database                               49544 non-null  object
 10  Synonyms                               19758 non-null  object
 11  Description    