# Human Disease Ontology
---
### Data for this analysis comes from WormBase

https://wormbase.org/tools/ontology_browser/

![Image](./input_data/Wormbase-screenshot.png)

--- 
### Summary of findings


* 3.0% of __Unassigned Genes__ are related to Human Disease.
* 6.0% of __Poorly Annotated Genes__ are related to Human Disease.
* 11.0% of __All Worm Genes__ are related to Human Disease.


__Notes:__

* We define ___Poorly Annotated Genes (PAG)___ as genes whose WormCat Category 1 is one of ['Unassigned', 'Transmembrane protein', 'Transmembrane transport'].

In [None]:
import pandas as pd

In [None]:
# Read in the Disease Ontology workbook.
# The data comes from WormBase.
xlsx_file_nm = './input_data/Human_Disease_Ontology.xlsx'
human_disease_xlsx = pd.ExcelFile(xlsx_file_nm)

In [None]:
# Let's take a look at the sheets that are in this Excel workbook.
# The bare expression on the last line displays the list in the notebook.
sheet_names = human_disease_xlsx.sheet_names
sheet_names

In [None]:
# Create a dictionary mapping each human disease (one workbook sheet per
# disease) to a DataFrame of its aligned wormbase_ids.
human_disease_dfs = {}
for sheet_name in sheet_names:
    # Parse from the already-open ExcelFile instead of re-opening the
    # workbook from disk once per sheet.
    sheet_df = pd.read_excel(human_disease_xlsx, sheet_name=sheet_name)
    # The 'disease_type' column becomes a per-disease flag column named after
    # the sheet; every row present in the sheet is flagged True.
    sheet_df = sheet_df.rename(columns={'disease_type': sheet_name})
    sheet_df[sheet_name] = True
    # Drop the ontology bookkeeping columns before storing the frame.
    human_disease_dfs[sheet_name] = sheet_df.drop(columns=['DOID', 'descendant terms'])
    


In [None]:
#human_disease_dfs

In [None]:
# Merge the individual per-disease dataframes into a single dataframe.
# Start from the first sheet and outer-join every remaining sheet on
# 'wormbase_id', so a gene is kept if it appears in at least one disease sheet.
# Starting from one frame (instead of hard-coding the first two) also works
# when the workbook holds a single sheet.
merged_df = human_disease_dfs[sheet_names[0]]
print(sheet_names[0])
for sheet_name in sheet_names[1:]:
    print(sheet_name)
    merged_df = pd.merge(merged_df, human_disease_dfs[sheet_name], on='wormbase_id', how='outer')

# Display the number of rows in the merged frame
len(merged_df)

In [None]:
# Rename the key column to 'wormbase_id_disease'; the later merges join on it
# with right_on='wormbase_id_disease' and test it for null to spot genes
# without any disease association.
merged_df = merged_df.rename(columns={'wormbase_id':'wormbase_id_disease'})
merged_df

In [None]:
# Load the WormCat whole-genome category list and normalise its column
# names to snake_case.
wormcat_df = pd.read_csv('./input_data/whole_genome_v2_nov-11-2021.csv')
wormcat_df = wormcat_df.rename(
    columns={
        'Sequence ID': 'sequence_id',
        'Wormbase ID': 'wormbase_id',
        'Category 1': 'category_1',
        'Category 2': 'category_2',
        'Category 3': 'category_3',
    }
)
wormcat_df.columns

In [None]:
# Select the wormbase_ids of genes whose Category 1 is 'Unassigned'
unassigned = wormcat_df.loc[wormcat_df['category_1'] == 'Unassigned', 'wormbase_id']
unassigned_df = unassigned.to_frame()
print(f"We have {len(unassigned_df):,} unassigned genes.")

In [None]:
# Left-join the disease flags onto the unassigned genes; genes without any
# disease association keep a null 'wormbase_id_disease'.
unassigned_merged_df = unassigned_df.merge(
    merged_df,
    how='left',
    left_on='wormbase_id',
    right_on='wormbase_id_disease',
)
unassigned_merged_df


In [None]:
# Keep only the unassigned genes that matched at least one disease sheet
has_disease = unassigned_merged_df['wormbase_id_disease'].notnull()
unassigned_diseased = unassigned_merged_df[has_disease]
unassigned_diseased

In [None]:
# Select the poorly annotated genes (PAG): genes whose Category 1 is
# 'Unassigned', 'Transmembrane protein' or 'Transmembrane transport'.
pag_categories = ['Unassigned', 'Transmembrane protein', 'Transmembrane transport']
pag_series = wormcat_df.loc[wormcat_df['category_1'].isin(pag_categories), 'wormbase_id']
pag_df = pag_series.to_frame()
print(f"We have {len(pag_df):,} PAG's.")

In [None]:
# Left-join the disease flags onto the poorly annotated genes; genes without
# any disease association keep a null 'wormbase_id_disease'.
# Merge the DataFrame built for this purpose (pag_df) rather than the bare
# Series, matching how the unassigned genes are merged.
pag_merged_df = pd.merge(pag_df, merged_df, left_on='wormbase_id', right_on='wormbase_id_disease', how='left')
pag_merged_df

In [None]:
# Keep only the poorly annotated genes that matched at least one disease sheet
has_disease = pag_merged_df['wormbase_id_disease'].notnull()
pag_diseased = pag_merged_df[has_disease]
pag_diseased

## Create Visualizations

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def get_data_dict(merged_df, disease_names=None):
    """Count, per disease, how many rows of *merged_df* carry that disease flag.

    Prints a small two-column table (disease, count) and returns the counts.

    Args:
        merged_df: DataFrame with one column per disease; a row is counted
            for a disease when its value in that column is non-null.
        disease_names: Column names to count. Defaults to the notebook-level
            ``sheet_names`` list.

    Returns:
        dict mapping each disease name to its integer count. A disease with
        no flagged rows is reported as 0.
    """
    if disease_names is None:
        disease_names = sheet_names

    print(f' Disease{"":<21} Count')
    print('-'*40)
    data = {}
    for sheet_name in disease_names:
        # count() tallies the non-null entries and yields 0 for an all-null
        # column, where indexing the first value_counts() entry would raise
        # IndexError.
        data[sheet_name] = int(merged_df[sheet_name].count())
        print(f" {sheet_name:<30} {data[sheet_name]:>6,}")
    return data

def create_plot_disease_ontology(data, y_title):
    """Show a bar chart of disease counts, ordered from largest to smallest.

    Args:
        data: Mapping of disease name to count.
        y_title: Label for the y axis.
    """
    # Largest count first; ties keep their original insertion order.
    ranked = sorted(data.items(), key=lambda item: item[1], reverse=True)
    labels = [name for name, _ in ranked]
    heights = [count for _, count in ranked]

    _fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(labels, heights)

    # Title and axis labels
    ax.set_title("Human Disease Ontology", fontsize=15)
    ax.set_ylabel(y_title)
    ax.set_xlabel("Human Disease")

    # Slant the disease names so that long labels do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    plt.show()


In [None]:
def _disease_fraction(df):
    """Return the fraction of rows in *df* with a non-null 'wormbase_id_disease'.

    An empty frame yields 0.0 instead of dividing by zero.
    """
    if len(df) == 0:
        return 0.0
    return int(df['wormbase_id_disease'].notnull().sum()) / len(df)


# The fraction is rounded to two decimals (a whole percent) and rendered with
# the '%' format type. Multiplying the rounded float by 100 by hand prints
# artefacts such as 11.000000000000002.
print(f"{round(_disease_fraction(unassigned_merged_df), 2):.1%} of Unassigned Genes are related to Human Disease.")
print(f"{round(_disease_fraction(pag_merged_df), 2):.1%} of Poorly Annotated Genes are related to Human Disease.")

# Left-join the disease flags onto every WormCat gene for the genome-wide figure
wormcat_merged_df = pd.merge(wormcat_df, merged_df, left_on='wormbase_id', right_on='wormbase_id_disease', how='left')
print(f"{round(_disease_fraction(wormcat_merged_df), 2):.1%} of All Worm Genes are related to Human Disease.")

In [None]:
# Count and plot the disease associations among the Unassigned genes
data = get_data_dict(unassigned_merged_df)
create_plot_disease_ontology(data, "Unassigned Occurrences")

In [None]:
# Count and plot the disease associations among all worm genes
data = get_data_dict(wormcat_merged_df)
create_plot_disease_ontology(data, "Worm Genes Occurrences")

In [None]:
# Count and plot the disease associations among the Poorly Annotated Genes
data = get_data_dict(pag_merged_df)
create_plot_disease_ontology(data, "Poorly Annotated Genes Occurrences")

# Appendix

Utility functions

In [None]:
%%bash
echo "Start image resize"

# Replace file name with the name of file to be resized
file_dir='./input_data'
file_nm='Wormbase-screenshot.png'
full_path="${file_dir}/${file_nm}"
echo "${full_path}"

# Query the current dimensions as WIDTHxHEIGHT
pic_size=$(identify -format "%[fx:w]x%[fx:h]" "${full_path}")
echo "${pic_size}"

# Only an image that still has its original 936x544 size needs resizing
if [[ "${pic_size}" == "936x544" ]]; then
    echo "image ${pic_size} resized"
    full_tmp_path="${file_dir}/tmp_${file_nm}"
    # Dry run: the commands below are printed, not executed.
    # Remove the leading `echo` to actually apply them.
    echo convert "${full_path}" -resize 40% "${full_tmp_path}"
    # Move the resized copy back over the original inside ${file_dir},
    # not to a bare file name in the current directory.
    echo mv "${full_tmp_path}" "${full_path}"
else
    echo "image already resized ${pic_size}"
fi

In [None]:
# Shell out to `convert` to write a copy of the screenshot scaled to 50%
# as ./input_data/tmp_Wormbase-screenshot.png.
# NOTE(review): the bash cell above uses 40% for the same image - confirm which is intended.
!convert ./input_data/Wormbase-screenshot.png -resize 50% ./input_data/tmp_Wormbase-screenshot.png


# Appendix

In [None]:
# Convert Microsoft Office documents to PDF for easy reading in Jupyter.
# Shells out to the `libreoffice` command-line tool.
# Alternative input (currently disabled):
#!libreoffice --convert-to pdf "./Analysis/Graphics_for_Assignment1.pptx"
!libreoffice --convert-to pdf "./admin_sup2.docx"