In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the three stage-comparison disease tables; the unpacking order
# follows the order of the paths below.
s1_s2_diseases, s2_s3_diseases, s3_s4_diseases = (
    pd.read_csv(csv_path)
    for csv_path in (
        './genecards/s1_s2_diseases.csv',
        './genecards/s2_s3_diseases.csv',
        './genecards/s3_s4_diseases.csv',
    )
)

In [None]:
# Report (rows, columns) of each loaded table, one table per output line.
print(
    f's1_s2_diseases shape: {s1_s2_diseases.shape}',
    f's2_s3_diseases shape: {s2_s3_diseases.shape}',
    f's3_s4_diseases shape: {s3_s4_diseases.shape}',
    sep='\n',
)

In [None]:
# Show the column index of each table. Each label is the name of the
# DataFrame variable whose columns are printed on that line.
print(f's1_s2_diseases columns: {s1_s2_diseases.columns}')
print(f's2_s3_diseases columns: {s2_s3_diseases.columns}')
print(f's3_s4_diseases columns: {s3_s4_diseases.columns}')

In [None]:
# Minimum 'Score' for a disease to count as significant in a stage comparison
threshold = 0.5

# Names that reach the threshold within each stage comparison ...
significant_names_per_stage = [
    set(stage_df.loc[stage_df['Score'] >= threshold, 'Name'])
    for stage_df in (s1_s2_diseases, s2_s3_diseases, s3_s4_diseases)
]

# ... and the names that do so in all three of them
common_diseases = set.intersection(*significant_names_per_stage)

In [None]:
# Track changes in score and number of genes matched
def track_changes(df, common_diseases, stage):
    """Return the rows of ``df`` for the common diseases, indexed by name.

    Parameters
    ----------
    df : DataFrame
        Table with a 'Name' column; 'Score' and '# Matched Genes' columns
        are renamed when present.
    common_diseases : collection
        Disease names to keep.
    stage : str
        Suffix appended to the renamed columns.

    Returns
    -------
    DataFrame
        A new frame indexed by 'Name' in which 'Score' becomes
        ``Score_<stage>`` and '# Matched Genes' becomes
        ``Matched Genes_<stage>``. ``df`` itself is left untouched.
    """
    column_renames = {
        'Score': f'Score_{stage}',
        '# Matched Genes': f'Matched Genes_{stage}',
    }
    is_common = df['Name'].isin(common_diseases)
    return df.loc[is_common].set_index('Name').rename(columns=column_renames)

# Per-stage views of the common diseases, indexed by disease name
s1_s2_common, s2_s3_common, s3_s4_common = (
    track_changes(stage_df, common_diseases, stage)
    for stage_df, stage in (
        (s1_s2_diseases, 's1_s2'),
        (s2_s3_diseases, 's2_s3'),
        (s3_s4_diseases, 's3_s4'),
    )
)

# Every merge below aligns the frames on their shared disease-name index
index_join = dict(left_index=True, right_index=True)

# One score column per stage comparison
merged_common_scores = (
    s1_s2_common[['Score_s1_s2']]
    .merge(s2_s3_common[['Score_s2_s3']], **index_join)
    .merge(s3_s4_common[['Score_s3_s4']], **index_join)
)

# One matched-gene-count column per stage comparison
merged_common_genes = (
    s1_s2_common[['Matched Genes_s1_s2']]
    .merge(s2_s3_common[['Matched Genes_s2_s3']], **index_join)
    .merge(s3_s4_common[['Matched Genes_s3_s4']], **index_join)
)

In [None]:
# Heatmap of the per-stage scores of the diseases in merged_common_scores
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(
    merged_common_scores,
    annot=True,
    cmap="YlGnBu",
    cbar_kws={'label': 'Score'},
    ax=ax,
)
ax.set_title('Heatmap of Significant Disease Scores Over Time')
plt.show()

In [None]:
# Heatmap of the columns of merged_common_genes, one row per disease
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(
    merged_common_genes,
    annot=True,
    cmap="YlGnBu",
    cbar_kws={'label': '# Matched Genes'},
    ax=ax,
)
ax.set_title('Heatmap of Number of Matched Genes for Significant Diseases Over Time')
plt.show()

In [None]:
# Identify new significant diseases
def new_significant_diseases(current_df, previous_df, threshold):
    """Select rows of ``current_df`` for diseases that are new and significant.

    A disease is "new" when its 'Name' occurs in ``current_df`` but not in
    ``previous_df``; it is "significant" when its 'Score' is at least
    ``threshold``.

    Returns the matching rows of ``current_df`` with all of its columns and
    its original index.
    """
    names_only_in_current = set(current_df['Name']) - set(previous_df['Name'])
    is_new = current_df['Name'].isin(names_only_in_current)
    is_significant = current_df['Score'] >= threshold
    return current_df[is_new & is_significant]

# Stricter score cut-off applied on top of `threshold`
significant_threshold = 2.5

# Diseases that first appear in a stage comparison and clear both cut-offs
new_significant_s2_s3 = new_significant_diseases(
    s2_s3_diseases, s1_s2_diseases, threshold
).loc[lambda found: found['Score'] >= significant_threshold]
new_significant_s3_s4 = new_significant_diseases(
    s3_s4_diseases, s2_s3_diseases, threshold
).loc[lambda found: found['Score'] >= significant_threshold]

# Horizontal bars: one per new disease in s2_s3, bar length = score
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=new_significant_s2_s3, x='Score', y='Name', palette='viridis', ax=ax)
ax.set_title('New Significant Diseases in s2_s3')
ax.set_xlabel('Score')
ax.set_ylabel('Disease Name')
plt.show()

In [None]:
# Horizontal bars: one per new disease in s3_s4, bar length = score
fig, ax = plt.subplots(figsize=(10, 20))
sns.barplot(data=new_significant_s3_s4, x='Score', y='Name', palette='viridis', ax=ax)
ax.set_title('New Significant Diseases in s3_s4')
ax.set_xlabel('Score')
ax.set_ylabel('Disease Name')
plt.show()

In [None]:
# Stage-to-stage difference in matched-gene counts (later stage minus earlier).
# The new columns are written onto merged_common_genes itself.
genes_s1_s2 = merged_common_genes['Matched Genes_s1_s2']
genes_s2_s3 = merged_common_genes['Matched Genes_s2_s3']
genes_s3_s4 = merged_common_genes['Matched Genes_s3_s4']
merged_common_genes['Change_s1_s2_to_s2_s3'] = genes_s2_s3 - genes_s1_s2
merged_common_genes['Change_s2_s3_to_s3_s4'] = genes_s3_s4 - genes_s2_s3

# Magnitude of each change, regardless of direction
merged_common_genes['Abs_Change_s1_s2_to_s2_s3'] = merged_common_genes['Change_s1_s2_to_s2_s3'].abs()
merged_common_genes['Abs_Change_s2_s3_to_s3_s4'] = merged_common_genes['Change_s2_s3_to_s3_s4'].abs()

# The 30 rows with the largest first-transition magnitude; the second
# transition's magnitude is the secondary ordering column
top_diseases = merged_common_genes.nlargest(
    n=30,
    columns=['Abs_Change_s1_s2_to_s2_s3', 'Abs_Change_s2_s3_to_s3_s4'],
)

# One line per selected disease across the three stage comparisons
stage_labels = ['s1_s2', 's2_s3', 's3_s4']
stage_columns = ['Matched Genes_s1_s2', 'Matched Genes_s2_s3', 'Matched Genes_s3_s4']

fig, ax = plt.subplots(figsize=(14, 7))
for disease in top_diseases.index:
    ax.plot(stage_labels, top_diseases.loc[disease, stage_columns], label=disease)
ax.set_xlabel('Stage Comparison')
ax.set_ylabel('Number of Matched Genes')
ax.set_title('Trend of Number of Matched Genes for Top Significant Diseases Over Time')
# Legend is anchored outside the axes, to the right of the plot area
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Print, for every disease in top_diseases, its matched-gene count at each
# stage comparison and the change between consecutive comparisons.
print("Trend of Number of Matched Genes for Top Significant Diseases Over Time:")
for disease in top_diseases.index:
    # The frames are indexed by disease name, so the index value IS the name.
    # It is a label, not a position: using it as a positional subscript on
    # merged_common_genes.index raises IndexError for string labels.
    disease_name = disease

    # Look up each stage's count by label; None when the column is absent.
    matched_genes_s1_s2 = merged_common_genes.loc[disease_name, 'Matched Genes_s1_s2'] if 'Matched Genes_s1_s2' in merged_common_genes.columns else None
    matched_genes_s2_s3 = merged_common_genes.loc[disease_name, 'Matched Genes_s2_s3'] if 'Matched Genes_s2_s3' in merged_common_genes.columns else None
    matched_genes_s3_s4 = merged_common_genes.loc[disease_name, 'Matched Genes_s3_s4'] if 'Matched Genes_s3_s4' in merged_common_genes.columns else None

    # A change is only reported when both of its endpoints are present.
    change_s1_s2_to_s2_s3 = matched_genes_s2_s3 - matched_genes_s1_s2 if pd.notnull(matched_genes_s1_s2) and pd.notnull(matched_genes_s2_s3) else 'N/A'
    change_s2_s3_to_s3_s4 = matched_genes_s3_s4 - matched_genes_s2_s3 if pd.notnull(matched_genes_s2_s3) and pd.notnull(matched_genes_s3_s4) else 'N/A'

    print(f"Disease: {disease_name}")
    print(f"  Matched Genes in s1_s2: {matched_genes_s1_s2 if pd.notnull(matched_genes_s1_s2) else 'N/A'}")
    print(f"  Matched Genes in s2_s3: {matched_genes_s2_s3 if pd.notnull(matched_genes_s2_s3) else 'N/A'}")
    print(f"  Matched Genes in s3_s4: {matched_genes_s3_s4 if pd.notnull(matched_genes_s3_s4) else 'N/A'}")
    print(f"  Change from s1_s2 to s2_s3: {change_s1_s2_to_s2_s3}")
    print(f"  Change from s2_s3 to s3_s4: {change_s2_s3_to_s3_s4}")


In [None]:
# Stage-to-stage difference in matched-gene counts (later stage minus earlier).
# Recomputed here so this cell yields the same columns whether or not an
# earlier cell has already added them.
merged_common_genes['Change_s1_s2_to_s2_s3'] = merged_common_genes['Matched Genes_s2_s3'] - merged_common_genes['Matched Genes_s1_s2']
merged_common_genes['Change_s2_s3_to_s3_s4'] = merged_common_genes['Matched Genes_s3_s4'] - merged_common_genes['Matched Genes_s2_s3']

# First 10 rows after sorting both change columns in descending order
# (signed values, so these are the largest increases)
significant_changes = merged_common_genes[['Change_s1_s2_to_s2_s3', 'Change_s2_s3_to_s3_s4']].sort_values(by=['Change_s1_s2_to_s2_s3', 'Change_s2_s3_to_s3_s4'], ascending=False).head(10)

# Bar plot for significant changes.
# DataFrame.plot opens its own figure unless it is handed an Axes, so the
# sized Axes is created first and passed in; otherwise an empty figure is
# left behind and the requested figsize never reaches the chart.
fig, ax = plt.subplots(figsize=(10, 7))
significant_changes.plot(kind='bar', stacked=True, cmap='viridis', ax=ax)
ax.set_xlabel('Disease')
ax.set_ylabel('Change in Number of Matched Genes')
ax.set_title('Significant Changes in Number of Matched Genes for Top 10 Diseases')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Identify new significant diseases
def new_significant_diseases(current_df, previous_df, threshold):
    """Return rows of ``current_df`` whose disease is new and scores high enough.

    "New" means the 'Name' value is present in ``current_df`` and absent from
    ``previous_df``; "high enough" means 'Score' >= ``threshold``. The result
    keeps every column and the original index of ``current_df``.

    NOTE(review): this re-declares a helper of the same name defined in an
    earlier cell of this notebook.
    """
    unseen_before = set(current_df['Name']) - set(previous_df['Name'])
    keep = current_df['Name'].isin(unseen_before) & (current_df['Score'] >= threshold)
    return current_df[keep]

# Stricter score cut-off applied on top of `threshold`
significant_threshold = 3.0

# Diseases that first appear in a stage comparison and clear both cut-offs;
# these rebind the names new_significant_s2_s3 / new_significant_s3_s4
new_significant_s2_s3 = new_significant_diseases(
    s2_s3_diseases, s1_s2_diseases, threshold
).loc[lambda found: found['Score'] >= significant_threshold]
new_significant_s3_s4 = new_significant_diseases(
    s3_s4_diseases, s2_s3_diseases, threshold
).loc[lambda found: found['Score'] >= significant_threshold]

# Prepare data for line plots
def prepare_line_plot_data(new_significant_df, current_df, previous_df, stage_current, stage_previous):
    """Pair previous-stage and current-stage scores for selected diseases.

    The diseases are the 'Name' values of ``new_significant_df``. Their
    'Score' is taken from ``previous_df`` and from ``current_df`` and the two
    selections are outer-merged on 'Name'.

    Returns a DataFrame with columns 'Name', ``Score_<stage_previous>`` and
    ``Score_<stage_current>``; a score is NaN where the disease has no row in
    the corresponding table.
    """
    diseases = new_significant_df['Name'].tolist()

    def _scores_for(stage_df, stage):
        # Name/Score rows of the selected diseases, score column tagged by stage
        selected = stage_df.loc[stage_df['Name'].isin(diseases), ['Name', 'Score']]
        return selected.rename(columns={'Score': f'Score_{stage}'})

    return _scores_for(previous_df, stage_previous).merge(
        _scores_for(current_df, stage_current), on='Name', how='outer'
    )

# Previous-stage vs current-stage scores of the new significant diseases.
# NOTE(review): new_significant_* only holds names absent from the previous
# table, so the previous-stage score column looks like it will be all NaN and
# each "line" below will draw as a single marker - confirm this is intended.
line_data_s2_s3 = prepare_line_plot_data(
    new_significant_s2_s3, s2_s3_diseases, s1_s2_diseases, 's2_s3', 's1_s2'
)
line_data_s3_s4 = prepare_line_plot_data(
    new_significant_s3_s4, s3_s4_diseases, s2_s3_diseases, 's3_s4', 's2_s3'
)

# One two-point line per disease: score at s1_s2, then score at s2_s3
fig, ax = plt.subplots(figsize=(14, 7))
for _, row in line_data_s2_s3.iterrows():
    ax.plot(
        ['s1_s2', 's2_s3'],
        [row['Score_s1_s2'], row['Score_s2_s3']],
        marker='o',
        label=row['Name'],
    )
ax.set_xlabel('Stage Comparison')
ax.set_ylabel('Score')
ax.set_title('Trend of Scores for New Significant Diseases in s1_s2 vs s2_s3')
# Two-column legend anchored outside the axes, to the right of the plot area
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol=2)
plt.show()

In [None]:

# One two-point line per disease: score at s2_s3, then score at s3_s4
fig, ax = plt.subplots(figsize=(14, 7))
for _, row in line_data_s3_s4.iterrows():
    ax.plot(
        ['s2_s3', 's3_s4'],
        [row['Score_s2_s3'], row['Score_s3_s4']],
        marker='o',
        label=row['Name'],
    )
ax.set_xlabel('Stage Comparison')
ax.set_ylabel('Score')
ax.set_title('Trend of Scores for New Significant Diseases in s2_s3 vs s3_s4')
# Two-column legend anchored outside the axes, to the right of the plot area
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol=2)
plt.show()

In [None]:
# Horizontal bars: one per new significant disease in s2_s3, bar length = score
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=new_significant_s2_s3, x='Score', y='Name', palette='viridis', ax=ax)
ax.set_title('New Significant Diseases in s2_s3')
ax.set_xlabel('Score')
ax.set_ylabel('Disease Name')
plt.show()

In [None]:
# Horizontal bars: one per new significant disease in s3_s4, bar length = score
fig, ax = plt.subplots(figsize=(10, 20))
sns.barplot(data=new_significant_s3_s4, x='Score', y='Name', palette='viridis', ax=ax)
ax.set_title('New Significant Diseases in s3_s4')
ax.set_xlabel('Score')
ax.set_ylabel('Disease Name')
plt.show()