In [2]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from scipy.stats import ttest_ind
from IPython.display import display

In [3]:
early_file = '/Users/hershit.rustagi/Downloads/early_stage_gene_expression.csv'
late_file = '/Users/hershit.rustagi/Downloads/late_stage_gene_expression.csv'
normal_file = '/Users/hershit.rustagi/Downloads/normal_gene_expression.csv'

df_early_raw = pd.read_csv(early_file, index_col=0)
df_late_raw = pd.read_csv(late_file, index_col=0)
df_normal_raw = pd.read_csv(normal_file, index_col=0)

In [4]:
for label, df in zip(['Early', 'Late', 'Normal'], [df_early_raw, df_late_raw, df_normal_raw]):
    if df.shape[0] > df.shape[1]:
        print(f"🔄 Transposing {label} data")
        if label == 'Early':
            df_early_raw = df.T
        elif label == 'Late':
            df_late_raw = df.T
        else:
            df_normal_raw = df.T

df_early = df_early_raw.select_dtypes(include=[np.number])
df_late = df_late_raw.select_dtypes(include=[np.number])
df_normal = df_normal_raw.select_dtypes(include=[np.number])

🔄 Transposing Early data
🔄 Transposing Late data
🔄 Transposing Normal data


In [5]:
def calculate_differential_expression(df_condition, df_normal, condition_name):
    if df_condition.shape[1] != df_normal.shape[1]:
        raise ValueError("Mismatch in number of genes.")
    
    genes = df_condition.columns
    mean_cond = df_condition.mean(axis=0)
    mean_normal = df_normal.mean(axis=0)
    log2fc = np.log2((mean_cond + 1e-6) / (mean_normal + 1e-6))
    
    p_values = []
    for gene in genes:
        try:
            stat, p = ttest_ind(df_condition[gene], df_normal[gene], equal_var=False, nan_policy='omit')
        except Exception:
            p = np.nan
        p_values.append(p)

    result = pd.DataFrame({
        'Gene': genes,
        f'Mean_{condition_name}': mean_cond.values,
        'Mean_Normal': mean_normal.values,
        f'Log2_Fold_Change_{condition_name}': log2fc.values,
        f'P_Value_{condition_name}': p_values
    })
    
    result.dropna(subset=[f'P_Value_{condition_name}'], inplace=True)
    result[f'Significant_{condition_name}'] = (
        ((result[f'Log2_Fold_Change_{condition_name}'] > 1) & (result[f'P_Value_{condition_name}'] < 0.05)) |
        ((result[f'Log2_Fold_Change_{condition_name}'] < -1) & (result[f'P_Value_{condition_name}'] < 0.05))
    )
    
    return result

In [6]:
# ---------------- Run Analysis Summary + Publish Top Genes ----------------
try:
    # EARLY vs NORMAL
    print("\n🔬 Running Early vs Normal Analysis...\n")
    early_results = calculate_differential_expression(df_early, df_normal, 'Early')

    print("\n✅ Top 10 Genes with Log2FC > 1 or < -1 and P-Value < 0.05 (Early):")
    top10_early_significant_genes = early_results[early_results['Significant_Early'] == True] \
        .sort_values(by='P_Value_Early') \
        .head(100)
    display(top10_early_significant_genes[['Gene', 'Log2_Fold_Change_Early', 'P_Value_Early']])

    # LATE vs NORMAL
    print("\n🔬 Running Late vs Normal Analysis...\n")
    late_results = calculate_differential_expression(df_late, df_normal, 'Late')

    print("\n✅ Top 10 Genes with Log2FC > 1 or < -1 and P-Value < 0.05 (Late):")
    top10_late_significant_genes = late_results[late_results['Significant_Late'] == True] \
        .sort_values(by='P_Value_Late') \
        .head(100)
    display(top10_late_significant_genes[['Gene', 'Log2_Fold_Change_Late', 'P_Value_Late']])

    # 🧬 Combine Early + Late top 10 into a single list
    top_genes_combined = list(
        set(top10_early_significant_genes['Gene']).union(set(top10_late_significant_genes['Gene']))
    )[:100]

    print(f"\n🔝 Final Combined Top 10 Genes from Early/Late Analyses:\n{top_genes_combined}")

except Exception as e:
    print(f"❌ Error: {e}")


🔬 Running Early vs Normal Analysis...


✅ Top 10 Genes with Log2FC > 1 or < -1 and P-Value < 0.05 (Early):


  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,Gene,Log2_Fold_Change_Early,P_Value_Early
19116,canctyp,19.931570,0.000000e+00
2956,LGI3,-1.378339,2.108652e-148
12395,CNTN6,-1.471978,1.270197e-142
13177,GPM6A,-1.249001,6.067808e-112
15803,RS1,-1.632851,2.066936e-104
...,...,...,...
2912,EPYC,2.689454,2.662058e-48
2420,PTPRH,1.602916,3.063599e-48
8473,C6orf126,4.372324,4.072227e-48
6116,CDC25C,1.443946,5.243554e-48



🔬 Running Late vs Normal Analysis...


✅ Top 10 Genes with Log2FC > 1 or < -1 and P-Value < 0.05 (Late):


  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,Gene,Log2_Fold_Change_Late,P_Value_Late
19116,canctyp,20.931569,0.000000e+00
16588,SGCG,-1.727535,1.667709e-66
15767,C10orf67,-1.957882,9.162859e-66
15803,RS1,-1.722170,3.544363e-65
13177,GPM6A,-1.332833,5.258891e-60
...,...,...,...
5089,MND1,1.424401,3.057802e-38
12694,SYT12,1.500582,3.814931e-38
17545,SH3GL2,-1.530504,4.806491e-38
3934,COL11A1,1.858244,5.377216e-38



🔝 Final Combined Top 10 Genes from Early/Late Analyses:
['NRG3', 'CCDC141', 'PLEKHN1', 'RXFP1', 'HTR3A', 'GRIK4', 'KIAA0408', 'ITPKA', 'RSPO2', 'AKR7A3', 'OVCH2', 'ADCY8', 'IL1RAPL2', 'C1orf182', 'MELK', 'ZNF695', 'RPL13AP17', 'ABCA12', 'FABP4', 'LOC100131726', 'WNT7A', 'DLGAP5', 'NEIL3', 'GNGT1', 'PPAPDC1A', 'CST4', 'HIST1H2AM', 'RASAL1', 'NUF2', 'TRHDE', 'FIGF', 'PRAME', 'canctyp', 'MS4A15', 'LOC399815', 'C13orf36', 'LHFPL5', 'ITLN2', 'NETO1', 'PTPRQ', 'LGI3', 'LOC84740', 'TCEAL2', 'ODAM', '?|729884', 'LHFPL3', 'FAM150B', 'CD5L', 'ERVFRDE1', 'CPB2', 'ADH1A', 'SOSTDC1', 'ONECUT1', 'KIF14', 'CENPA', 'CDC45', 'CA4', 'AGBL1', 'CENPI', 'GYPE', 'SLC6A13', 'MYOC', 'SH3GL2', 'FOXI3', 'F11', 'CLDN18', 'MMP11', 'RAD54L', 'UPK3B', 'SGCG', 'ACADL', 'WNT3A', 'SLC6A4', 'PTPRH', 'RSPO1', 'C1orf65', 'TROAP', 'SYT12', 'EPYC', 'SCUBE1', 'HBA1', 'CDC25C', 'LOC283392', 'CAV3', 'MAPK4', 'SH3GL3', 'ARSH', 'GPM6A', 'RS1', 'DEPDC1', 'DVWA', 'LOC149620', 'RXRG', 'KHDRBS2', 'C11orf86', 'COL11A1', 'TFR2', 'C1

In [None]:
# ---------------- Gemini Integration ----------------
import google.generativeai as genai
from IPython.display import display, Markdown

# 🧠 Class to query Gemini with top 10 genes
class GeminiAutoQuery:
    def __init__(self, genes):
        self.genes = genes[:10]  # ✅ Ensure only top 10 used
        self._load_and_configure_api()
        self.chat = self._initialize_model()

    def _load_and_configure_api(self):
        api_key = "AIzaSyA-TsTWcAYrMb4mck9dTelfX0SlwBTEKhg"
        genai.configure(api_key=api_key)

    def _initialize_model(self):
        model = genai.GenerativeModel("gemini-2.0-flash")
        return model.start_chat(history=[])

    def _build_prompt(self):
        gene_list = ", ".join(self.genes)
        return (
            f"The following are the top 10 significantly differentially expressed genes based on log2 fold change and p-values: {gene_list}.\n\n"
            "1. For each of these genes, list the major drugs that target them.\n"
            "2. For every gene-drug pair, explain in detail:\n"
            "   - How the drug interacts with the gene or its protein product.\n"
            "   - The biological mechanism of action (e.g., inhibition, activation, mutation-specific effect).\n"
            "3. For each gene individually, explain its molecular function.\n"
            "4. After explaining each gene-drug interaction, summarize how these therapies are combined in actual treatment plans.\n"
            "5. How do these therapies vary based on cancer stage or patient profile?\n"
            "6. What are the available medical treatments for cancers associated with these genes?\n"
        )

    def ask_gemini(self):
        prompt = self._build_prompt()
        try:
            response = self.chat.send_message(prompt)
            display(Markdown(f"### 🧠 Gemini's Response for Top Genes:\n\n{response.text}"))
        except Exception as e:
            print(f"❌ Gemini API Error: {str(e)}")

# 🔁 Run Gemini query on top 10 genes from previous cell
gq = GeminiAutoQuery(top_genes_combined)
gq.ask_gemini()

In [8]:
pathway_df = pd.read_csv('/Users/hershit.rustagi/Desktop/copy.csv')  # Your file
pathway_df = pathway_df[['DB_Object_Symbol', 'GO_ID']].dropna().drop_duplicates()
pathway_df.columns = ['Gene', 'Pathway']

In [9]:
try:
    print("\n🔬 Running Early vs Normal Analysis...\n")
    early_results = calculate_differential_expression(df_early, df_normal, 'Early')
    sig_early_genes = early_results[early_results['Significant_Early']].sort_values(
        by='P_Value_Early'
    )['Gene'].tolist()

    print("\n🔬 Running Late vs Normal Analysis...\n")
    late_results = calculate_differential_expression(df_late, df_normal, 'Late')
    sig_late_genes = late_results[late_results['Significant_Late']].sort_values(
        by='P_Value_Late'
    )['Gene'].tolist()

    all_sig_genes = list(set(sig_early_genes + sig_late_genes))
    top_100_genes = all_sig_genes[:11]
    print(f"\n🧬 Top 100 Significant Genes: {len(top_100_genes)}")

    matched_pathways = pathway_df[pathway_df['Gene'].isin(top_100_genes)]

    # Group by gene and join multiple GO terms into a single string
    grouped_pathways = matched_pathways.groupby('Gene')['Pathway'].apply(
        lambda x: ', '.join(sorted(set(x)))
    ).reset_index()

    print("\n📊 Top 100 Gene-to-Pathway Mapping (Grouped):")
    display(grouped_pathways)

except Exception as e:
    print(f"❌ Error: {e}")


🔬 Running Early vs Normal Analysis...



  res = hypotest_fun_out(*samples, **kwds)



🔬 Running Late vs Normal Analysis...


🧬 Top 100 Significant Genes: 11

📊 Top 100 Gene-to-Pathway Mapping (Grouped):


  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,Gene,Pathway
0,ARG1,"GO:0000050, GO:0002250, GO:0002376, GO:0004053..."
1,CAPZA3,"GO:0003779, GO:0005737, GO:0005829, GO:0005856..."
2,GPR17,"GO:0002862, GO:0004930, GO:0004950, GO:0005886..."
3,GPR78,"GO:0004930, GO:0005515, GO:0005886, GO:0007165..."
4,GRIK4,"GO:0005216, GO:0005886, GO:0006811, GO:0007215..."
5,PLEKHN1,"GO:0001666, GO:0001786, GO:0005515, GO:0005739..."
6,SPATA21,"GO:0005509, GO:0046872"
7,SYT6,"GO:0000149, GO:0005544, GO:0005737, GO:0005829..."
8,TMEM145,"GO:0007186, GO:0016020, GO:0019236"
9,TRIM31,"GO:0002376, GO:0005515, GO:0005737, GO:0005739..."


In [11]:
def fetch_go_name(go_id):
    url = f"https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{go_id}"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        results = response.json()
        return results['results'][0]['name']
    except Exception as e:
        print(f"❌ Error for {go_id}: {e}")
        return None

In [12]:
go_name_cache = {}

def cached_fetch_go_name(go_id):
    if go_id in go_name_cache:
        return go_name_cache[go_id]
    name = fetch_go_name(go_id)
    go_name_cache[go_id] = name
    return name

tqdm.pandas(desc="🔍 Fetching GO Names")
matched_pathways['Pathway_Name'] = matched_pathways['Pathway'].progress_apply(cached_fetch_go_name)


🔍 Fetching GO Names: 100%|██████████| 144/144 [01:23<00:00,  1.73it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_pathways['Pathway_Name'] = matched_pathways['Pathway'].progress_apply(cached_fetch_go_name)


In [13]:
matched_pathways_clean = matched_pathways.dropna(subset=["Pathway_Name"])

In [14]:
# Combine multiple rows per gene into one line
grouped_pathways = matched_pathways_clean.groupby('Gene')['Pathway_Name'].apply(lambda x: ', '.join(sorted(set(x)))).reset_index()

# Ensure full column display in notebook
pd.set_option('display.max_colwidth', None)

# Then display the full grouped data again
display(Markdown("### 🧬 Genes with All Associated Pathways (Full View)"))
display(grouped_pathways.head(1000))

### 🧬 Genes with All Associated Pathways (Full View)

Unnamed: 0,Gene,Pathway_Name
0,ARG1,"adaptive immune response, arginase activity, arginine catabolic process, arginine metabolic process, azurophil granule lumen, cytoplasm, cytosol, defense response to protozoan, extracellular region, extracellular space, hydrolase activity, hydrolase activity, acting on carbon-nitrogen (but not peptide) bonds, in linear amidines, immune system process, innate immune response, manganese ion binding, metal ion binding, negative regulation of T cell proliferation, negative regulation of T-helper 2 cell cytokine production, negative regulation of activated T cell proliferation, negative regulation of type II interferon-mediated signaling pathway, obsolete arginine catabolic process to ornithine, positive regulation of neutrophil mediated killing of fungus, protein binding, response to nematode, specific granule lumen, urea cycle"
1,CAPZA3,"F-actin capping protein complex, actin binding, actin cytoskeleton organization, actin filament binding, actin filament capping, actin filament organization, barbed-end actin filament capping, cortical cytoskeleton, cytoplasm, cytoplasm organization, cytoskeleton, cytosol, membrane, sperm head, spermatid development"
2,GPR17,"G protein-coupled receptor activity, G protein-coupled receptor signaling pathway, chemokine receptor activity, chemokine-mediated signaling pathway, membrane, negative regulation of inflammatory response to antigenic stimulus, oligodendrocyte differentiation, plasma membrane, receptor serine/threonine kinase binding, signal transduction"
3,GPR78,"G protein-coupled receptor activity, G protein-coupled receptor signaling pathway, adenylate cyclase-activating G protein-coupled receptor signaling pathway, membrane, plasma membrane, protein binding, signal transduction"
4,GRIK4,"cell projection, chemical synaptic transmission, glutamate receptor signaling pathway, hippocampal mossy fiber to CA3 synapse, ionotropic glutamate receptor signaling pathway, kainate selective glutamate receptor activity, kainate selective glutamate receptor complex, ligand-gated monoatomic ion channel activity, ligand-gated monoatomic ion channel activity involved in regulation of presynaptic membrane potential, membrane, modulation of chemical synaptic transmission, monoatomic ion channel activity, monoatomic ion transmembrane transport, monoatomic ion transport, plasma membrane, postsynaptic density membrane, postsynaptic membrane, presynaptic membrane, regulation of postsynaptic membrane potential, regulation of presynaptic membrane potential, signaling receptor activity, synapse, synaptic transmission, glutamatergic, transmitter-gated monoatomic ion channel activity involved in regulation of postsynaptic membrane potential"
5,PLEKHN1,"3'-UTR-mediated mRNA destabilization, cardiolipin binding, cytoskeleton, cytosol, membrane, mitochondrial membrane, mitochondrion, phosphatidic acid binding, phosphatidylinositol phosphate binding, phosphatidylserine binding, plasma membrane, positive regulation of apoptotic process, protein binding, response to hypoxia"
6,SPATA21,"calcium ion binding, metal ion binding"
7,SYT6,"SNARE binding, acrosomal vesicle exocytosis, calcium ion sensor activity, calcium-dependent phospholipid binding, chemical synaptic transmission, clathrin binding, cytoplasm, cytoplasmic side of plasma membrane, cytoplasmic vesicle, cytosol, exocytic vesicle, membrane, metal ion binding, perinuclear endoplasmic reticulum, plasma membrane, protein homodimerization activity, regulation of calcium ion-dependent exocytosis, synapse, synaptic vesicle membrane, syntaxin binding, vesicle-mediated transport"
8,TMEM145,"G protein-coupled receptor signaling pathway, membrane, response to pheromone"
9,TRIM31,"antiviral innate immune response, cytoplasm, cytosol, defense response to virus, host-mediated suppression of symbiont invasion, immune system process, inflammatory response, innate immune response, metal ion binding, mitochondrion, negative regulation of NLRP3 inflammasome complex assembly, negative regulation of viral transcription, positive regulation of DNA-templated transcription, protein K48-linked ubiquitination, protein K63-linked ubiquitination, protein binding, protein ubiquitination, transferase activity, ubiquitin protein ligase activity, ubiquitin-dependent protein catabolic process, viral release from host cell, zinc ion binding"
