In [None]:
# These libraries are usually pre-installed in Colab, but just in case
!pip install seaborn matplotlib pandas
!pip install mygene  # Install the mygene package

In [None]:
from google.colab import files
import pandas as pd

# Ask for the expression table and take the name of the first uploaded entry
uploaded = files.upload()
filename = next(iter(uploaded))

# The table is tab-separated and UTF-16 encoded. Column labels are stripped of
# surrounding whitespace and of any stray BOM character (which could otherwise
# end up glued to 'ENSEMBL_ID'); ENSEMBL_ID then becomes the row index.
df = (
    pd.read_csv(filename, sep='\t', encoding='utf-16')
    .rename(columns=lambda label: label.strip().replace('\ufeff', ''))
    .set_index('ENSEMBL_ID')
)

# Preview
df.head()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
from biothings_client import get_client

# 1. Upload file
# NOTE(review): this repeats the upload/read steps of the previous cell, so a
# full run of the notebook prompts for the file twice.
uploaded = files.upload()
filename = next(iter(uploaded))

# 2. Read with correct encoding, clean the column labels (whitespace / stray BOM)
#    and index the rows by ENSEMBL_ID
df = (
    pd.read_csv(filename, sep='\t', encoding='utf-16')
    .rename(columns=lambda label: label.strip().replace('\ufeff', ''))
    .set_index('ENSEMBL_ID')
)

# 3. Define sample columns (order here is the column order of the heatmap)
all_samples = [
    'Control_1', 'Control_2', 'Control_3',
    'SOX2KO_1', 'SOX2KO_2', 'SOX2KO_3',
]

# 4. Your gene list (SYMBOLS)
genes_of_interest = [
    'SOX2', 'FOXA1', 'AR', 'CDK14', 'CDK1', 'ROR1', 'ROR2', 'WNT5A',
    'WNT2', 'SEMA3C', 'FGF5', 'IGF1', 'FGFR2', 'NKX2-1', 'NGF', 'ASCL1',
    'ASCL2', 'POU3F2', 'INSM1', 'FOXA2', 'CREB1', 'CREB5', 'DLL1', 'JAK1',
    'TLE1', 'NRARP', 'NOTCH1', 'ESRRG', 'COLCA1', 'POU2AF3', 'EHF', 'ERG',
]

# 5. Convert to Ensembl IDs: client used for the symbol -> Ensembl ID lookup
mg = get_client('gene')

def get_ensembl_ids(gene_names, client=None):
    """Map gene symbols to Ensembl gene IDs.

    Parameters
    ----------
    gene_names : iterable of str
        Gene symbols to look up (queried with scopes='symbol', species='human').
    client : object, optional
        Any object exposing ``querymany(names, scopes=..., fields=..., species=...)``.
        Defaults to the notebook-level ``mg`` client.

    Returns
    -------
    dict
        ``{symbol: ensembl_gene_id}``. Symbols whose hits carry no usable
        'ensembl' annotation are omitted. When the annotation lists several
        Ensembl genes, the first one listed is used.
    """
    if client is None:
        client = mg  # notebook-level client, resolved at call time

    gene_info = client.querymany(gene_names, scopes='symbol', fields='ensembl.gene', species='human')

    ensembl_ids = {}
    for hit in gene_info:
        query = hit['query']
        # A symbol can come back with several hits. Keep the first usable one
        # instead of letting later hits overwrite it (presumably the service
        # returns the best match first — confirm against the client docs).
        if query in ensembl_ids:
            continue

        annotation = hit.get('ensembl')
        if isinstance(annotation, list):
            # Several Ensembl genes for one hit: take the first, tolerate an empty list.
            annotation = annotation[0] if annotation else None
        if isinstance(annotation, dict) and 'gene' in annotation:
            ensembl_ids[query] = annotation['gene']
    return ensembl_ids

ensembl_ids_dict = get_ensembl_ids(genes_of_interest)
print(ensembl_ids_dict)

# 6. Filter matrix by valid Ensembl IDs
# Keep only the symbols whose Ensembl ID is actually a row of the matrix.
valid_genes = {symbol: eid for symbol, eid in ensembl_ids_dict.items() if eid in df.index}

# Say which requested genes will not appear instead of dropping them silently.
missing_genes = [symbol for symbol in genes_of_interest if symbol not in valid_genes]
if missing_genes:
    print(f"Not plotted (no Ensembl ID found, or ID absent from the matrix): {missing_genes}")
if not valid_genes:
    raise ValueError("None of the genes of interest matched a row of the expression matrix.")

# Rows are selected by Ensembl ID and then relabelled with the gene symbols;
# both lists come from the same dict, so their order matches.
expression_data = df.loc[list(valid_genes.values()), all_samples]
expression_data.index = list(valid_genes)

# 7. Normalize each row to its own min-max
row_min = expression_data.min(axis=1)
row_range = expression_data.max(axis=1) - row_min

# A gene with identical values in every sample has a zero range; its
# normalized values are undefined (NaN) and show up as blank heatmap cells.
flat_genes = row_range.index[row_range == 0].tolist()
if flat_genes:
    print(f"Constant across samples (drawn as blank cells): {flat_genes}")

normalized_data = expression_data.sub(row_min, axis=0).div(row_range.where(row_range != 0), axis=0)

# 8. Plot heatmap (normalized TPM per gene row)
# Grow the figure with the number of genes and force every row label on, so
# that no gene name is skipped to avoid overlapping tick labels.
fig, ax = plt.subplots(figsize=(8, max(5, 0.3 * len(normalized_data))))
sns.heatmap(
    normalized_data,
    annot=False,
    cmap='RdBu_r',
    cbar_kws={'label': 'Relative Expression (Min–Max)', 'ticks': []},
    xticklabels=all_samples,
    yticklabels=True,
    ax=ax,
)

# Draw the x-axis (sample) labels vertically to prevent overlap
plt.setp(ax.get_xticklabels(), rotation=90, ha='right')

# Remove tick labels from color bar
colorbar = ax.collections[0].colorbar
colorbar.ax.set_yticklabels([])  # No tick labels on color bar
colorbar.ax.tick_params(size=0)  # Remove tick marks

# Adjust aspect ratio to make cells square
ax.set_aspect('equal', adjustable='box')

# Italicize gene names on the y-axis
plt.setp(ax.get_yticklabels(), fontstyle='italic')

ax.set_title('CWR-R1 Normalized Gene Expression')
fig.tight_layout()

# Save and download
fig.savefig("normalized_heatmap.png", dpi=300, bbox_inches='tight')
plt.show()
#files.download("normalized_heatmap.png")