## Export Design Matrix and Aligned Expression Matrix 

In [1]:
# Option 1: Extract expression matrix from Series Matrix file (if available)
# Download the series matrix file GSE288708_series_matrix.txt.gz from the GEO FTP mirror.
import os

import requests

url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE288nnn/GSE288708/matrix/GSE288708_series_matrix.txt.gz"
output_path = "../data/raw/GSE288708_series_matrix.txt.gz"

# Create the target directory up front so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Without a timeout requests waits indefinitely on a stalled connection,
# which would hang the kernel on this cell.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    # Binary mode: the payload is a gzip archive and must be written unchanged.
    with open(output_path, "wb") as f:
        f.write(response.content)
    print("‚úÖ Downloaded series matrix file.")
else:
    # Report the HTTP status instead of writing an error page to disk.
    print(f"‚ùå Failed to download. Status code: {response.status_code}")



‚úÖ Downloaded series matrix file.


In [2]:
# Export the expression table embedded in the series matrix file.

import gzip
import io
import os

import pandas as pd

# Path to the downloaded series matrix file
matrix_path = "../data/raw/GSE288708_series_matrix.txt.gz"

# Step 1: Collect the lines between the table markers in a single pass,
# so the archive is decompressed only once.
start, end = None, None
table_lines = []
with gzip.open(matrix_path, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if line.startswith("!series_matrix_table_begin"):
            start = i + 1  # 0-based line number of the table header row
        elif line.startswith("!series_matrix_table_end"):
            end = i
            break
        elif start is not None:
            table_lines.append(line)

# Fail with a clear message instead of a TypeError on `None` arithmetic.
if start is None or end is None:
    raise ValueError(
        f"Could not locate '!series_matrix_table_begin'/'!series_matrix_table_end' in {matrix_path}"
    )
if not table_lines:
    raise ValueError(f"The series matrix table in {matrix_path} has no header row")

# Step 2: Load the expression matrix section into a DataFrame
# (first line is the header, first column is the row identifier).
expr_df = pd.read_csv(
    io.StringIO("".join(table_lines)),
    sep="\t",
    index_col=0,
)

# Step 3: Clean column names (remove quotes if present)
expr_df.columns = expr_df.columns.str.replace('"', '')
expr_df.index.name = "Gene"

# A header-only table yields a frame with sample columns but zero rows;
# make that visible rather than silently saving an empty matrix.
if expr_df.empty:
    print("Warning: series matrix table has a header but no data rows; the saved expression matrix is empty.")

# Step 4: Save to processed directory
expr_df_path = "../data/processed/expression_matrix.csv"
os.makedirs(os.path.dirname(expr_df_path), exist_ok=True)
expr_df.to_csv(expr_df_path)

print(f"‚úÖ Expression matrix saved to: {expr_df_path}")
print(f"üß¨ Matrix shape: {expr_df.shape}")
expr_df.head()





‚úÖ Expression matrix saved to: ../data/processed/expression_matrix.csv
üß¨ Matrix shape: (0, 20)


Unnamed: 0_level_0,GSM8773456,GSM8773457,GSM8773458,GSM8773459,GSM8773460,GSM8773461,GSM8773462,GSM8773463,GSM8773464,GSM8773465,GSM8773466,GSM8773467,GSM8773468,GSM8773469,GSM8773470,GSM8773471,GSM8773472,GSM8773473,GSM8773474,GSM8773475
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1


In [3]:
# Fetch the combined DESeq2-ready count table (GEO supplementary file) and unpack it.

import urllib.request
import gzip
import shutil

# Local targets: the compressed download and the CSV extracted from it
gz_path = "../data/raw/GSE288708_all_sample_deseq.csv.gz"
csv_path = "../data/raw/GSE288708_all_sample_deseq.csv"

# Supplementary file location, as listed on the GEO page
url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE288nnn/GSE288708/suppl/GSE288708_all_sample_deseq.csv.gz"

# Retrieve the archive to disk
urllib.request.urlretrieve(url, gz_path)

# Decompress it next to the download; both handles are closed on exit
with gzip.open(gz_path, 'rb') as compressed, open(csv_path, 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

print("File downloaded and extracted to:", csv_path)
print(csv_path)


File downloaded and extracted to: ../data/raw/GSE288708_all_sample_deseq.csv
../data/raw/GSE288708_all_sample_deseq.csv


In [4]:
# Load the extracted count table with its first column (gene id) as the row index,
# save it as the processed expression matrix, and display summary statistics.
import os

import pandas as pd

# `csv_path` is set by the download cell above.
expr_df = pd.read_csv(csv_path, index_col=0)
print(expr_df.shape)
print(expr_df)

# Make sure the output directory exists before writing into it.
os.makedirs("../data/processed", exist_ok=True)
expr_df.to_csv("../data/processed/expression_matrix1.csv")

# Kept as the cell's final expression so the notebook renders the summary;
# evaluated mid-cell, its result was silently discarded.
expr_df.describe()

(62266, 20)
              CH1   CH2   CH3   CH4   CH5   CN1   CN2   CN3   CN4   CN5  IRH1  \
gene                                                                            
5_8S_rRNA       0     0     0     0     0     0     0     0     0     0     0   
5_8S_rRNA_2     0     0     0     0     0     0     0     0     0     0     0   
5_8S_rRNA_3     0     0     0     0     0     0     0     0     0     0     0   
5_8S_rRNA_4     0     0     0     0     0     0     0     0     0     0     0   
5_8S_rRNA_6     0     0     0     0     0     0     0     0     0     0     0   
...           ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   
ZYG11B       2468  2442  2464  2669  2545  2975  2606  2405  2394  2514  2558   
ZYX          4154  4265  4635  5529  3952  6375  6708  6581  5993  6149  4772   
ZYXP1           0     0     0     0     0     0     0     0     0     0     0   
ZZEF1        1735  1312  1382  1291  1352  1522  1835  1651  1512  1763  1653   
ZZZ3         103

In [5]:

# Align the sample metadata with the expression matrix to produce the design matrix
# and the aligned expression matrix used for downstream DE analysis.
#
# Inputs : sample_metadata.csv (one row per sample) and expression_matrix1.csv
#          (genes x samples, columns named by short codes such as "CH1").
# Outputs: expression_matrix_aligned.csv (columns renamed to GSM ids, ordered like
#          the metadata) and design_matrix.csv.

import pandas as pd
import os

# Paths (adjust if needed)
meta_path = "../data/processed/sample_metadata.csv"
expr_path = "../data/processed/expression_matrix1.csv"   # written by the previous cell
out_expr_path = "../data/processed/expression_matrix_aligned.csv"
out_design_path = "../data/processed/design_matrix.csv"

# 1) Load files
meta = pd.read_csv(meta_path)
expr = pd.read_csv(expr_path, index_col=0)

# Quick sanity
print("metadata shape:", meta.shape)
print("expr shape:", expr.shape)

# Fail early with a readable message if the metadata lacks a column used below.
required_meta_cols = ['Sample', 'Title', 'Treatment', 'Oxygen', 'Replicate', 'Group']
missing_meta_cols = [c for c in required_meta_cols if c not in meta.columns]
if missing_meta_cols:
    raise ValueError(f"Metadata file {meta_path} is missing required columns: {missing_meta_cols}")

# Duplicate GSM ids would produce duplicated columns after the reorder in step 5.
if meta['Sample'].duplicated().any():
    dup_samples = meta.loc[meta['Sample'].duplicated(), 'Sample'].tolist()
    raise ValueError(f"Duplicate sample ids detected in metadata: {dup_samples}")

print("expr columns (first 10):", expr.columns.tolist()[:10])
print("meta samples (first 10):", meta['Sample'].tolist()[:10])

# 2) Build code -> GSM mapping
# Title is expected to start with the short code, e.g. "CH1 - Control Hypoxia" or "IRN2 - ...".
# The short code is the first whitespace-separated token of Title.
meta['ShortCode'] = meta['Title'].astype(str).str.split().str[0]
# An empty/whitespace-only Title has no first token and yields NaN here.
if meta['ShortCode'].isna().any():
    bad_samples = meta.loc[meta['ShortCode'].isna(), 'Sample'].tolist()
    raise ValueError(f"Could not parse a short code from 'Title' for samples: {bad_samples}")
print("New column added to sample meta data i-e ShortCode:",meta.columns.tolist())
print(meta)

# Verify uniqueness
if meta['ShortCode'].duplicated().any():
    raise ValueError("Duplicate short codes detected in metadata! Check 'Title' parsing.")

# Build dict: short code -> GSM
code2gsm = dict(zip(meta['ShortCode'], meta['Sample']))
print("Example mapping (first 10):", list(code2gsm.items())[:10])
print("Example mapping in the form of dictionary:", code2gsm)

# 3) Check for missing/extra columns in expr
expr_cols = list(expr.columns.astype(str))
print("Columns in the expression matrix:", expr_cols)
codes_in_expr = set(expr_cols)
print("Columns in the expression matrix (set):", codes_in_expr)
codes_in_meta = set(code2gsm.keys())
print("Elements in the Short Code column in sample_metadata (set)/ keys of the dictionary:",codes_in_meta)

missing_in_expr = codes_in_meta - codes_in_expr
extra_in_expr = codes_in_expr - codes_in_meta

print("codes in metadata but NOT in expression matrix:", missing_in_expr)
print("codes in expression matrix but NOT in metadata (extra):", extra_in_expr)

# Extra expression columns are only reported; they are dropped by the reorder in step 5.
if missing_in_expr:
    raise ValueError(f"These codes are in metadata but not in expression matrix: {missing_in_expr}")

# 4) Rename expr columns from short code -> GSM IDs
rename_map = {code: code2gsm[code] for code in expr_cols if code in code2gsm}
print(rename_map)
expr_renamed = expr.rename(columns=rename_map)

# 5) Reorder columns to match meta['Sample'] exactly
# ensure every Sample in meta exists now as a column
meta_samples = meta['Sample'].tolist()
missing_cols_after_rename = [s for s in meta_samples if s not in expr_renamed.columns]
if missing_cols_after_rename:
    raise ValueError(f"The following GSMs from metadata are missing in expression after rename: {missing_cols_after_rename}")

expr_aligned = expr_renamed[meta_samples]

# quick checks
assert list(expr_aligned.columns) == meta_samples, "Column order mismatch after alignment!"
print("Aligned expression shape:", expr_aligned.shape)

# 6) Save outputs
os.makedirs(os.path.dirname(out_expr_path), exist_ok=True)
os.makedirs(os.path.dirname(out_design_path), exist_ok=True)
expr_aligned.to_csv(out_expr_path)
meta[['Sample','Treatment','Oxygen','Replicate','Group']].to_csv(out_design_path, index=False)

print("‚úÖ Saved aligned expression to:", out_expr_path)
print("‚úÖ Saved design matrix to:", out_design_path)

# 7) Read both files back and verify the on-disk alignment
e = pd.read_csv(out_expr_path, index_col=0)
d = pd.read_csv(out_design_path, index_col=0)

print("First few samples in design matrix:", d.index[:5])
print("First few columns in expression:", e.columns[:5])

print(e.shape, d.shape)
print(list(e.columns)[:5], list(d.index)[:5])

# Enforce the invariant downstream DE analysis relies on, rather than only printing it:
# expression columns and design-matrix rows must list the same samples in the same order.
if list(e.columns) != [str(s) for s in d.index]:
    raise ValueError("Saved expression columns do not match the design matrix sample order.")

print(expr_aligned)
print(f"‚úÖ Saved aligned expression: {out_expr_path} ({expr_aligned.shape[0]} genes √ó {expr_aligned.shape[1]} samples)")



metadata shape: (20, 6)
expr shape: (62266, 20)
expr columns (first 10): ['CH1', 'CH2', 'CH3', 'CH4', 'CH5', 'CN1', 'CN2', 'CN3', 'CN4', 'CN5']
meta samples (first 10): ['GSM8773456', 'GSM8773457', 'GSM8773458', 'GSM8773459', 'GSM8773460', 'GSM8773461', 'GSM8773462', 'GSM8773463', 'GSM8773464', 'GSM8773465']
New column added to sample meta data i-e ShortCode: ['Sample', 'Title', 'Treatment', 'Oxygen', 'Replicate', 'Group', 'ShortCode']
        Sample                              Title          Treatment  \
0   GSM8773456              CH1 - Control Hypoxia            Control   
1   GSM8773457              CH2 - Control Hypoxia            Control   
2   GSM8773458              CH3 - Control Hypoxia            Control   
3   GSM8773459              CH4 - Control Hypoxia            Control   
4   GSM8773460              CH5 - Control Hypoxia            Control   
5   GSM8773461             CN1 - Control Normoxia            Control   
6   GSM8773462             CN2 - Control Normoxia       

In [6]:
import pandas as pd
import os

# ===== Paths =====
# Forward slashes resolve on Windows, macOS and Linux and match the other cells.
# With backslashes, POSIX treats the whole string as a single file name, so
# os.path.dirname() returns '' and os.makedirs('') fails.
input_path = "../data/processed/expression_matrix_aligned.csv"
output_path = "../data/processed/expression_matrix_cleaned.csv"

# ===== Step 1 - Load expression matrix =====
print(f"üìÇ Loading expression matrix from: {input_path}")
df = pd.read_csv(input_path, index_col=0)  # Keep gene names as index
print(f"‚úÖ Original shape: {df.shape}")

# ===== Step 2 - Remove rows (genes) with any zero values =====
# NOTE(review): this discards every gene with a zero count in at least one sample
# (62266 -> 17571 rows in the recorded run); confirm that such a strict filter is
# intended for the downstream analysis.
df = df[(df != 0).all(axis=1)]

# No column filter is needed afterwards: once every remaining row is free of
# zeros, no column can contain one, so such a filter would keep all columns.

print(f"üßπ Cleaned shape: {df.shape}")

# ===== Step 3 - Save cleaned matrix =====
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path)

print(f"üíæ Cleaned expression matrix saved to: {output_path}")


üìÇ Loading expression matrix from: ..\data\processed\expression_matrix_aligned.csv
‚úÖ Original shape: (62266, 20)
üßπ Cleaned shape: (17571, 20)
üíæ Cleaned expression matrix saved to: ..\data\processed\expression_matrix_cleaned.csv


In [7]:
import pandas as pd

# Sanity check: confirm the cleaned matrix on disk contains no zero counts.
# Portable forward-slash path; index_col=0 keeps the gene-id column out of the
# comparison so only the count columns are tested.
df = pd.read_csv("../data/processed/expression_matrix_cleaned.csv", index_col=0)
if (df == 0).any().any():
    print("Has zeros")
else:
    print("No zeros")


No zeros
