<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [3]</a>'.</span>

In [1]:
# Parameters
input_file = "analysis_results/counts/gene_counts.txt"
output_file = "analysis_results/scaled_count_selected_samples.csv"


In [2]:
from email.quoprimime import header_check
from numba.cuda.printimpl import print_item
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd



<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [3]:

# Load featureCounts output properly
counts = pd.read_csv(
    "gene_counts.txt",
    sep="\t",              # columns are tab-separated
    comment="#",           # skip the long featureCounts command line
    index_col=0            # Geneid column becomes index
)

# Drop the metadata columns (Chr, Start, End, Strand, Length)
counts = counts.drop(columns=["Chr", "Start", "End", "Strand", "Length"])

# Transpose so samples are rows and genes are columns
X = counts.T

# Ensure numeric
X = X.apply(pd.to_numeric, errors="coerce").fillna(0)

print("Data shape:", X.shape)
print(X.head())
X.index = X.index.str.replace('_sorted.bam', '', regex=False)
print("\nAfter cleanup:")
print(X.head(2))
counts = X


FileNotFoundError: [Errno 2] No such file or directory: 'gene_counts.txt'

In [None]:

# Step 1: library size normalization (Counts Per Million - CPM)
counts_cpm = counts.div(counts.sum(axis=1), axis=0) * 1e6

# Step 2: log-transform to reduce skew
counts_log = np.log2(counts_cpm + 1)

# Save normalized version
counts_log.to_excel("gene_counts_normalized.xlsx")

print("✅ Normalized using log2(CPM + 1)")
print("✅ Exported successfully to counts/gene_counts_matrix.xlsx")
# Export the transposed count matrix (samples × genes)
counts.to_excel("gene_counts_matrix.xlsx")

print("✅ Exported successfully to counts/gene_counts_matrix.xlsx")

scaler = StandardScaler()
scaled_count = pd.DataFrame(scaler.fit_transform(counts_log),
                        index=counts_log.index,
                        columns=counts_log.columns)





In [None]:
print(scaled_count.head())


In [None]:
print(scaled_count.columns.to_list)


In [None]:
sample_ids = ['STM2739', 'STM2714', 'PSLT053', 'STM2697', 'STM2701', 'STM2728', 'STM2738', 'STM2236', 'STM2729', 'STM2718', 'STM2721', 'STM2723', 'STM2737', 'PSLT054', 'STM2706', 'STM2594', 'STM2715', 'PSLT052', 'STM2707', 'PSLT055', 'STM2239', 'STM2736', 'STM2439', 'STM2730', 'STM2695', 'STM2708', 'PSLT027', 'STM0294', 'STM2696', 'STM2606', 'PSLT061', 'STM1009', 'STM2722', 'STM2699', 'STM1032', 'PSLT102', 'STM4115', 'STM4524', 'STM2151', 'STM2240', 'STM2710', 'PSLT103', 'STM2631', 'PSLT067', 'STM3780', 'PSLT011', 'STM0561', 'PSLT108', 'STM1026', 'PSLT047', 'STM2598', 'STM2232', 'STM2087', 'STM3517', 'STM0762', 'STM4427', 'STM1023', 'STM2088', 'STM1051', 'STM2005', 'PSLT015', 'STM3697', 'STM0518', 'STM3343', 'STM2726', 'PSLT007', 'STM0331', 'STM4113', 'STM4204', 'STM0571', 'STM4210', 'PSLT031', 'STM2752', 'STM4216', 'STM2587', 'PSLT064', 'STM0574', 'STM4199', 'STM2732', 'STM0329', 'STM2632', 'STM1332', 'PSLT075', 'STM1050', 'STM1029', 'STM2091', 'PSLT092', 'STM2753', 'STM4033', 'PSLT081', 'PSLT089', 'STM0912', 'STM2635', 'PSLT077', 'STM2759', 'PSLT004', 'STM3522', 'STM3637', 'STM0554', 'STM2719', 'PSLT046', 'STM0291', 'STM3277', 'PSLT056', 'STM4112', 'STM2086', 'STM3169', 'STM4433', 'PSLT069', 'STM4205', 'STM2703', 'STM0530', 'PSLT017', 'STM0916', 'STM2233', 'STM1240', 'STM0676', 'STM4417', 'STM0195', 'STM4157', 'PSLT036', 'STM0332', 'PSLT033', 'PSLT100', 'STM4202', 'STM2092', 'STM2705', 'STM2614', 'STM2008', 'STM2608', 'PSLT110', 'PSLT028', 'STM3080', 'PSLT073', 'STM0922', 'PSLT057', 'STM0527', 'STM2085', 'STM4110', 'STM0556', 'STM0572', 'STM1010', 'STM3639', 'STM0197', 'STM1376', 'STM0323', 'PSLT021', 'STM3782', 'STM2716', 'STM3291', 'STM2097', 'STM4201', 'PSLT003', 'STM4421', 'STM0927', 'PSLT051', 'PSLT095', 'STM2605', 'STM1025', 'PSLT050', 'STM0659', 'PSLT079', 'PSLT072', 'STM3784', 'PSLT098', 'STM3783', 'STM3035', 'STM4214', 'PSLT091', 'STM2702', 'PSLT060', 'STM2094', 'STM3519', 'STM2000', 'PSLT016', 'STM2591', 'STM0198', 'STM0658', 'STM2735', 'PSLT063', 'PSLT094', 'STM0918', 'STM3079', 'STM2903', 'PSLT080', 'STM2764', 'STM4213', 'STM1048', 'STM2616', 'PSLT045', 'PSLT049', 'STM0893', 'STM1030', 'STM3653', 'PSLT087', 'STM4420', 'STM0573', 'PSLT012', 'STM0055', 'STM0559']

In [None]:
# Filter scaled_count to keep only columns in your list
scaled_count_filtered = scaled_count.loc[:, scaled_count.columns.intersection(sample_ids)]

# Save to CSV
scaled_count_filtered.to_csv("scaled_count_selected_samples.csv")
print(f"✅ Saved {scaled_count_filtered.shape[1]} columns from your list.")


In [None]:
scaled_count_filtered.columns.to_list()

In [None]:
scaled_count_filtered.head()