In [1]:
import pandas as pd
import gzip
import os

project_dir = '/Users/mrrobot/Projects_/EDA_dashboard_GEO_dataset/data'
data_dir = os.path.join(project_dir, 'raw')
count_matrix_file = os.path.join(data_dir, 'GSE99254_NSCLC.TCell.S12346.count.txt.gz')
counts_df = None


  from pandas.core import (


In [2]:
try:
    with gzip.open(count_matrix_file, 'rt') as f:
        temp_df = pd.read_csv(f, sep='\t')
        if 'symbol' in temp_df.columns:
            counts_df = temp_df.set_index('symbol')
            # Assuming the first column after 'symbol' (originally the first actual column) might be 'geneID'
            # Or if 'geneID' was the first column name in the file and 'symbol' the second:
            if 'geneID' in counts_df.columns and counts_df.columns[0] == 'geneID':
                 counts_df = counts_df.drop(columns=['geneID'])
            elif counts_df.index.name == 'geneID': # If geneID became index name from first col
                 pass # No action needed if 'symbol' successfully became index
            print("Successfully loaded count matrix with 'symbol' as index:")
            print(f"Shape: {counts_df.shape}")
            # print(counts_df.iloc[:5, :5]) # Show top-left corner
        else:
            # Fallback if 'symbol' isn't a column, maybe first column is symbol, second is geneID
            # This part might need adjustment based on actual file structure if 'symbol' isn't a header
            counts_df = pd.read_csv(count_matrix_file, sep='\t', index_col=0, compression='gzip')
            print("Loaded count matrix (assuming first column is gene symbol index):")
            print(f"Shape: {counts_df.shape}")
            # print(counts_df.iloc[:5, :5])


except FileNotFoundError:
    print(f"Error: Count matrix file not found at {count_matrix_file}")
except Exception as e:
    print(f"An error occurred loading count matrix: {e}")

Successfully loaded count matrix with 'symbol' as index:
Shape: (23459, 12346)


This is the most crucial step to unlock further analysis with what you have. The cell IDs in your counts_df columns (e.g., NTH10-0616A, TTC1-P0617) contain prefixes that indicate the sample type.

NTH: Normal Tissue (adjacent non-tumor, CD4+ T helper like)
NTC: Normal Tissue (adjacent non-tumor, CD8+ T cells)
TTC: Tumor Tissue (CD8+ T cells)
TTH: Tumor Tissue (CD4+ T helper like - assuming this exists, or a similar tumor CD4 prefix)
PTC: Peripheral Blood (CD8+ T cells)
PTH: Peripheral Blood (CD4+ T helper like)
(We might also see NTY, TTY, PTY for regulatory T cells based on initial GEO description)


In [3]:
if counts_df is not None:
    cell_ids = counts_df.columns.tolist()
    
    # Create a basic metadata dataframe
    basic_metadata_df = pd.DataFrame(index=cell_ids)
    
    # Extract sample type prefix (first 3 characters)
    basic_metadata_df['sample_type_prefix'] = [cell_id[:3] for cell_id in cell_ids]
    
    # Map prefixes to meaningful tissue origins
    # This mapping might need refinement based on exact prefixes in your cell IDs
    # and the GEO page description of sample types
    prefix_to_origin = {
    'NTH': 'Normal_Lung',      # Normal T-helper (CD4+)
    'NTC': 'Normal_Lung',      # Normal CD8+
    'NTY': 'Normal_Lung',      # Normal Treg (likely CD4+CD25+)
    'NTR': 'Normal_Lung',      # Normal Treg (alternative or similar to NTY)
    'NTS': 'Normal_Lung',      # Normal T-cell (Stimulated or other specific sort)
    'TTC': 'Tumor_Lung',       # Tumor CD8+
    'TTH': 'Tumor_Lung',       # Tumor T-helper (CD4+)
    'TTY': 'Tumor_Lung',       # Tumor Treg (likely CD4+CD25+)
    'TTR': 'Tumor_Lung',       # Tumor Treg (alternative or similar to TTY)
    'TTS': 'Tumor_Lung',       # Tumor T-cell (Stimulated or other specific sort)
    'PTC': 'Peripheral_Blood', # Peripheral CD8+
    'PTH': 'Peripheral_Blood', # Peripheral T-helper (CD4+)
    'PTY': 'Peripheral_Blood', # Peripheral Treg (likely CD4+CD25+)
    'PTR': 'Peripheral_Blood', # Peripheral Treg (alternative or similar to PTY)
    'PTS': 'Peripheral_Blood', # Peripheral T-cell (Stimulated or other specific sort)
    'T-C': 'Tumor_Lung'        # This is a bit ambiguous. Assuming "Tumor-Cell" or "Tumor-Cluster".
                               # If its origin is unclear after checking some cell IDs, 
                               # you might map it to 'Unknown' or investigate further.
                               # For now, if it's consistently from tumor samples, this is a starting point.
}
    basic_metadata_df['tissue_origin'] = basic_metadata_df['sample_type_prefix'].map(prefix_to_origin)
    
    # Extract patient ID (example: from 'NTH10-0616A', patient is '0616A' after the hyphen)
    # This is an assumption and might need adjustment based on all cell ID formats
    def extract_patient_id(cell_id):
        parts = cell_id.split('-')
        if len(parts) > 1:
            return parts[-1] # Takes the part after the last hyphen
        return None # Or some default
    # A more robust way might be to get the part after the first hyphen or based on a pattern
    # For Pxxxx IDs like in some of your cell names, we might need another rule.
    # For now, let's make a placeholder for patient ID.
    # A fuller solution requires inspecting all unique cell ID patterns.
    # The GEO page mentioned Patient IDs like P0617. These are embedded in cell names like 'TTC1-P0617'
    
    patient_ids = []
    for cell_id in cell_ids:
        if '-P' in cell_id: # For IDs like TTC1-P0617
            patient_ids.append(cell_id.split('-P')[1])
        elif '-' in cell_id: # For IDs like NTH10-0616A
            patient_ids.append(cell_id.split('-',1)[1]) # content after first hyphen
        else:
            patient_ids.append('Unknown')
    basic_metadata_df['patient_id'] = patient_ids


    print("\nBasic Per-Cell Metadata DataFrame created from cell IDs:")
    print(f"Shape: {basic_metadata_df.shape}")
    print(basic_metadata_df.head())
    print("\nUnique tissue origins found:")
    print(basic_metadata_df['tissue_origin'].value_counts(dropna=False))
    print("\nUnique sample type prefixes found:")
    print(basic_metadata_df['sample_type_prefix'].value_counts(dropna=False))
else:
    print("counts_df is not loaded, skipping metadata creation.")


Basic Per-Cell Metadata DataFrame created from cell IDs:
Shape: (12346, 3)
            sample_type_prefix tissue_origin patient_id
NTH10-0616A                NTH   Normal_Lung      0616A
NTH11-0616A                NTH   Normal_Lung      0616A
NTH15-0616A                NTH   Normal_Lung      0616A
NTH17-0616A                NTH   Normal_Lung      0616A
NTH2-0616A                 NTH   Normal_Lung      0616A

Unique tissue origins found:
tissue_origin
Tumor_Lung          5971
Peripheral_Blood    4260
Normal_Lung         2115
Name: count, dtype: int64

Unique sample type prefixes found:
sample_type_prefix
TTC    2182
TTH    1591
PTC    1323
PTH    1254
TTR    1100
NTC     934
TTY     892
PTR     849
PTY     672
NTH     655
NTY     238
PTS     162
NTR     149
NTS     139
T-C     136
TTS      70
Name: count, dtype: int64


In [4]:
# Define paths for processed data
processed_data_dir = os.path.join(project_dir, 'data', 'processed')
if not os.path.exists(processed_data_dir):
    os.makedirs(processed_data_dir) # Create directory if it doesn't exist

if counts_df is not None:
    counts_df.to_csv(os.path.join(processed_data_dir, 'counts_matrix_raw.csv.gz'), compression='gzip')
    print(f"\nSaved raw counts_df to {os.path.join(processed_data_dir, 'counts_matrix_raw.csv.gz')}")

if basic_metadata_df is not None:
    basic_metadata_df.to_csv(os.path.join(processed_data_dir, 'cell_metadata_basic.csv'))
    print(f"Saved basic_metadata_df to {os.path.join(processed_data_dir, 'cell_metadata_basic.csv')}")



Saved raw counts_df to /Users/mrrobot/Projects_/EDA_dashboard_GEO_dataset/data/data/processed/counts_matrix_raw.csv.gz
Saved basic_metadata_df to /Users/mrrobot/Projects_/EDA_dashboard_GEO_dataset/data/data/processed/cell_metadata_basic.csv
