In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os

# --- Step 1: Data Loading ---
# Load the dataset from the CSV file into `df`; later steps select columns
# from it by name ('ID_REF', 'REFERENCE_SAMPLE*', 'Detection Pval*').
try:
    df = pd.read_csv('sample.csv')
except FileNotFoundError:
    print("Error: 'sample.csv' not found. Please make sure the file is in the correct directory.")
    # Abort with a non-zero status. `exit()` is a helper installed by the
    # `site` module for interactive sessions: it is not guaranteed to exist,
    # and it reports success (status 0) even though loading failed.
    raise SystemExit(1) from None

# Print the initial shape of the dataframe.
print(f"Initial dataframe shape: {df.shape}")
print("-" * 40)

# --- Step 2: Separate Methylation Values and P-values ---
# Split the raw table into two frames that share the same row index (the
# 'ID_REF' column of `df`), so they can be aligned row-by-row afterwards.

# Methylation values (beta-values): every column whose name starts with
# 'REFERENCE_SAMPLE'.
methylation_df = df.filter(regex='^REFERENCE_SAMPLE').set_index(df['ID_REF'])

# Detection p-values: every column whose name starts with 'Detection Pval'.
pvalue_df = df.filter(regex='^Detection Pval').set_index(df['ID_REF'])

print(f"Shape of methylation data: {methylation_df.shape}")
print(f"Shape of p-value data: {pvalue_df.shape}")
print("-" * 40)

# --- Step 3: Filter Unreliable Probes ---
# Build a boolean mask over probes (rows): True when at least one column of
# the p-value table exceeds the 0.05 threshold for that probe.
unreliable_probes = pvalue_df.gt(0.05).any(axis=1)

# Summing a boolean mask counts its True entries.
num_unreliable = unreliable_probes.sum()
print(f"Number of unreliable probes to be filtered (p-value > 0.05): {num_unreliable}")

# Keep only the rows of the methylation table that were not flagged.
filtered_methylation_df = methylation_df.loc[~unreliable_probes]
print(f"Shape of filtered methylation data: {filtered_methylation_df.shape}")
print("-" * 40)

# --- Step 4: Transpose the Data ---
# Flip rows and columns so that each row is a sample and each column is a
# probe, which is the (n_samples, n_features) layout the model below is
# trained on.
X = filtered_methylation_df.transpose()

print(f"Shape of the features matrix after transposition (X): {X.shape}")
print("-" * 40)

# --- Step 5: Handle Missing Values ---
# Impute remaining missing values with the median of each probe (column).
# The result is stored in `X_imputed`; `X` itself is left unchanged.
# NOTE: the medians are computed over all samples, i.e. before the
# train/test split performed later, so test samples influence the imputation.
missing_before = int(X.isnull().sum().sum())
if missing_before > 0:
    print(f"Number of missing values before imputation: {missing_before}")
    # A probe with no observed value has a NaN median, so fillna() would
    # leave it untouched and NaNs would reach the model. Such probes carry
    # no information, so drop them before imputing.
    X_imputed = X.dropna(axis=1, how='all')
    num_empty_probes = X.shape[1] - X_imputed.shape[1]
    if num_empty_probes:
        print(f"Dropped {num_empty_probes} probes with no observed values (cannot be imputed).")
    X_imputed = X_imputed.fillna(X_imputed.median())
    print(f"Number of missing values after imputation: {X_imputed.isnull().sum().sum()}")
else:
    print("No missing values found in the data.")
    X_imputed = X.copy()
print("-" * 40)


# --- Step 6: Load Labels and Prepare Final Dataset (Conceptual) ---
# NOTE: This section is conceptual. You need to provide a real labels file.
# Replace the dummy label construction below with loading your actual data.
#
# Only the label-preparation step sits inside the `try`: it is the only place
# where a labels file would be read, so it is the only place where
# FileNotFoundError should be reported as "labels file not found". Alignment
# and model training run in the `else` branch so that an unrelated error is
# never misreported as a missing labels file.
try:
    # This is a dummy labels table for demonstration: labels cycle through
    # 'Type_0', 'Type_1', 'Type_2' in sample order and carry no real signal.
    # In a real scenario, you would load the actual labels file provided with the dataset.
    sample_ids = X_imputed.index
    labels = [f'Type_{i % 3}' for i in range(len(sample_ids))]  # Dummy labels
    labels_df = pd.DataFrame({'SampleID': sample_ids, 'SarcomaType': labels})
    labels_df.set_index('SampleID', inplace=True)
except FileNotFoundError:
    print("Warning: Labels file not found. Skipping model training.")
    print("Please provide a file with sample labels to proceed with model building.")
else:
    # Align the features (X) and labels (y) based on sample IDs.
    y = labels_df.loc[X_imputed.index, 'SarcomaType']

    print(f"Shape of the final features matrix (X): {X_imputed.shape}")
    print(f"Shape of the labels vector (y): {y.shape}")
    print("-" * 40)

    # --- Step 7: Train a Simple Classification Model ---
    # This is an example of what you can do next with the prepared data.
    print("Training a simple Random Forest Classifier model...")

    # Hold out 20% of the samples for testing; the fixed random_state makes
    # the split reproducible.
    # NOTE(review): with real class labels, consider passing stratify=y so
    # both splits keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

    # Initialize and train the model.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions and evaluate the model on the held-out samples.
    # With the dummy labels above, the accuracy is not meaningful.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("Model training complete.")
    print(f"Model Accuracy on test set: {accuracy:.2f}")


Initial dataframe shape: (1000, 971)
----------------------------------------
Shape of methylation data: (1000, 485)
Shape of p-value data: (1000, 485)
----------------------------------------
Number of unreliable probes to be filtered (p-value > 0.05): 0
Shape of filtered methylation data: (1000, 485)
----------------------------------------
Shape of the features matrix after transposition (X): (485, 1000)
----------------------------------------
Number of missing values before imputation: 10
Number of missing values after imputation: 0
----------------------------------------
Shape of the final features matrix (X): (485, 1000)
Shape of the labels vector (y): (485,)
----------------------------------------
Training a simple Random Forest Classifier model...
Model training complete.
Model Accuracy on test set: 0.24
