In [None]:
# Cell 1: Imports & Setup

# Installs necessary libraries and imports modules for data handling, modeling, and visualization.
# @title 1. Imports & Setup
# Install necessary library if not present (Colab usually has these, but good for robustness)
!pip install plotly xgboost scikit-learn pandas numpy

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.pipeline import Pipeline

# XGBoost is an optional dependency: record whether it could be imported in
# `xgboost_available` so later cells can decide whether to add the XGBoost model.
try:
    from xgboost import XGBClassifier
except ImportError:
    xgboost_available = False
    print("Warning: XGBoost not found. Skipping XGBoost model.")
else:
    xgboost_available = True
    print("XGBoost Library Loaded Successfully.")

XGBoost Library Loaded Successfully.


In [None]:
# Cell 2: Data Loading & Generation

# Loads gene_expression_data.csv. If the file is missing, it generates a synthetic dataset with biological signals (upregulated/downregulated genes).
# @title 2. Load Data (or Generate Dummy Data)
print("--- Step 1: Load Data ---")

# Columns that the following cells read directly: the volcano plot uses
# logFC / pvalue and shows name / baseMean on hover; the target is built from logFC.
REQUIRED_COLUMNS = ['name', 'logFC', 'pvalue', 'baseMean']

try:
    data = pd.read_csv("gene_expression_data.csv")
    print("Data loaded successfully from CSV.")
except FileNotFoundError:
    print("File 'gene_expression_data.csv' not found. Generating synthetic biological data...")
    # Fixed seed so the synthetic dataset is identical on every run.
    np.random.seed(42)
    n_samples = 200
    dummy_data = {
        'name': [f'Gene_{i}' for i in range(n_samples)],
        'logFC': np.random.normal(0, 2.5, n_samples),  # Log Fold Change
        'pvalue': np.random.uniform(0, 1, n_samples),
        'baseMean': np.random.exponential(1000, n_samples)
    }
    data = pd.DataFrame(dummy_data)

    # Introduce biological signal: large |logFC| should correlate with a low p-value.
    # Upregulated signal: overwrite the p-value of every gene with logFC > 1.5.
    mask_up = data['logFC'] > 1.5
    data.loc[mask_up, 'pvalue'] = np.random.uniform(0, 0.01, int(mask_up.sum()))

    # Downregulated signal: same treatment for genes with logFC < -1.5.
    mask_down = data['logFC'] < -1.5
    data.loc[mask_down, 'pvalue'] = np.random.uniform(0, 0.01, int(mask_down.sum()))

# Fail early with a clear message instead of an opaque KeyError in a later cell
# when a user-supplied CSV does not have the expected layout.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in data.columns]
if missing_columns:
    raise ValueError(
        f"Input data is missing required column(s): {missing_columns}. "
        f"Found columns: {list(data.columns)}"
    )

print(f"Dataset Shape: {data.shape}")
data.head()

--- Step 1: Load Data ---
File 'gene_expression_data.csv' not found. Generating synthetic biological data...
Dataset Shape: (200, 4)


Unnamed: 0,name,logFC,pvalue,baseMean
0,Gene_0,1.241785,0.877373,3274.720209
1,Gene_1,-0.345661,0.740769,469.796578
2,Gene_2,1.619221,0.001018,336.469157
3,Gene_3,3.807575,0.001529,2029.502538
4,Gene_4,-0.585383,0.359491,253.082068


In [None]:
# Cell 3: Interactive EDA (Volcano Plot)

# Visualizes differential expression. Genes with high fold change and low p-value are highlighted.
# @title 3. Interactive EDA: Volcano Plot
print("--- Step 2: Exploratory Data Analysis ---")

# y-axis of the volcano plot; the tiny offset keeps a p-value of exactly 0 finite.
offset_pvalue = data['pvalue'] + 1e-300
data['minus_log10_pvalue'] = -np.log10(offset_pvalue)

# Label each gene by combining the fold-change and significance cut-offs.
is_significant = data['pvalue'] < 0.05
is_up = (data['logFC'] > 1) & is_significant
is_down = (data['logFC'] < -1) & is_significant
data['Regulation'] = np.where(
    is_up, 'Upregulated',
    np.where(is_down, 'Downregulated', 'Not Significant')
)

# Scatter configuration collected in one place, then handed to plotly express.
volcano_kwargs = dict(
    x='logFC',
    y='minus_log10_pvalue',
    color='Regulation',
    hover_data=['name', 'baseMean'],
    title='<b>Interactive Volcano Plot: Differential Expression</b>',
    color_discrete_map={
        'Upregulated': '#EF553B',
        'Downregulated': '#636EFA',
        'Not Significant': 'lightgrey',
    },
    opacity=0.8,
)
fig_volcano = px.scatter(data, **volcano_kwargs)

# Dashed guides: significance cut-off (horizontal) and both fold-change cut-offs (vertical).
fig_volcano.add_hline(y=-np.log10(0.05), line_dash="dash", line_color="gray", annotation_text="p=0.05")
for fold_change_cutoff in (1, -1):
    fig_volcano.add_vline(x=fold_change_cutoff, line_dash="dash", line_color="gray")
fig_volcano.show()

--- Step 2: Exploratory Data Analysis ---


### Detailed Interpretation of the Interactive Volcano Plot

The Volcano Plot is a powerful visualization used in bioinformatics, particularly for **differential expression analysis**, to identify genes that are significantly upregulated or downregulated between two conditions.

Here's a breakdown of the plot's components and what they represent:

1.  **X-axis: `logFC` (Log Fold Change)**
    *   This axis represents the logarithm (base 2) of the fold change in gene expression. A positive `logFC` indicates upregulation (higher expression) in one condition compared to another, while a negative `logFC` indicates downregulation (lower expression).
    *   **Thresholds (Vertical Dashed Lines at `x=1` and `x=-1`)**: These lines define the magnitude of fold change considered biologically significant. Genes falling outside these lines (i.e., `logFC > 1` or `logFC < -1`) are considered to have a substantial change in expression.

2.  **Y-axis: `-log10(pvalue)`**
    *   This axis represents the negative logarithm (base 10) of the p-value. The p-value indicates the statistical significance of the observed gene expression change. A lower p-value means higher statistical significance.
    *   Taking the negative log transformation means that smaller p-values result in larger `-log10(pvalue)` values, placing highly significant genes towards the top of the plot.
    *   **Threshold (Horizontal Dashed Line at `y=-np.log10(0.05)`)**: This line corresponds to a p-value of 0.05 (or -log10(0.05) ≈ 1.3). Genes above this line are considered statistically significant at the 0.05 level, meaning that if there were no true expression difference, a change at least this extreme would be observed less than 5% of the time.

3.  **Data Points and Coloring (`Regulation` Categories)**
    *   Each point on the plot represents a single gene.
    *   **Color-coding**: Genes are colored based on their 'Regulation' status, which is determined by combining the `logFC` and `pvalue` thresholds:
        *   **Red (`Upregulated`)**: Genes with `logFC > 1` AND `pvalue < 0.05`. These are genes that show a significant increase in expression.
        *   **Blue (`Downregulated`)**: Genes with `logFC < -1` AND `pvalue < 0.05`. These are genes that show a significant decrease in expression.
        *   **Grey (`Not Significant`)**: Genes that do not meet both the `logFC` and `pvalue` thresholds. These genes either have a small change in expression or the change is not statistically significant.

4.  **Interpretation of Findings (Based on Synthetic Data)**
    *   In your plot, you will observe clusters of red points in the top-right corner and blue points in the top-left corner. This is a direct result of how the synthetic data was generated in Cell 2.
    *   **Top-Right Cluster (Red)**: These are the genes where `logFC` was intentionally made high (`> 1.5`) and `pvalue` low (`< 0.01`). These represent the **upregulated genes** introduced as a biological signal.
    *   **Top-Left Cluster (Blue)**: These are the genes where `logFC` was intentionally made low (`< -1.5`) and `pvalue` low (`< 0.01`). These represent the **downregulated genes** introduced as a biological signal.
    *   **Central Grey Cloud**: Most other genes, which did not have their p-values artificially lowered, appear as grey 'Not Significant' points. They have moderate `logFC` values (`|logFC| <= 1.5`) and p-values drawn uniformly between 0 and 1, so they either sit below the significance line or fall between the fold-change thresholds. The few genes with `1 < |logFC| <= 1.5` whose random p-value happens to fall below 0.05 are still coloured as up- or downregulated.

5.  **Interactivity (`hover_data`)**
    *   By hovering over any point on the plot, you can see additional details for that specific gene, including its `name` and `baseMean`, providing more context to the visualized data.

In [None]:
# Cell 4: Preprocessing & Splitting

# Defines the target variable (Upregulated vs. Downregulated), cleans the feature set, splits into training/test sets, and applies Standard Scaling.
# @title 4. Preprocessing: Scaling & Split
print("--- Step 3: Preprocessing ---")

# Binary target: 1 when logFC is strictly positive (Upregulated), otherwise 0 (Downregulated).
data['target'] = (data['logFC'] > 0).astype(int)

# Everything except identifiers, the label and the plotting helper columns is a feature.
# NOTE(review): `logFC` stays in X while the target is defined as `logFC > 0`, so the label is
# a deterministic function of one feature (label leakage). Downstream accuracy/AUC numbers
# must be read with that in mind; removing `logFC` here would also require changing the
# 3-component PCA cell, which needs at least three features.
features_to_drop = ['name', 'target', 'Regulation', 'minus_log10_pvalue']
X = data.drop(columns=features_to_drop, errors='ignore')
y = data['target']
feature_names = X.columns

# Hold out 20% for testing; stratify on y so both splits keep the class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Learn the scaling statistics from the training split only, then reuse them everywhere.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scaled copy of the full feature matrix, used later for visualization.
X_scaled_full = scaler.transform(X)

print("Data split and scaled successfully.")

--- Step 3: Preprocessing ---
Data split and scaled successfully.


In [None]:
# Cell 5: Multi-Model Training

# Trains Random Forest, Logistic Regression, SVM, and XGBoost models. Calculates accuracy and cross-validation scores.
# @title 5. Multi-Model Training & Evaluation
print("--- Step 4: Training Models ---")

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM (RBF)": SVC(kernel='rbf', probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}
if xgboost_available:
    # The target is binary, so use the binary log-loss metric ('logloss');
    # 'mlogloss' is the multiclass variant.
    models["XGBoost"] = XGBClassifier(eval_metric='logloss', random_state=42)

# Storage for results
model_results = {}   # model name -> {'Test Accuracy': ..., 'CV Accuracy': ...}
roc_data = []        # (model name, fpr, tpr, auc) tuples consumed by the ROC plot

for name, model in models.items():
    # Train on the pre-scaled training split
    model.fit(X_train_scaled, y_train)

    # Predictions: hard labels for accuracy, positive-class probability for the ROC curve
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Metrics
    acc = accuracy_score(y_test, y_pred)

    # Cross-validate on the *unscaled* training data with the scaler inside a Pipeline, so
    # scaling statistics are re-estimated on each training fold and never see the held-out
    # fold. cross_val_score clones the pipeline, so the fitted `model` above is untouched.
    cv_pipeline = Pipeline([("scaler", StandardScaler()), ("model", model)])
    cv_acc = cross_val_score(cv_pipeline, X_train, y_train, cv=5).mean()

    model_results[name] = {'Test Accuracy': acc, 'CV Accuracy': cv_acc}

    # Store ROC Data
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    roc_data.append((name, fpr, tpr, roc_auc))

    print(f"{name} -> Test Acc: {acc:.4f} | CV Acc: {cv_acc:.4f}")

--- Step 4: Training Models ---
Logistic Regression -> Test Acc: 0.9750 | CV Acc: 1.0000
SVM (RBF) -> Test Acc: 0.9750 | CV Acc: 0.9750
Random Forest -> Test Acc: 1.0000 | CV Acc: 1.0000
XGBoost -> Test Acc: 1.0000 | CV Acc: 0.9938


In [None]:
# Cell 6: Interactive Model Evaluation

# Displays an interactive bar chart comparing model performance and an interactive ROC curve to analyze trade-offs between sensitivity and specificity.
# @title 6. Interactive Evaluation: ROC & Accuracy
print("--- Step 5: Interactive Evaluation ---")

# A. Model Comparison Bar Chart
# One row per (model, metric) pair: models become rows, then metrics are melted to long form.
results_df = (
    pd.DataFrame(model_results)
    .T
    .reset_index()
    .melt(id_vars='index', var_name='Metric', value_name='Score')
)

fig_comp = px.bar(
    results_df,
    x='index',
    y='Score',
    color='Metric',
    barmode='group',
    title='<b>Model Comparison: Accuracy & Cross-Validation</b>',
    labels={'index': 'Model', 'Score': 'Accuracy Score'},
    text_auto='.3f',
)
fig_comp.show()

# B. Interactive ROC Curve
fig_roc = go.Figure()
# Dashed diagonal = chance-level reference.
fig_roc.add_shape(type='line', x0=0, y0=0, x1=1, y1=1, line={'dash': 'dash'})

# One line trace per model, labelled with its AUC.
roc_traces = [
    go.Scatter(x=model_fpr, y=model_tpr, name=f"{model_name} (AUC={model_auc:.2f})", mode='lines')
    for model_name, model_fpr, model_tpr, model_auc in roc_data
]
fig_roc.add_traces(roc_traces)

roc_layout = {
    'title': "<b>ROC Curve Comparison</b>",
    'xaxis_title': "False Positive Rate",
    'yaxis_title': "True Positive Rate",
    'width': 800,
    'height': 600,
}
fig_roc.update_layout(**roc_layout)
fig_roc.show()

--- Step 5: Interactive Evaluation ---


### Detailed Interpretation of the ROC Curve Plot

The Receiver Operating Characteristic (ROC) curve is a graphical plot that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied. It's a fundamental tool for evaluating the performance of classification models.

Here's a breakdown of the plot's components and what they represent:

1.  **X-axis: False Positive Rate (FPR)**
    *   Also known as 1 - Specificity. It's the proportion of negative cases that were incorrectly classified as positive.
    *   FPR = (False Positives) / (False Positives + True Negatives).

2.  **Y-axis: True Positive Rate (TPR)**
    *   Also known as Sensitivity or Recall. It's the proportion of positive cases that were correctly identified.
    *   TPR = (True Positives) / (True Positives + False Negatives).

3.  **The ROC Curve**
    *   Each point on the ROC curve represents a sensitivity/specificity pair corresponding to a particular decision threshold. As the threshold for classification is lowered, more positive cases are detected (increasing TPR), but this also increases the number of false positives (increasing FPR).

4.  **Diagonal Line (Dashed Line)**
    *   This represents a random classifier (or a coin flip). A model whose ROC curve falls along this line has no discriminative power; it's no better than guessing.

5.  **Area Under the Curve (AUC)**
    *   AUC is the measure of the entire two-dimensional area underneath the entire ROC curve. It provides an aggregate measure of performance across all possible classification thresholds.
    *   **Interpretation of AUC:**
        *   An AUC of 0.5 suggests no discrimination (i.e., ability to distinguish between positive and negative classes).
        *   An AUC closer to 1.0 indicates a better model performance. A perfect classifier would have an AUC of 1.0, with its ROC curve passing through the top-left corner (TPR=1, FPR=0).

6.  **Interpretation of Findings (From Cell 6's plot)**
    *   **Excellent Performance**: All models show ROC curves that are very close to the top-left corner of the plot, and their AUC values are all very close to 1.0 (specifically, 0.99 for SVM and 1.0 for Logistic Regression, Random Forest, and XGBoost based on the ROC data variable from the previous execution). This indicates that all trained models are performing exceptionally well in distinguishing between upregulated and downregulated genes.
    *   **Near-Perfect Classifiers**: An AUC of 1.0 implies that the model can perfectly distinguish between the positive and negative classes without any errors in the test set. This is a strong indicator of the model's ability to classify the gene expression data accurately.
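    *   **Synthetic Data Context**: This near-perfect performance is expected given that the dataset is synthetic and was generated to explicitly contain distinct upregulated and downregulated groups. More importantly, the target label is defined directly from the sign of `logFC` (Cell 4) and `logFC` is itself one of the input features, so the models only need to learn the rule `logFC > 0`; these scores should not be read as evidence of predictive power on real data.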

In [None]:
# Cell 8: Dimensionality Reduction (PCA)

# Performs Principal Component Analysis (PCA) and generates interactive 2D and 3D scatter plots to visualize sample clustering.
# @title 8. Interactive PCA (2D & 3D)
print("--- Step 7: Dimensionality Reduction (PCA) ---")

# Project the scaled full feature matrix onto its first three principal components.
pca = PCA(n_components=3)
components = pca.fit_transform(X_scaled_full)

# Report how much of the total variance each component captures.
explained_summary = ", ".join(
    f"PC{i + 1}={ratio:.1%}" for i, ratio in enumerate(pca.explained_variance_ratio_)
)
print(f"Explained variance ratio: {explained_summary}")

pca_df = pd.DataFrame(data=components, columns=['PC1', 'PC2', 'PC3'])
# `components` rows follow the row order of X / data, while pca_df gets a fresh 0..n-1 index.
# Assign labels and IDs positionally so a non-default index on `data` cannot misalign them
# (index-aligned assignment would silently produce NaN or mismatched rows).
pca_df['Target'] = y.map({0: 'Downregulated', 1: 'Upregulated'}).to_numpy()
pca_df['ID'] = data['name'].to_numpy()

# 2D Scatter
fig_pca_2d = px.scatter(
    pca_df, x='PC1', y='PC2', color='Target',
    hover_data=['ID'],
    title='<b>PCA 2D Projection</b>',
    color_discrete_map={'Upregulated': '#EF553B', 'Downregulated': '#636EFA'}
)
fig_pca_2d.show()

# 3D Scatter
fig_pca_3d = px.scatter_3d(
    pca_df, x='PC1', y='PC2', z='PC3', color='Target',
    hover_data=['ID'],
    title='<b>PCA 3D Projection</b>',
    color_discrete_map={'Upregulated': '#EF553B', 'Downregulated': '#636EFA'},
    opacity=0.7
)
fig_pca_3d.show()

--- Step 7: Dimensionality Reduction (PCA) ---


### Detailed Interpretation of the PCA Plots (2D & 3D)

Principal Component Analysis (PCA) is a dimensionality reduction technique used to highlight variation and bring out strong patterns in a dataset. In this context, it helps visualize how the genes cluster based on their `logFC`, `pvalue`, and `baseMean` features.

Here's a breakdown of the plot's components and what they represent:

1.  **Principal Components (PC1, PC2, PC3)**
    *   **Axes**: The axes (PC1, PC2, and PC3) represent the principal components. These are new variables that are linear combinations of the original variables (`logFC`, `pvalue`, `baseMean`).
    *   **Variance Explained**: Each principal component captures a certain amount of the total variance in the data. PC1 captures the most variance, PC2 the second most, and so on. The exact percentage of variance explained by each component is not shown in the plot itself but is a key output of PCA.
    *   **Interpretation**: Moving along a principal component axis corresponds to a specific combination of changes in the original gene expression features.

2.  **Data Points and Coloring (`Target` Categories)**
    *   Each point on the plot represents a single gene (or sample).
    *   **Color-coding**: Genes are colored based on their 'Target' status, which was defined during preprocessing:
        *   **Red (`Upregulated`)**: Genes with `logFC > 0`.
        *   **Blue (`Downregulated`)**: Genes with `logFC <= 0`.

3.  **2D PCA Projection (fig_pca_2d)**
    *   This plot visualizes the genes in the space defined by the first two principal components (PC1 and PC2).
    *   **Purpose**: It allows us to observe if there's a clear separation or clustering of 'Upregulated' and 'Downregulated' genes based on the two most dominant patterns of variation in the data.

4.  **3D PCA Projection (fig_pca_3d)**
    *   This plot extends the visualization to include the third principal component (PC3), providing a more comprehensive view of the data structure.
    *   **Purpose**: The 3D plot can reveal additional separation or nuances in clustering that might not be apparent in the 2D projection, especially if a significant amount of variance is captured by PC3.

5.  **Interactivity (`hover_data`)**
    *   By hovering over any point on either plot, you can see the `ID` (gene name) for that specific gene, providing context to the visualized data points.

6.  **Interpretation of Findings (Based on Synthetic Data)**
    *   **Clear Separation**: You should observe a very clear separation between the 'Upregulated' (red) and 'Downregulated' (blue) clusters in both the 2D and 3D PCA plots.
    *   **Distinct Groups**: The genes within each target group ('Upregulated' or 'Downregulated') will form tight clusters, indicating that their underlying feature profiles (`logFC`, `pvalue`, `baseMean`) are distinctly different.
    *   **Synthetic Data Confirmation**: This pronounced separation is a direct consequence of how the synthetic data was generated. The large and distinct `logFC` and low `pvalue` introduced for upregulated and downregulated genes create strong, separable patterns that PCA effectively captures and highlights.
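    *   **Validation of Features**: The PCA plots show that the chosen features (`logFC`, `pvalue`, `baseMean`) separate the two classes (upregulated vs. downregulated) within this dataset. Note that this is partly circular: the class label is defined as `logFC > 0`, and `logFC` is one of the three features fed into PCA, so separation along the `logFC` direction is guaranteed by construction.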

In [None]:
# Cell 9: LASSO Feature Selection

# Applies L1 Regularization (LASSO) to select the most robust biological features and visualizes their expression distribution.
# @title 9. LASSO Selection & Expression Distributions
print("--- Step 8: LASSO Feature Selection ---")

# L1-penalised logistic regression; smaller C means stronger regularisation (more zeroed coefficients).
lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.5, random_state=42)
lasso.fit(X_train_scaled, y_train)

lasso_coefs = lasso.coef_.flatten()
lasso_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso_coefs})
# Keep only the features LASSO retained, ordered by absolute coefficient size.
lasso_df = lasso_df[lasso_df['Coefficient'] != 0].sort_values(by='Coefficient', key=abs, ascending=False)

if not lasso_df.empty:
    # 'RdBu' runs red (low) -> blue (high), which would paint positive coefficients blue.
    # Use the reversed scale centred on zero so positive = red and negative = blue,
    # matching the red = Upregulated / blue = Downregulated convention of the other plots.
    fig_lasso = px.bar(
        lasso_df, x='Feature', y='Coefficient', color='Coefficient',
        title='<b>LASSO Significant Features (Non-Zero Coefficients)</b>',
        color_continuous_scale='RdBu_r',
        color_continuous_midpoint=0
    )
    fig_lasso.show()

    # Interactive Box Plot for Top Features (at most three)
    top_features = lasso_df['Feature'].head(3).tolist()
    print(f"Visualizing distribution for top selected features: {top_features}")

    # Prepare data for box plot: work on a copy so the helper column does not leak into `data`.
    plot_data = data.copy()
    plot_data['Class'] = plot_data['target'].map({0: 'Downregulated', 1: 'Upregulated'})

    for feature in top_features:
        fig_box = px.box(
            plot_data, x='Class', y=feature, color='Class',
            points="all", # Show individual points
            title=f'<b>Expression Distribution: {feature}</b>',
            color_discrete_map={'Upregulated': '#EF553B', 'Downregulated': '#636EFA'}
        )
        fig_box.show()
else:
    print("LASSO shrunk all coefficients to zero. Try increasing the C parameter.")

--- Step 8: LASSO Feature Selection ---


Visualizing distribution for top selected features: ['logFC', 'baseMean']


### Detailed Interpretation of the LASSO Selection & Expression Distributions Plots

This section uses LASSO (L1 Regularization) for feature selection and then visualizes the expression distributions of the selected features. LASSO helps identify the most influential features by shrinking the coefficients of less important ones to zero.

Here's a breakdown of the plots' components and what they represent:

1.  **LASSO Significant Features (Non-Zero Coefficients) Bar Chart**
    *   **Purpose**: This bar chart visualizes the features that LASSO deemed important by assigning them non-zero coefficients. Features with coefficients shrunk to zero are effectively excluded from the model.
    *   **X-axis: `Feature`**: Lists the names of the original features (`logFC`, `pvalue`, `baseMean`).
    *   **Y-axis: `Coefficient`**: Represents the magnitude and direction of the impact of each feature on the model's prediction of gene regulation. A positive coefficient means the feature positively correlates with the 'Upregulated' class, while a negative coefficient indicates a negative correlation (more associated with 'Downregulated').
    *   **Color**: The bars are color-coded based on the coefficient value, typically using a divergent color scale (e.g., 'RdBu') to highlight positive (red) and negative (blue) impacts.
    *   **Interpretation of Findings (Based on Synthetic Data)**:
        *   From the previous execution, the `lasso_df` shows that `logFC` has a coefficient of `7.117392` and `baseMean` has a coefficient of `-0.012670`. `pvalue` has been shrunk to zero (or very close to zero), indicating that LASSO considers `logFC` and `baseMean` to be the primary predictive features.
        *   `logFC` is clearly the most significant feature, with a large positive coefficient. This means a higher `logFC` strongly predicts a gene being 'Upregulated', which is consistent with the definition of upregulation.
        *   `baseMean` has a very small, slightly negative coefficient, suggesting a minor inverse relationship with upregulation, but its impact is minimal compared to `logFC`. The small magnitude indicates it's less important than `logFC` for distinguishing between upregulated and downregulated genes.

2.  **Expression Distribution Box Plots for Top Features**
    *   **Purpose**: After identifying significant features with LASSO, these box plots visualize the distribution of each selected feature across the 'Upregulated' and 'Downregulated' classes. This helps understand how the values of these features differ between the two target groups.
    *   **X-axis: `Class`**: Separates the data into 'Downregulated' (target 0) and 'Upregulated' (target 1) groups.
    *   **Y-axis: (Selected `Feature`)**: Shows the distribution of the feature's values within each class.
    *   **Box Plot Elements**: Each box plot shows the median (line in the middle), interquartile range (IQR, the box itself), and potential outliers (individual points). Because the plots are drawn with `points="all"`, every individual data point is overlaid next to its box, giving a sense of density.
    *   **Color**: The boxes are color-coded based on the 'Class' ('Upregulated' in red, 'Downregulated' in blue).
    *   **Interpretation of Findings (Based on Synthetic Data, for `logFC` and `baseMean`)**:
        *   **For `logFC`**: You will observe a very clear separation. The 'Upregulated' class will have significantly higher `logFC` values (mostly positive), while the 'Downregulated' class will have significantly lower `logFC` values (mostly negative). This visually confirms `logFC` as a strong differentiator, aligning with the large positive coefficient from LASSO.
        *   **For `baseMean`**: The distributions for `baseMean` across 'Upregulated' and 'Downregulated' classes will likely show considerable overlap, with medians and IQRs not vastly different. This reinforces LASSO's finding that `baseMean` is not a primary driver of classification, especially when compared to `logFC`.

### Summary of Findings and Conclusion

This analysis walked through a typical machine learning pipeline for gene expression data, from data preparation to model evaluation and interpretation. Here's a summary of what we've done and concluded:

1.  **Data Loading and Generation**: Initially, the system attempted to load `gene_expression_data.csv`. Since the file was not found, a synthetic dataset of 200 genes was generated. This synthetic data was specifically designed to embed clear biological signals, creating distinct 'Upregulated' (high logFC, low p-value) and 'Downregulated' (low logFC, low p-value) gene groups.

2.  **Exploratory Data Analysis (Volcano Plot)**: The interactive Volcano Plot visually confirmed the presence of these synthetic biological signals. It clearly showed:
    *   A cluster of 'Upregulated' genes (red points) in the top-right corner, characterized by high `logFC` and low `pvalue`.
    *   A cluster of 'Downregulated' genes (blue points) in the top-left corner, characterized by low `logFC` and low `pvalue`.
    *   The majority of genes were correctly classified as 'Not Significant' (grey points), lacking either the magnitude of `logFC` or the statistical significance.

3.  **Preprocessing**: The data was preprocessed by defining a binary target variable (`target`: 1 for upregulated, 0 for downregulated) based on `logFC`. Feature columns (`logFC`, `pvalue`, `baseMean`) were separated, and the dataset was split into training and testing sets. All numerical features were then standardized using `StandardScaler` to ensure optimal model performance.

4.  **Multi-Model Training & Evaluation**: Several classification models were trained and evaluated:
    *   **Models Tested**: Logistic Regression, SVM (RBF), Random Forest, and XGBoost.
    *   **Performance**: All models achieved exceptionally high test and cross-validation accuracy scores, with many reaching 1.0 (100%).
    *   **ROC Curve Analysis**: The ROC curve comparison further underscored this excellent performance. All models exhibited AUC (Area Under the Curve) values very close to 1.0, with most reaching exactly 1.0. This indicates that the models are near-perfect classifiers, capable of distinguishing between 'Upregulated' and 'Downregulated' genes with virtually no errors on the test set.
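    *   **Conclusion**: The models perform outstandingly on this synthetic dataset, but the result must be read with caution. Besides the clear signals deliberately engineered into the data, the target is defined as `logFC > 0` while `logFC` is kept as an input feature, so the label is a deterministic function of one feature and near-perfect scores are guaranteed by construction.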

5.  **Dimensionality Reduction (PCA)**: Principal Component Analysis was applied to visualize the data in 2D and 3D. The PCA plots revealed:
    *   A remarkably clear and distinct separation between the 'Upregulated' (red) and 'Downregulated' (blue) gene clusters in both 2D (PC1 vs. PC2) and 3D (PC1 vs. PC2 vs. PC3) projections.
    *   **Conclusion**: This visual separation confirms that the original features (`logFC`, `pvalue`, `baseMean`) are highly effective in differentiating between the two biological states, reinforcing the ease with which the models could classify them.

6.  **LASSO Feature Selection & Expression Distributions**: L1 Regularization (LASSO) was used to identify the most impactful features:
    *   **Significant Features**: LASSO identified `logFC` as the most significant feature with a large positive coefficient, strongly correlating with upregulation. `baseMean` had a very small, slightly negative coefficient, while `pvalue`'s coefficient was shrunk to zero, indicating its lesser importance in this context.
    *   **Expression Distributions**: Interactive box plots for the top selected features (`logFC` and `baseMean`) visually confirmed these findings:
        *   `logFC` showed a stark and complete separation in distribution between 'Upregulated' (high positive values) and 'Downregulated' (negative values) classes.
        *   `baseMean` showed significant overlap in its distribution between the two classes, supporting LASSO's assessment of its minimal contribution to classification compared to `logFC`.
    *   **Conclusion**: `logFC` is the primary and most robust feature for distinguishing between upregulated and downregulated genes in this dataset.

**Overall Conclusion**: The entire analytical pipeline, using a synthetically generated dataset with explicit biological signals, consistently demonstrated that the 'Upregulated' and 'Downregulated' gene groups are highly distinct and easily separable. This led to the development of highly accurate machine learning models and provided clear insights into the most discriminative features, primarily `logFC`.