# Genomics Data Mining Notebook

This notebook established a reproducible analysis environment an valdates core setup. 

In [1]:
from __future__ import annotations

import os
import random
import sys
import platform
from pathlib import Path
import importlib.metadata

import numpy as np
import pandas as pd
import sklearn

from IPython.display import display, Markdown

# Pandas display defaults (readable but not overwhelming)
pd.set_option("display.max_rows", 30)
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)
pd.set_option("display.max_colwidth", 60)


## Environment Validation (Baseline Compatibility)

To support reproducibility, this project defines a baseline environment in `environment.yml`.
The table below compares:

- **Expected** versions (baseline)
- **Installed** versions (current runtime)

If any items mismatch, the notebook may still run, but results (especially plots, clustering, or model behavior)
could differ slightly due to dependency changes.


In [2]:
import sys
import importlib.metadata
import pandas as pd
from IPython.display import display

EXPECTED = {
    "python": "3.12.7",
    "ipykernel": "6.31.0",
    "jupyterlab": "4.5.0",
    "matplotlib": "3.10.8",
    "numpy": "2.4.1",
    "pandas": "2.3.3",
    "pip": "25.3",
    "requests": "2.32.5",
    "scikit-learn": "1.8.0",
    "scipy": "1.16.3",
    "seaborn": "0.13.2",
}

# Installed distributions (lowercased for matching)
installed = {d.metadata["Name"].lower(): d.version for d in importlib.metadata.distributions()}

# Special cases:
installed["python"] = sys.version.split()[0]  # Python runtime version
# scikit-learn's distribution name is "scikit-learn" already, but keep explicit mapping clean
# (No extra mapping required given your EXPECTED keys)

rows = []
for pkg in sorted(EXPECTED.keys(), key=str.lower):
    expected = EXPECTED[pkg]
    got = installed.get(pkg.lower(), "(not installed)")
    status = "OK" if got == expected else "MISMATCH"
    rows.append({"Package": pkg, "Expected": expected, "Installed": got, "Status": status})

df_env_check = pd.DataFrame(rows)

display(df_env_check)

mismatch_count = (df_env_check["Status"] == "MISMATCH").sum()
if mismatch_count == 0:
    print("✅ Environment matches baseline (all versions OK).")
else:
    print(f"⚠️ Environment differs from baseline in {mismatch_count} package(s).")



Unnamed: 0,Package,Expected,Installed,Status
0,ipykernel,6.31.0,6.31.0,OK
1,jupyterlab,4.5.0,4.5.0,OK
2,matplotlib,3.10.8,3.10.8,OK
3,numpy,2.4.1,2.4.1,OK
4,pandas,2.3.3,2.3.3,OK
5,pip,25.3,25.3,OK
6,python,3.12.7,3.14.2,MISMATCH
7,requests,2.32.5,2.32.5,OK
8,scikit-learn,1.8.0,1.8.0,OK
9,scipy,1.16.3,1.16.3,OK


⚠️ Environment differs from baseline in 1 package(s).


In [3]:
env_info = pd.DataFrame(
    [
        {"Item": "Python", "Value": sys.version.split()[0]},
        {"Item": "Platform", "Value": platform.platform()},
        {"Item": "NumPy", "Value": np.__version__},
        {"Item": "Pandas", "Value": pd.__version__},
        {"Item": "Scikit-learn", "Value": sklearn.__version__},
    ]
)

display(Markdown("## Environment Summary"))
display(env_info)


## Environment Summary

Unnamed: 0,Item,Value
0,Python,3.14.2
1,Platform,Windows-11-10.0.26200-SP0
2,NumPy,2.4.1
3,Pandas,2.3.3
4,Scikit-learn,1.8.0


In [4]:
import pandas
display(Markdown("## Key Package Versions"))

KEY_PACKAGES = [
    "pandas",
    "matplotlib",
    "jupyterlab",
    "seaborn",
    "numpy",
    "requests",
    "scikit-learn",
    "ipykernel",
    "scipy",
    "python",
    "pip"
]

rows = []
installed = {d.metadata["Name"].lower(): d.version for d in importlib.metadata.distributions()}

for name in sorted(KEY_PACKAGES, key=str.lower):
    version = installed.get(name.lower(), "(not installed)")
    rows.append({"Package": name, "Version": version})

df_key = pd.DataFrame(rows)

display(df_key)


## Key Package Versions

Unnamed: 0,Package,Version
0,ipykernel,6.31.0
1,jupyterlab,4.5.0
2,matplotlib,3.10.8
3,numpy,2.4.1
4,pandas,2.3.3
5,pip,25.3
6,python,(not installed)
7,requests,2.32.5
8,scikit-learn,1.8.0
9,scipy,1.16.3


## Project Paths

This notebook uses project-relative paths so it runs the same way on any machine,
regardless of where the repository lives on disk (including paths with spaces).


In [5]:
from pathlib import Path
import pandas as pd
from IPython.display import display

NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR if (NOTEBOOK_DIR / "data").exists() else NOTEBOOK_DIR.parent

DATA_DIR = PROJECT_ROOT / "data"
PACKAGED_DIR = DATA_DIR / "external" / "packaged"
EXTRACTED_DIR = DATA_DIR / "external" / "extracted"
INTERIM_DIR = DATA_DIR / "interim"
FINAL_DIR = DATA_DIR / "final"

for p in [PACKAGED_DIR, EXTRACTED_DIR, INTERIM_DIR, FINAL_DIR]:
    p.mkdir(parents=True, exist_ok=True)

def rel(p: Path) -> str:
    return str(p.relative_to(PROJECT_ROOT))

paths = pd.DataFrame([
    {"Folder": "Packaged downloads", "Path": rel(PACKAGED_DIR)},
    {"Folder": "Extracted files", "Path": rel(EXTRACTED_DIR)},
    {"Folder": "Interim working data", "Path": rel(INTERIM_DIR)},
    {"Folder": "Final curated outputs", "Path": rel(FINAL_DIR)},
])

display(paths)


Unnamed: 0,Folder,Path
0,Packaged downloads,data\external\packaged
1,Extracted files,data\external\extracted
2,Interim working data,data\interim
3,Final curated outputs,data\final


In [6]:
display(Markdown("## Reproducibility Settings"))

RANDOM_SEED = 3660  # course number

os.environ["PYTHONHASHSEED"] = str(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

seed_info = pd.DataFrame([{"Setting": "RANDOM_SEED", "Value": RANDOM_SEED}])
display(seed_info)

# Note for later:
# - use random_state=RANDOM_SEED in sklearn models
# - TSNE(random_state=RANDOM_SEED)
# - train_test_split(..., random_state=RANDOM_SEED)


## Reproducibility Settings

Unnamed: 0,Setting,Value
0,RANDOM_SEED,3660


## Task 1 — Data Acquisition

The Brain Lower Grade Glioma (TCGA, PanCancer Atlas) dataset was obtained from the
cBioPortal for Cancer Genomics. The dataset was downloaded as a compressed tarball
containing clinical, molecular, and metadata files.

Source:
https://www.cbioportal.org/

Dataset:
**Brain Lower Grade Glioma (TCGA, PanCancer Atlas)**  
File: `lgg_tcga_pan_can_atlas_2018.tar.gz`


In [8]:
import requests
from pathlib import Path

DATASET_URL = "https://datahub.assets.cbioportal.org/lgg_tcga_pan_can_atlas_2018.tar.gz"
TARBALL_PATH = PACKAGED_DIR / "lgg_tcga_pan_can_atlas_2018.tar.gz"

if not TARBALL_PATH.exists():
    print("Downloading dataset...")
    response = requests.get(DATASET_URL, stream=True)
    response.raise_for_status()
    with open(TARBALL_PATH, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Download complete.")
else:
    print("Dataset already downloaded.")

TARBALL_PATH



Downloading dataset...
Download complete.


WindowsPath('c:/Users/goodb/OneDrive/UPEI/Courses/STAT 3660/dev/genomics-data-mining-project/data/external/packaged/lgg_tcga_pan_can_atlas_2018.tar.gz')

In [12]:
import tarfile

expected_file = EXTRACTED_DIR / "data_clinicial_patient.txt"

if not expected_file.exists():
    print("Extracting dataset...")
    with tarfile.open(TARBALL_PATH, "r:gz") as tar:
        tar.extractall(EXTRACTED_DIR)
    print("Extraction complete.")
else:
    print("Dataset already extracted.")


Extracting dataset...
Extraction complete.


## Task 2 — Dataset Overview (cBioPortal Summary)

Using information from the cBioPortal Summary tab and the downloaded data files,
the following provides a high-level overview of the dataset, including patient counts,
molecular data availability, mutation frequencies, and survival information.


In [14]:
import pandas as pd

patient_file = EXTRACTED_DIR / "lgg_tcga_pan_can_atlas_2018/data_clinical_patient.txt"

df_patients = pd.read_csv(
    patient_file,
    sep="\t",
    comment="#"
)

df_patients.shape[0]


514

**Total number of patients:**  
The dataset contains **X patients**, based on the clinical patient file.


In [15]:
mrna_file = EXTRACTED_DIR / "lgg_tcga_pan_can_atlas_2018/data_mrna_seq_v2_rsem.txt"

df_mrna = pd.read_csv(
    mrna_file,
    sep="\t",
    comment="#",
    nrows=1  # just need headers for now
)

num_samples_mrna = df_mrna.shape[1] - 1  # first column = gene ID
num_samples_mrna


515

**Patients with mRNA expression data:**  
There are **X samples** with mRNA expression profiles, as indicated by the
number of sample columns in the mRNA expression dataset.
