In [0]:
# Load IPython's autoreload extension and select mode 2, which re-imports
# modules before each cell runs so edits to imported helper code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [0]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output
from mygene import MyGeneInfo

import sys

# Make the helper modules in the project's src directory importable.
# The membership check keeps this cell idempotent: re-running it no longer
# appends the same entry to sys.path a second (third, ...) time.
# NOTE(review): this is an absolute, user-specific workspace path inside the
# Alzheimers-MRI-Classification repo — confirm that is the intended home of data_io.
SRC_DIR = '/Workspace/Users/bjedelma@gmail.com/Alzheimers-MRI-Classification/src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from data_io import save_model_s3, load_model_s3, save_pickle_s3, load_pickle_s3

# Wipe anything printed while importing so the cell ends with no output.
clear_output(wait=False)

In [0]:
# Fetch the AWS key pair from the "brad-aws" Databricks secret scope so the
# credentials never appear in the notebook source.
ACCESS_KEY, SECRET_KEY = (
    dbutils.secrets.get(scope="brad-aws", key=secret_name)
    for secret_name in ("access_key", "secret_key")
)

# S3 location (bucket plus key prefix) and the DBFS mount point derived from it.
AWS_S3_BUCKET = "databricks-workspace-stack-brad-personal-bucket/Omics-breast-cancer/"
# The trailing slash leaves an empty last element after split('/'), so index -2
# is the final path component.
MOUNT_NAME = f"/mnt/{AWS_S3_BUCKET.split('/')[-2]}"
SOURCE_URL = f"s3a://{AWS_S3_BUCKET}"
EXTRA_CONFIGS = {
    "fs.s3a.access.key": ACCESS_KEY,
    "fs.s3a.secret.key": SECRET_KEY,
}

# Mount only when no existing mount point already uses this name.
existing_mount_points = {mount.mountPoint for mount in dbutils.fs.mounts()}
if MOUNT_NAME not in existing_mount_points:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME, extra_configs=EXTRA_CONFIGS)
    print(f"{MOUNT_NAME} is now mounted.")
else:
    print(f"{MOUNT_NAME} is already mounted.")

/mnt/Omics-breast-cancer is already mounted.


In [0]:
# Load the expression table from S3 and reorient it for analysis.
bucket_name = "databricks-workspace-stack-brad-personal-bucket"
s3_file_name = 'Omics-breast-cancer/TCGA-BRCA.star_fpkm.tsv'

# Spark parses the tab-separated file; the header row supplies the column names.
expression_sdf = spark.read.csv(f"s3a://{bucket_name}/{s3_file_name}", sep='\t', header=True, inferSchema=True)

# Transpose in pandas to orient measurements on the 1st dim and Ensembl IDs
# (features) on the 2nd dim. The first column is promoted to the index before
# transposing so its values become the column labels of the result instead of
# ending up as an ordinary data row mixed in with the numeric values.
expression_pdf = expression_sdf.toPandas()
id_column = expression_pdf.columns[0]  # presumably the Ensembl gene ID column — TODO confirm
df = expression_pdf.set_index(id_column).T

# The transposed frame is intentionally kept as a pandas DataFrame rather than
# being passed back through spark.createDataFrame(): that call discards the
# pandas index (which now holds the original column headers) and would create
# one Spark column per feature.
# NOTE(review): the very wide Spark frame is the likely cause of the driver
# crash recorded in the preview cell's saved output — confirm on the cluster.
clear_output(wait=False)

In [0]:
# Preview the first 10 rows as a quick sanity check of the loaded table.
preview_rows = df.head(10)
display(preview_rows)

com.databricks.backend.common.rpc.DriverStoppedException: Driver down cause: driver state change (exit code: 15)
	at com.databricks.spark.chauffeur.ChauffeurState.processDriverStateChange(ChauffeurState.scala:306)
	at com.databricks.spark.chauffeur.Chauffeur.onDriverStateChange(Chauffeur.scala:1625)
	at com.databricks.spark.chauffeur.Chauffeur.$anonfun$driverStateOpt$1(Chauffeur.scala:219)
	at com.databricks.spark.chauffeur.Chauffeur.$anonfun$driverStateOpt$1$adapted(Chauffeur.scala:219)
	at com.databricks.spark.chauffeur.DriverDaemonMonitorImpl.$anonfun$goToStopped$4(DriverDaemonMonitorImpl.scala:295)
	at com.databricks.spark.chauffeur.DriverDaemonMonitorImpl.$anonfun$goToStopped$4$adapted(DriverDaemonMonitorImpl.scala:295)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at com.databricks.spark.chauffeur.DriverDaemonMonitorImpl.goToStopped(DriverDaemonMonitorImpl.scala:295)
	at com.databricks.spark.chauffeur.DriverDaemonMonitorImpl.monitorDriver(DriverDaemonMonitorImpl.sc

In [0]:
# Example gene lookup: query the MyGeneInfo client for the symbol of a single
# Ensembl gene ID.
mg = MyGeneInfo()
result = mg.querymany(
    ['ENSG00000001084'],
    scopes='ensembl.gene',
    fields='symbol',
    species='human',
)

# Drop whatever the query printed so only the result itself is shown.
clear_output(wait=False)
print(result)

[{'query': 'ENSG00000001084', '_id': '2729', '_score': 32.85163, 'symbol': 'GCLC'}]


In [0]:
# Separate metadata and gene expression data.
# NOTE(review): `data` is not assigned in any of the cells shown in this
# notebook — confirm where it is defined before relying on Restart & Run All.
N_METADATA_COLS = 20  # assumes the first 20 columns are metadata — TODO confirm
metadata = data.iloc[:, :N_METADATA_COLS]
gene_expression_raw = data.iloc[:, N_METADATA_COLS:]  # Gene expression values

# Report missing values, then fill them with 0. fillna() is used without
# inplace=True so a new frame is produced instead of mutating a slice taken
# from `data` (the in-place form is the pattern behind pandas'
# SettingWithCopyWarning) and so that re-running the cell is idempotent.
print("Missing values:", gene_expression_raw.isnull().sum().sum())
gene_expression = gene_expression_raw.fillna(0)

# Normalize gene expression data using StandardScaler (fit and applied on the
# same, full matrix).
scaler = StandardScaler()
gene_expression_scaled = scaler.fit_transform(gene_expression)

# Encode the target labels (subtypes) from metadata as integers.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(metadata['subtype'])  # Adjust 'subtype' to actual column name


In [0]:
# Visualize the distribution of breast cancer subtypes
sns.countplot(x=metadata['subtype'], palette='viridis')
plt.title('Distribution of Breast Cancer Subtypes')
plt.show()

# Perform PCA for dimensionality reduction (first two components).
# random_state is pinned so the projection is reproducible when scikit-learn
# selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=0)
principal_components = pca.fit_transform(gene_expression_scaled)
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
# Attach the labels by position, not by index: pca_df has a fresh 0..n-1 index,
# so assigning the Series directly would align on index labels and leave NaN
# wherever metadata's index does not match. to_numpy() drops the index; the
# rows of principal_components are in the same order as metadata's rows.
pca_df['Subtype'] = metadata['subtype'].to_numpy()

# Plot PCA results, colored by subtype
sns.scatterplot(x='PC1', y='PC2', hue='Subtype', data=pca_df, palette='viridis')
plt.title('PCA of Gene Expression Data')
plt.show()
