<a href="https://colab.research.google.com/github/AnanyaSourav/Minor-Skin-Cancer-Project/blob/main/RinColab_C%26N.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Cluster genes by the variability of their copy-number (SD_CNA) and
# expression (SD_Exp) values using k-means, then write one cluster label per
# gene symbol to a CSV file.

# Install required libraries (if not installed).
# requireNamespace() only checks availability; the library() calls below do
# the attaching and fail loudly if an installation did not succeed.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", dependencies = TRUE)
}
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr", dependencies = TRUE)
}

# Load libraries
library(dplyr)
library(tidyr)

# Set file paths
input_file <- "/content/sample_data/variance_statistics.csv"  # Update with Colab path
output_file <- "/content/sample_data/RESgene_clusters_all_4.csv"

# Number of k-means clusters (3 clusters, matching original file)
n_clusters <- 3

# Read the data; keep strings as plain character vectors
data <- read.csv(input_file, stringsAsFactors = FALSE)

# Check column names
print("Column names in dataset:")
print(colnames(data))

# Ensure required columns exist
required_cols <- c("HGNC_Symbol", "SD_CNA", "SD_Exp")
missing_cols <- setdiff(required_cols, colnames(data))

if (length(missing_cols) > 0) {
  stop(paste("Missing columns:", paste(missing_cols, collapse = ", ")))
}

# Keep only the required columns, drop rows with a missing value in any of
# them (drop_na() without arguments checks every remaining column), and keep
# the first row for each HGNC_Symbol.
data <- data %>%
  select(HGNC_Symbol, SD_CNA, SD_Exp) %>%
  drop_na() %>%
  distinct(HGNC_Symbol, .keep_all = TRUE)

# k-means needs at least as many observations as cluster centres
if (nrow(data) < n_clusters) {
  stop(paste("Need at least", n_clusters, "genes to cluster, found", nrow(data)))
}

# Normalize SD_CNA and SD_Exp (z-scores).
# scale() returns a one-column matrix; as.numeric() turns it back into a plain
# vector so the data frame does not end up with matrix columns.
data <- data %>%
  mutate(
    SD_CNA = as.numeric(scale(SD_CNA)),
    SD_Exp = as.numeric(scale(SD_Exp))
  )

# Perform k-means clustering; the seed makes the random starts reproducible
set.seed(42)
kmeans_result <- kmeans(
  data[, c("SD_CNA", "SD_Exp")],
  centers = n_clusters,
  iter.max = 50,
  nstart = 10
)

# Attach the cluster labels; kmeans() already numbers them 1..n_clusters
cluster_data <- data %>%
  mutate(Cluster = as.integer(kmeans_result$cluster)) %>%
  select(HGNC_Symbol, Cluster)

# Save output
write.csv(cluster_data, output_file, row.names = FALSE)

# Report the file that was actually written
print(paste0("✅ Clustering complete! File saved as ", basename(output_file)))


[1] "Column names in dataset:"
[1] "HGNC_Symbol" "Mean_CNA"    "SD_CNA"      "Mean_Exp"    "SD_Exp"     
[1] "✅ Clustering complete! File saved as RESgene_clusters_all.csv"


In [4]:
# Build and plot a random, sparse gene network: sample an equal number of
# genes from each cluster, connect 10% of all gene pairs at random, colour the
# nodes by cluster and save the drawing as a PNG.
# The edges are random placeholders, not measured gene interactions.

# Install required packages if not already installed
if (!requireNamespace("igraph", quietly = TRUE)) install.packages("igraph", dependencies = TRUE)
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2", dependencies = TRUE)
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr", dependencies = TRUE)
if (!requireNamespace("ggraph", quietly = TRUE)) install.packages("ggraph", dependencies = TRUE)
if (!requireNamespace("tidygraph", quietly = TRUE)) install.packages("tidygraph", dependencies = TRUE)

# Load libraries
library(igraph)
library(ggplot2)
library(dplyr)
library(ggraph)
library(tidygraph)

# Define file path (Update as per Colab)
input_file <- "/content/sample_data/RESgene_clusters_all_4.csv"

# Load the cluster data
clusters <- read.csv(input_file, stringsAsFactors = FALSE)
colnames(clusters) <- c("HGNC_Symbol", "Cluster")  # Ensure correct column names

# Vertex names must be unique, so keep one row per gene symbol
clusters <- clusters %>% distinct(HGNC_Symbol, .keep_all = TRUE)

# Determine the number of clusters
num_clusters <- length(unique(clusters$Cluster))
# Split roughly 100 genes equally across clusters; a smaller total (e.g. 50)
# gives a less crowded picture
genes_per_cluster <- floor(100 / num_clusters)

# Sample genes equally from each cluster.
# slice_sample() takes the whole cluster when it holds fewer genes than
# requested, instead of raising an error.
set.seed(42)  # Ensure reproducibility
selected_genes <- clusters %>%
  group_by(Cluster) %>%
  slice_sample(n = genes_per_cluster) %>%
  ungroup()

# Display selected genes count
print(table(selected_genes$Cluster))

if (nrow(selected_genes) < 2) {
  stop("Need at least two genes to build a network")
}

# Generate a sparse random edge list
set.seed(42)

# Enumerate every unordered gene pair exactly once. This excludes self-loops
# and avoids listing both (a, b) and (b, a), which would create duplicate
# edges in an undirected graph.
gene_pairs <- t(combn(selected_genes$HGNC_Symbol, 2))
edges <- data.frame(
  gene1 = gene_pairs[, 1],
  gene2 = gene_pairs[, 2],
  stringsAsFactors = FALSE
)

# Randomly select 10% of possible edges for a sparse network
n_edges <- round(0.10 * nrow(edges))
edges <- edges[sample.int(nrow(edges), size = n_edges), ]

# Convert to igraph object.
# Passing the gene table as `vertices` ties each gene to its own Cluster value
# by name (not by position) and keeps genes that received no edge.
graph <- graph_from_data_frame(
  edges,
  directed = FALSE,
  vertices = as.data.frame(selected_genes)
)

# Convert to a tidygraph object for visualization; the Cluster vertex
# attribute becomes a factor so it is drawn with a discrete colour scale
graph_tbl <- as_tbl_graph(graph) %>%
  mutate(cluster = as.factor(Cluster))

# Build the network plot.
# Edge transparency and colour are fixed settings, so they are given outside
# aes(); inside aes() they would be treated as data and add a legend entry.
network_plot <- ggraph(graph_tbl, layout = "fr") +
  geom_edge_link(edge_alpha = 0.5, edge_colour = "gray") +
  geom_node_point(aes(color = cluster), size = 4) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  theme_graph()

# Save the network visualization.
# print() draws the plot on the PNG device explicitly (needed when the code is
# not run at top level), and `finally` closes the device even if drawing fails.
output_network <- "/content/gene_network_balanced_100.png"
png(output_network, width = 800, height = 600)
invisible(tryCatch(print(network_plot), finally = dev.off()))

cat("Network image saved to:", output_network, "\n")


 1  2  3 
33 33 33 


Network image saved to: /content/gene_network_balanced_100.png 
