<a href="https://colab.research.google.com/github/AnanyaSourav/Minor-Skin-Cancer-Project/blob/main/RinColab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install only the packages that are not already available, so re-running this
# cell does not reinstall every package (and its dependencies) each time.
required_packages <- c("data.table", "ggplot2", "igraph", "gtools", "cluster", "factoextra")
is_installed <- vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
if (!all(is_installed)) {
    install.packages(required_packages[!is_installed])
}

# Load libraries
library(data.table)
library(ggplot2)
library(igraph)
library(gtools)
library(cluster)
library(factoextra)

# Define input/output paths
input_folder <- "/content/sample_data/merged"
output_folder <- "/content/sample_data/Analysis_Results"

# recursive = TRUE also creates any missing parent directory. Without it the
# call fails when the parent is absent, and because warnings are suppressed
# that failure would go unnoticed until the first write.
dir.create(output_folder, showWarnings = FALSE, recursive = TRUE)

# List all CSV files in the input folder.
# list.files() takes a regular expression, not a shell glob: "*.csv" would also
# match names such as "notes_csv.txt" or "data.csv.bak". Anchor on the extension.
file_list <- list.files(input_folder, pattern = "\\.csv$", full.names = TRUE)

# Read every input file, clean it, and collect the result in a list keyed by
# patient ID (the file name without its extension).
all_data <- list()

required_cols <- c("HGNC_Symbol", "log2R_CNA", "Z_Score")
numeric_cols <- c("log2R_CNA", "Z_Score")

for (file in file_list) {
    df <- fread(file)

    # Skip files that lack the expected columns, but say so instead of
    # dropping them silently.
    missing_cols <- setdiff(required_cols, colnames(df))
    if (length(missing_cols) > 0) {
        message(
            "Skipping ", basename(file), ": missing column(s) ",
            paste(missing_cols, collapse = ", ")
        )
        next
    }

    # Remove fully duplicated rows
    df <- unique(df)

    for (col in numeric_cols) {
        values <- df[[col]]

        # Impute missing values with the per-file median
        values[is.na(values)] <- median(values, na.rm = TRUE)

        # Standardize within the file. scale() returns a one-column matrix
        # carrying "scaled:center"/"scaled:scale" attributes; as.numeric()
        # strips those so the column is a plain numeric vector.
        df[, (col) := as.numeric(scale(values))]
    }

    # Store in list
    patient_id <- tools::file_path_sans_ext(basename(file))
    all_data[[patient_id]] <- df
}

# Convert list to combined data table.
# Fail early with a clear message when no file passed the format check;
# otherwise every later step would fail on an empty table with an obscure error.
if (length(all_data) == 0) {
    stop("No usable CSV files were loaded from ", input_folder, call. = FALSE)
}

# use.names = TRUE binds columns by name rather than by position, so files with
# a different column order are not mixed up; fill = TRUE tolerates files that
# carry extra columns.
combined_data <- rbindlist(all_data, idcol = "Patient_ID", use.names = TRUE, fill = TRUE)

# Per-gene summary: mean and standard deviation of the (standardized) CNA and
# expression values, ignoring missing entries.
variance_stats <- combined_data[
    ,
    list(
        Mean_CNA = mean(log2R_CNA, na.rm = TRUE),
        SD_CNA = sd(log2R_CNA, na.rm = TRUE),
        Mean_Exp = mean(Z_Score, na.rm = TRUE),
        SD_Exp = sd(Z_Score, na.rm = TRUE)
    ),
    by = "HGNC_Symbol"
]

# Write the summary table to the output folder
variance_stats_path <- file.path(output_folder, "variance_statistics.csv")
fwrite(variance_stats, variance_stats_path)

# Visualize variance distribution
variance_plot <- ggplot(variance_stats, aes(x = SD_CNA, y = SD_Exp)) +
    geom_point(alpha = 0.6) +
    labs(title = "Variance Distribution: CNA vs Expression",
         x = "CNA Standard Deviation",
         y = "Expression Standard Deviation") +
    theme_minimal()

png(file.path(output_folder, "variance_distribution.png"), width = 800, height = 600)
# A ggplot object is only drawn when it is printed. Print it explicitly so the
# PNG is not left empty when this code is source()d or moved into a function or
# loop, where top-level auto-printing does not happen.
print(variance_plot)
dev.off()

# Network Analysis: Build gene interaction graph

# Per-gene correlation between copy number and expression across patients.
graph_data <- combined_data[, .(
    Correlation = cor(log2R_CNA, Z_Score, use = "complete.obs")
), by = HGNC_Symbol]

# Keep genes with a strong CNA-expression correlation
# (data.table drops rows where the condition is NA).
graph_data <- graph_data[abs(Correlation) > 0.5]

# graph_data has one row per gene, not one row per gene pair. Passing it
# directly to graph_from_data_frame() makes igraph read the first two columns
# as an edge list, which turns every correlation value into a vertex.
# NOTE(review): the edges below connect selected genes whose expression
# profiles correlate across patients (|r| > 0.5). This assumes a co-expression
# network is what "gene interaction" is meant to show -- confirm.
edge_list <- data.frame(
    from = character(),
    to = character(),
    correlation = numeric(),
    stringsAsFactors = FALSE
)

if (nrow(graph_data) >= 2) {
    # Patients in rows, selected genes in columns. Repeated gene entries
    # within a patient are averaged.
    expr_wide <- dcast(
        combined_data[HGNC_Symbol %in% graph_data$HGNC_Symbol],
        Patient_ID ~ HGNC_Symbol,
        value.var = "Z_Score",
        fun.aggregate = mean
    )
    expr_matrix <- as.matrix(expr_wide[, -1, with = FALSE])

    # Constant columns only produce "standard deviation is zero" warnings and
    # NA correlations, which are filtered out below.
    gene_cor <- suppressWarnings(cor(expr_matrix, use = "pairwise.complete.obs"))

    # Keep each unordered pair once and drop self-correlations
    gene_cor[lower.tri(gene_cor, diag = TRUE)] <- NA
    strong_pairs <- which(abs(gene_cor) > 0.5, arr.ind = TRUE)

    # The attribute is deliberately not called "weight": igraph layouts treat
    # "weight" as edge weights and reject negative values.
    edge_list <- data.frame(
        from = rownames(gene_cor)[strong_pairs[, "row"]],
        to = colnames(gene_cor)[strong_pairs[, "col"]],
        correlation = gene_cor[strong_pairs],
        stringsAsFactors = FALSE
    )
}

# Create graph: vertices are the selected genes (with their CNA-expression
# correlation as a vertex attribute), edges are the co-expression pairs.
gene_graph <- graph_from_data_frame(
    edge_list,
    directed = FALSE,
    vertices = as.data.frame(graph_data)
)
png(file.path(output_folder, "gene_network.png"), width = 800, height = 600)
plot(gene_graph, vertex.size = 5, vertex.label.cex = 0.7, main = "Gene Interaction Network")
dev.off()

# Clustering patients based on CNA-Expression profiles

# One row per patient, one column per gene. fun.aggregate = mean is given
# explicitly: without it dcast falls back to length() whenever a gene occurs
# more than once for a patient, silently replacing CNA values with counts.
patient_wide <- dcast(
    combined_data,
    Patient_ID ~ HGNC_Symbol,
    value.var = "log2R_CNA",
    fun.aggregate = mean
)
patient_matrix <- as.matrix(patient_wide[, -1, with = FALSE])  # Remove Patient_ID column

# Keep the patient IDs as row names so the dendrogram leaves are labelled with
# patients rather than row numbers.
rownames(patient_matrix) <- patient_wide$Patient_ID

# Genes missing for a patient become 0 (is.na() also covers NaN)
patient_matrix[is.na(patient_matrix)] <- 0

if (nrow(patient_matrix) >= 2) {
    # Perform hierarchical clustering
    dist_matrix <- dist(patient_matrix, method = "euclidean")
    hc <- hclust(dist_matrix, method = "ward.D2")

    # Save dendrogram
    png(file.path(output_folder, "patient_clustering.png"), width = 800, height = 600)
    plot(hc, main = "Hierarchical Clustering of Patients", xlab = "", sub = "")
    dev.off()
} else {
    # hclust() needs at least two observations; skip instead of aborting the
    # rest of the script.
    warning("Fewer than two patients available; skipping hierarchical clustering.",
            call. = FALSE)
}

# Write the combined, cleaned per-patient table to the output folder
final_dataset_path <- file.path(output_folder, "final_processed_data.csv")
fwrite(combined_data, file = final_dataset_path)


Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘rbibutils’, ‘Deriv’, ‘microbenchmark’, ‘Rdpack’, ‘doBy’, ‘SparseM’, ‘MatrixModels’, ‘minqa’, ‘nloptr’, ‘reformulas’, ‘RcppEigen’, ‘lazyeval’, ‘carData’, ‘Formula’, ‘pbkrtest’, ‘quantreg’, ‘lme4’, ‘crosstalk’, ‘estimability’, ‘numDeriv’, ‘mvtnorm’, ‘corrplot’, ‘viridis’, ‘car’, ‘DT’, ‘ellipse’, ‘emmeans’, ‘flashClust’, ‘leaps’, ‘multcompView’, ‘scatterplot3d’, ‘ggsci’, ‘cowplot’, ‘ggsignif’, ‘gridExtra’, ‘polynom’, ‘rstatix’, ‘plyr’, ‘abind’, ‘dendextend’, ‘FactoMineR’, ‘ggpubr’, ‘reshape2’, ‘ggrepel’



Attaching package: ‘igraph’


The following objects are masked from ‘package:stats’:

    decompose, spectrum


The following object is masked from ‘package:base’:

    union



Attaching package: ‘gtools’


The following object is masked from ‘package:igraph’:

    permute


Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa



In [2]:
# Install dplyr only if it is not already installed, as the comment intends;
# an unconditional install would re-download the package on every run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", dependencies = TRUE)
}

# Load required library
library(dplyr)

# Input and output locations (Google Colab file system).
# NOTE(review): the first cell writes variance_statistics.csv into
# /content/sample_data/Analysis_Results, while this path points one level up --
# confirm which copy is meant to be read here.
output_file <- "/content/sample_data/gene_variability_categories_2.csv"
input_file <- "/content/sample_data/variance_statistics.csv"

# Load the per-gene variance statistics and drop every row that has a missing
# value in any column.
data <- na.omit(read.csv(input_file, stringsAsFactors = FALSE))

# Assign each value to a percentile-based category.
#
# Values at or below the 33rd percentile are "Low", values above the 33rd and
# at or below the 66th percentile are "Moderate", and values above the 66th
# percentile are "High". Missing values stay NA.
#
# values: numeric vector.
# Returns a factor with levels "Low", "Moderate", "High", the same length as
# `values`.
categorize <- function(values) {
  labels <- c("Low", "Moderate", "High")
  quantiles <- quantile(values, probs = c(0.33, 0.66), na.rm = TRUE)

  # Count how many thresholds each value exceeds. This yields the same
  # right-closed intervals as cut() on c(-Inf, q33, q66, Inf), but it also
  # works when both percentiles coincide (e.g. many identical values), where
  # cut() stops with "'breaks' are not unique".
  category_index <- 1L + (values > quantiles[[1]]) + (values > quantiles[[2]])

  factor(labels[category_index], levels = labels)
}

# Add one variability category column per measure (CNA and expression)
data <- mutate(
  data,
  CNA_Category = categorize(SD_CNA),
  Exp_Category = categorize(SD_Exp)
)

# Write the categorized table without row names
write.csv(data, file = output_file, row.names = FALSE)

print("Gene variability file successfully generated!")


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



[1] "Gene variability file successfully generated!"
