In [5]:
# Switch the session to the project folder so that the relative file names
# used when loading data resolve against it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running
# on another machine.
setwd("C:\\Users\\Ermias\\Documents\\data visualization\\R_project")


In [36]:
# --- Load Data ---
# Expression table: the first line of the file is skipped, the following line
# is taken as the header and the first column supplies the row names.
# (The file was previously read twice in a row; the first result was
# overwritten immediately, so only the effective call is kept.)
expr_matrix <- read.delim(
  "GSM1172856_CAMA1.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE,
  skip = 1
)
# Missing expression values are treated as zero.
expr_matrix[is.na(expr_matrix)] <- 0

annotation_df <- read.delim(
  "SraRunTable_GSE48216.pam50.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# --- Clean Annotation ---
# Each entry of the X.Group column holds "<Run>,<Group>"; split it on the comma
# and bind the pieces into a two-column data frame.
# NOTE(review): assumes every entry splits into exactly two fields — verify
# against the annotation file.
split_group <- strsplit(as.character(annotation_df$X.Group), ",")
annotation_clean <- do.call(rbind, split_group)
colnames(annotation_clean) <- c("Run", "Group")
annotation_clean <- as.data.frame(annotation_clean, stringsAsFactors = FALSE)

# --- Match Sample ---
sample_ids <- "GSM1172856"  # Update if you have more samples
matched_anno <- annotation_clean[match(sample_ids, annotation_clean$Run), , drop = FALSE]
annotation_vector <- matched_anno$Group

# --- Check validity ---
# anyNA() yields a single TRUE/FALSE, so the condition stays valid when
# sample_ids holds several IDs (a vector-valued is.na() inside `||` is an
# error in R >= 4.3).
if (length(annotation_vector) < 1 || anyNA(annotation_vector)) {
  stop("❌ Invalid or missing group label for the sample.")
}

# --- Filter Genes ---
if (ncol(expr_matrix) < 2) {
  # With one sample, var() of each gene's single value is NA. Selecting the
  # "top 1000 by variance" therefore picked 1000 NA row names and replaced the
  # whole table with NA rows. The filter is skipped and the table left intact.
  warning(
    "Only one sample column: per-gene variance is undefined, so the ",
    "top-variance gene filter is skipped. Hobotnica needs at least two ",
    "samples and will return NA.",
    call. = FALSE
  )
} else {
  stop("⚠️ This script assumes a single sample is compared to a larger set. Please combine datasets if needed.")
}

# --- Distance Matrix ---
# Samples are columns of expr_matrix, so transpose before computing the
# sample-by-sample Euclidean distances.
dist_matrix <- as.matrix(dist(t(expr_matrix), method = "euclidean"))

# --- Hobotnica Functions ---
# Compute the Hobotnica H-score of a distance matrix for a given labelling.
#
# The pairwise distances are replaced by their ranks, the ranks of all
# within-class pairs are summed, and that sum is rescaled between its smallest
# and largest possible value; the result is 1 minus the rescaled sum, floored
# at 0.
#
# Arguments:
#   distMatrix  square sample-by-sample distance matrix (anything that
#               as.matrix() turns into one).
#   annotation  class label per sample, in the same order as the rows of
#               distMatrix. Labels that are NA match no class and are left out
#               of the class counts.
#
# Returns a single number in [0, 1], or NA_real_ when the score is undefined:
# fewer than two labelled samples, no within-class pair at all, or every pair
# being within-class (a single class). The last two cases previously produced
# NaN through a 0/0 division.
#
# Stops if distMatrix is not square or its size differs from
# length(annotation).
Hobotnica <- function(distMatrix, annotation){
  annotation <- as.vector(annotation)
  rank.m <- as.matrix(distMatrix)
  if (nrow(rank.m) != ncol(rank.m) || length(annotation) != nrow(rank.m)) {
    stop(
      "distMatrix must be square with one row/column per annotation entry.",
      call. = FALSE
    )
  }

  # Rank each triangle separately; the diagonal keeps its original values.
  lower <- lower.tri(rank.m)
  upper <- upper.tri(rank.m)
  rank.m[lower] <- rank(rank.m[lower])
  rank.m[upper] <- rank(rank.m[upper])

  classes <- unique(annotation)
  Ns <- numeric(length(classes))
  inclass_sum <- 0
  for (i in seq_along(classes)) {
    class_samples <- which(annotation == classes[i])
    Ns[i] <- length(class_samples)
    # The class sub-matrix contains both triangles, so each pair counts twice.
    inclass_sum <- inclass_sum + sum(rank.m[class_samples, class_samples])
  }

  Ns_sum <- sum(Ns)
  if (Ns_sum <= 1) return(NA_real_)

  biggest_possible_rank <- Ns_sum * (Ns_sum - 1) / 2
  number_of_unique_inclass_elements <- sum(Ns * (Ns - 1)) / 2
  maximal_value <- number_of_unique_inclass_elements *
    (2 * biggest_possible_rank - number_of_unique_inclass_elements + 1)
  minimal_value <- number_of_unique_inclass_elements *
    (1 + number_of_unique_inclass_elements)
  normalization_factor <- maximal_value - minimal_value

  # Zero when there are no within-class pairs or when all pairs are
  # within-class; the score cannot be normalised in either case.
  if (normalization_factor == 0) return(NA_real_)

  max(0, 1 - (inclass_sum - minimal_value) / normalization_factor)
}

# Build a permutation null distribution of Hobotnica scores.
#
# Arguments:
#   N           number of label permutations to draw (0 gives numeric(0)).
#   distMatrix  distance matrix passed unchanged to Hobotnica().
#   annotation  class labels; shuffled anew for every permutation.
#
# Returns a numeric vector of length N with one score per permutation.
Hobot_distr <- function(N, distMatrix, annotation){
  hobots <- numeric(N)
  n_labels <- length(annotation)
  # seq_len() iterates zero times for N == 0, whereas 1:0 would run twice.
  for (i in seq_len(N)) {
    # Shuffle by index: sample(annotation) would treat a single numeric label
    # k as the range 1:k instead of permuting the one label.
    permuted <- annotation[sample.int(n_labels)]
    hobots[i] <- Hobotnica(distMatrix, permuted)
  }
  hobots
}

# Empirical permutation p-value for an observed Hobotnica score.
#
# Arguments:
#   Test_hobot  observed H-score (a single value).
#   Hobots      H-scores obtained under permuted labels.
#
# Returns the fraction of permuted scores that are at least as large as the
# observed one, or NA_real_ when the observed score is NA. Larger H-scores mean
# better class separation, so the upper tail is the relevant one; counting
# scores <= observed gave p close to 1 for a well-separated labelling.
Hobot_pval <- function(Test_hobot, Hobots){
  if (is.na(Test_hobot)) return(NA_real_)
  mean(Hobots >= Test_hobot)
}

# --- Run Hobotnica ---
# Seed the RNG so the label permutations below are reproducible.
set.seed(42)
# Observed score, then 1000 scores under shuffled labels, then the p-value
# derived from comparing the two.
H_score <- Hobotnica(distMatrix = dist_matrix, annotation = annotation_vector)
null_dist <- Hobot_distr(
  N = 1000,
  distMatrix = dist_matrix,
  annotation = annotation_vector
)
p_val <- Hobot_pval(Test_hobot = H_score, Hobots = null_dist)

# --- Output ---
# Score rounded to four decimals; p-value formatted with four digits.
cat("✅ Hobotnica H-score:", round(H_score, digits = 4), "\n")
cat("✅ P-value:", format.pval(p_val, digits = 4), "\n")



✅ Hobotnica H-score: NA 
✅ P-value: NA 


In [17]:
# Check whether the ID "GSM1172856" occurs in the annotation's Run column;
# grep() returns the indices of matching rows (none if the ID is absent).
grep("GSM1172856", annotation_clean$Run)



In [20]:
# NOTE(review): this ID differs from the one in the expression file name loaded
# earlier ("GSM1172856_CAMA1.txt") — confirm the label belongs to that sample.
sample_ids <- "GSM1172882"  # Use the actual GEO ID

# Pick the annotation row(s) whose Run equals the requested ID(s); an ID that
# is not found yields a row of NA.
matched_annotation <- annotation_clean[
  match(x = sample_ids, table = annotation_clean$Run),
  ,
  drop = FALSE
]

# Group label(s) of the matched row(s).
annotation_vector <- matched_annotation[["Group"]]


In [21]:
# Show the distinct group labels currently held in annotation_vector.
print(unique(annotation_vector))


[1] "Non-malignant"


In [22]:
# Repeat the scoring with whatever annotation_vector currently holds; the seed
# is reset so the permutations are reproducible.
set.seed(42)
H_score <- Hobotnica(distMatrix = dist_matrix, annotation = annotation_vector)
null_dist <- Hobot_distr(
  N = 1000,
  distMatrix = dist_matrix,
  annotation = annotation_vector
)
p_val <- Hobot_pval(Test_hobot = H_score, Hobots = null_dist)

# Print the raw (unrounded) score and p-value.
cat("Hobotnica H-score:", H_score, "\n")
cat("P-value:", p_val, "\n")


Hobotnica H-score: NA 
P-value: NA 


In [1]:
# Report the session's current working directory.
getwd()