<a href="https://colab.research.google.com/github/DivyaMeenaSundaram/R-progrmming/blob/main/Impact_of_Missing_Data_on_Model_Accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -------------------------------------------------------
# Impact of Missing Data on Model Accuracy (NO caret used)
# -------------------------------------------------------

library(dplyr)

set.seed(123)

# -----------------------------------------
# 1. CREATE SIMPLE HEALTHCARE DATASET
# -----------------------------------------

patient <- data.frame(
  Age = round(runif(200, 30, 85)),
  Treatment = sample(c("DrugA", "DrugB"), 200, replace = TRUE),
  Health_Score = round(runif(200, 1, 10)),
  Status = sample(c("Alive", "Deceased"), 200, replace = TRUE)
)

patient$Status <- as.factor(patient$Status)
patient$Treatment <- as.factor(patient$Treatment)

# -----------------------------------------
# 2. FUNCTION TO INTRODUCE MISSING DATA
# -----------------------------------------
introduce_missing <- function(df, missing_percent) {
  df_missing <- df
  total_values <- prod(dim(df))
  num_missing <- round(total_values * missing_percent)

  missing_idx <- arrayInd(sample(total_values, num_missing), dim(df))

  for (i in 1:nrow(missing_idx)) {
    df_missing[missing_idx[i,1], missing_idx[i,2]] <- NA
  }
  return(df_missing)
}

# -----------------------------------------
# 3. FUNCTION: TRAIN MODEL + COMPUTE ACCURACY
# -----------------------------------------
get_accuracy <- function(df) {

  # Impute numeric columns using mean
  for (col in names(df)) {
    if (is.numeric(df[[col]])) {
      df[[col]][is.na(df[[col]])] <- mean(df[[col]], na.rm = TRUE)
    }
  }

  # Remove rows where Status is missing (if any)
  df <- df %>% filter(!is.na(Status))

  # Train-test split (70â€“30)
  set.seed(123)
  idx <- sample(1:nrow(df), 0.7 * nrow(df))
  train <- df[idx, ]
  test <- df[-idx, ]

  # Logistic Regression Model
  model <- glm(Status ~ Age + Treatment + Health_Score,
               data = train, family = binomial)

  # Predict on test data
  pred_prob <- predict(model, test, type = "response")
  preds <- ifelse(pred_prob > 0.5, "Deceased", "Alive")

  # Accuracy
  accuracy <- mean(preds == test$Status)
  return(accuracy)
}

# -----------------------------------------
# 4. CREATE MISSING DATA LEVELS
# -----------------------------------------
data_10 <- introduce_missing(patient, 0.10)
data_30 <- introduce_missing(patient, 0.30)
data_50 <- introduce_missing(patient, 0.50)

# -----------------------------------------
# 5. EVALUATE MODEL ACCURACY
# -----------------------------------------
acc_original <- get_accuracy(patient)
acc_10 <- get_accuracy(data_10)
acc_30 <- get_accuracy(data_30)
acc_50 <- get_accuracy(data_50)

# Print results
cat("Model Accuracy (0% Missing):  ", acc_original, "\n")
cat("Model Accuracy (10% Missing): ", acc_10, "\n")
cat("Model Accuracy (30% Missing): ", acc_30, "\n")
cat("Model Accuracy (50% Missing): ", acc_50, "\n")

# -----------------------------------------
# 6. Results Table
# -----------------------------------------
results <- data.frame(
  Missing_Level = c("0%", "10%", "30%", "50%"),
  Accuracy = c(acc_original, acc_10, acc_30, acc_50)
)

print(results)


Model Accuracy (0% Missing):   0.3833333 
Model Accuracy (10% Missing):  NA 
Model Accuracy (30% Missing):  NA 
Model Accuracy (50% Missing):  NA 
  Missing_Level  Accuracy
1            0% 0.3833333
2           10%        NA
3           30%        NA
4           50%        NA
