In [30]:
# library(wordnet)
library(text2vec)
library(stringr)
library(topicmodels)
library(caret)
library(glmnet)
library(kernlab)
library(rpart)
library(gbm)
library(randomForest)
library(tidyverse)
library(MLmetrics)

In [31]:
#' Label Mapping Function

create_label_mapping <- function() {
  # Create mapping from original labels to valid R variable names
  label_map <- c(
    "0" = "correct",
    "1" = "contradictory",
    "2" = "partially_correct",
    "3" = "irrelevant",
    "4" = "non_domain"
  )
  return(label_map)
}

In [32]:
#' Reverse Label Mapping Function

reverse_label_mapping <- function() {
  # Create reverse mapping from valid R variable names to original labels
  label_map <- c(
    "correct" = 0,
    "contradictory" = 1,
    "partially_correct" = 2,
    "irrelevant" = 3,
    "non_domain" = 4
  )
  return(label_map)
}


# Feature Engineering Functions

## 1. Semantic Similarity Features

In [34]:
initDict()


compute_wordnet_similarity <- function(text1, text2, measure) {
  # Tokenize texts
  words1 <- unlist(strsplit(tolower(text1), "\\W+"))
  words2 <- unlist(strsplit(tolower(text2), "\\W+"))
  
  # Get synsets for each word
  synsets1 <- lapply(words1, function(w) getSynsets(w, "NOUN"))
  synsets2 <- lapply(words2, function(w) getSynsets(w, "NOUN"))
  
  # Compute similarities based on measure
  similarities <- matrix(NA, length(synsets1), length(synsets2))
  for(i in seq_along(synsets1)) {
    for(j in seq_along(synsets2)) {
      if(length(synsets1[[i]]) > 0 && length(synsets2[[j]]) > 0) {
        sim <- switch(measure,
                     "path" = getPathSimilarity(synsets1[[i]][[1]], synsets2[[j]][[1]]),
                     "lch" = getLCHSimilarity(synsets1[[i]][[1]], synsets2[[j]][[1]]),
                     "wup" = getWUPSimilarity(synsets1[[i]][[1]], synsets2[[j]][[1]]),
                     # Add other WordNet measures here
                     0)
        similarities[i,j] <- sim
      }
    }
  }
  
  # Return average similarity
  mean(similarities, na.rm = TRUE)
}

## 2. Lexical Overlap Features

In [33]:
compute_lexical_overlap <- function(text1, text2) {
  # Tokenize texts
  words1 <- unlist(strsplit(tolower(text1), "\\W+"))
  words2 <- unlist(strsplit(tolower(text2), "\\W+"))
  
  # Jaccard similarity
  intersection <- length(intersect(words1, words2))
  union <- length(unique(c(words1, words2)))
  jaccard <- if(union > 0) intersection / union else 0
  
  # Simple word overlap
  simple_overlap <- if(min(length(words1), length(words2)) > 0) 
    intersection / min(length(words1), length(words2)) else 0
  
  # Return as named vector
  return(c(jaccard = jaccard, simple_overlap = simple_overlap))
}

## 3. TF-IDF Features

In [35]:
compute_tfidf <- function(corpus) {
  # Create vocabulary
  it <- itoken(corpus, preprocessor = tolower, 
              tokenizer = word_tokenizer)
  vocab <- create_vocabulary(it)
  
  # Prune vocabulary
  vocab <- prune_vocabulary(vocab, term_count_min = 3)
  
  # Create document-term matrix
  vectorizer <- vocab_vectorizer(vocab)
  dtm <- create_dtm(it, vectorizer)
  
  # Compute TF-IDF
  tfidf <- TfIdf$new()
  dtm_tfidf <- fit_transform(dtm, tfidf)
  
  # Convert to dense matrix and then to data frame
  dense_matrix <- as.matrix(dtm_tfidf)
  
  # Limit number of features to prevent memory issues
  if(ncol(dense_matrix) > 100) {
    dense_matrix <- dense_matrix[, 1:100]
  }
  
  # Name columns
  colnames(dense_matrix) <- paste0("tfidf_", 1:ncol(dense_matrix))
  
  return(as.data.frame(dense_matrix))
}

## 4. Topic Modeling Features

In [36]:
compute_lda_features <- function(dtm, k = 10) {
  # Fit LDA model
  lda_model <- LDA(dtm, k = k)
  
  # Get document-topic distributions
  doc_topics <- posterior(lda_model)$topics
  
  return(doc_topics)
}

## 5. LSA Features

In [37]:
compute_lsa_features <- function(dtm, dims = 100) {
  # Perform SVD
  svd_model <- irlba::irlba(dtm, nv = dims)
  
  # Get document embeddings
  doc_embeddings <- dtm %*% svd_model$v
  
  return(doc_embeddings)
}

## 6. Top Scorer Features

In [38]:
compute_top_scorer_features <- function(student_answer, reference_answer, 
                                      top_answers, p = 5) {
  # Combine reference answer with top P answers
  enhanced_reference <- paste(c(reference_answer, 
                              head(top_answers, p)), 
                            collapse = " ")
  
  # Compute LSA similarity with enhanced reference
  lsa_sim <- cosine(as.matrix(student_answer), 
                    as.matrix(enhanced_reference))
  
  return(lsa_sim)
}

# Main Feature Extraction Function

In [50]:
extract_features <- function(data) {
  # Initialize matrix for lexical features
  n_samples <- nrow(data)
  lex_features <- matrix(0, nrow = n_samples, ncol = 2)
  colnames(lex_features) <- c("jaccard", "simple_overlap")
  
  # Compute lexical overlap features
  for(i in 1:n_samples) {
    lex_features[i,] <- compute_lexical_overlap(
      data$reference_answer[i], 
      data$student_answer[i]
    )
  }
  
  # Convert lexical features to data frame
  features_df <- as.data.frame(lex_features)
  
  # Compute and add TF-IDF features
  tfidf_features <- compute_tfidf(data$student_answer)
  
  # Combine features
  features_df <- cbind(features_df, tfidf_features)
  
  # Replace any NAs with 0
  features_df[is.na(features_df)] <- 0
  
  return(features_df)
}

# Model Training Functions

In [49]:
train_base_models <- function(features, labels) {
  # Set up cross-validation
  ctrl <- trainControl(method = "cv", 
                      number = 5,
                      classProbs = TRUE)
  
  # Train base models with error handling
  models <- list()
  
  tryCatch({
    models$tree <- train(x = features, y = labels, 
                        method = "rpart", trControl = ctrl)
  }, error = function(e) {
    warning("Error in training tree model: ", e$message)
  })
  
  tryCatch({
    models$rf <- train(x = features, y = labels, 
                      method = "rf", 
                      trControl = ctrl,
                      ntree = 50)  # Reduced for speed
  }, error = function(e) {
    warning("Error in training RF model: ", e$message)
  })
  
  tryCatch({
    models$svm <- train(x = features, y = labels, 
                       method = "svmLinear", 
                       trControl = ctrl)
  }, error = function(e) {
    warning("Error in training SVM model: ", e$message)
  })
  
  return(models)
}

# Stacked Ensemble Function

In [41]:
train_stacked_ensemble <- function(base_predictions, labels) {
  # Combine base model predictions
  meta_features <- as.data.frame(base_predictions)
  
  # Train meta-learner (using elastic net)
  meta_model <- train(x = meta_features, y = labels,
                     method = "glmnet",
                     trControl = trainControl(method = "cv", number = 5),
                     tuneGrid = expand.grid(alpha = seq(0, 1, 0.1),
                                          lambda = seq(0.0001, 1, length = 20)))
  
  return(meta_model)
}

# Evaluation Function

In [42]:
evaluate_predictions <- function(predictions, actual) {
  # Convert factors to numeric if necessary
  if(is.factor(predictions)) {
    predictions <- as.numeric(as.character(predictions))
  }
  if(is.factor(actual)) {
    actual <- as.numeric(as.character(actual))
  }
  
  # Calculate metrics
  rmse <- sqrt(mean((predictions - actual)^2))
  
  # Calculate F1 score (treating as multi-class classification)
  f1 <- F1_Score(y_pred = predictions, 
                 y_true = actual, 
                 positive = unique(actual))
  
  return(list(
    RMSE = rmse,
    F1 = f1
  ))
}

In [44]:
# options(warn=-1)

# Main Function

In [51]:
asag_system <- function(train_data, test_data) {
  # Map labels to valid R variable names
  label_map <- create_label_mapping()
  reverse_map <- reverse_label_mapping()
  
  # Convert labels in training and test data
  train_data$label <- as.factor(label_map[as.character(train_data$label)])
  test_data$label <- as.factor(label_map[as.character(test_data$label)])
  
  # Extract features
  train_features <- extract_features(train_data)
  test_features <- extract_features(test_data)
  
  # Ensure feature matrices have same columns
  common_cols <- intersect(colnames(train_features), colnames(test_features))
  train_features <- train_features[, common_cols]
  test_features <- test_features[, common_cols]
  
  # Train base models
  base_models <- train_base_models(train_features, train_data$label)
  
  # Get base model predictions
  base_predictions <- lapply(base_models, predict, newdata = train_features)
  base_predictions_test <- lapply(base_models, predict, newdata = test_features)
  
  # Train stacked ensemble
  ensemble_model <- train_stacked_ensemble(do.call(cbind, base_predictions), 
                                         train_data$label)
  
  # Make final predictions
  final_predictions <- predict(ensemble_model, 
                             newdata = as.data.frame(do.call(cbind, 
                                                           base_predictions_test)))
  
  # Convert predictions back to original numeric labels
  final_predictions_numeric <- as.numeric(as.character(reverse_map[final_predictions]))
  actual_numeric <- as.numeric(as.character(reverse_map[test_data$label]))
  
  # Evaluate predictions
  evaluation <- evaluate_predictions(final_predictions_numeric, actual_numeric)
  
  return(list(
    base_models = base_models,
    ensemble_model = ensemble_model,
    predictions = final_predictions_numeric,
    evaluation = evaluation
  ))
}

# Load and prepare data
data <- read.csv("/kaggle/input/asag-data/train.csv")

# Create train-test split (80-20)
set.seed(42)
train_index <- createDataPartition(data$label, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Run the ASAG system
result <- asag_system(train_data, test_data)

# print("Evaluation Metrics:")
# print(paste("RMSE:", result$evaluation$RMSE))
# print(paste("F1 Score:", result$evaluation$F1))

# # Print confusion matrix
# conf_matrix <- table(Predicted = result$predictions, 
#                     Actual = test_data$label)
# print("Confusion Matrix:")
# print(conf_matrix)

maximum number of iterations reached -8.436034e-05 -8.436536e-05maximum number of iterations reached 0.003168019 0.003080169maximum number of iterations reached 0.0004422935 0.0004422443maximum number of iterations reached -0.0001451932 0.0001452014maximum number of iterations reached 0.00098097 0.0009786619maximum number of iterations reached 0.001846696 0.00182968maximum number of iterations reached 0.0008518131 -0.000851444maximum number of iterations reached 0.004379824 0.004263404maximum number of iterations reached 0.0005451275 -0.0005448291maximum number of iterations reached 0.0005702644 0.0005701718maximum number of iterations reached 0.002266109 0.002255546maximum number of iterations reached 1.554261e-05 1.554358e-05maximum number of iterations reached 0.001285683 -0.00128454maximum number of iterations reached 0.001689908 0.001684898maximum number of iterations reached 0.0003761772 0.0003761393maximum number of iterations reached 0.0009771355 -0.0009768019maximum number of 

In [52]:
# Print evaluation metrics

print("Evaluation Metrics:")
print(paste("RMSE:", result$evaluation$RMSE))
print(paste("F1 Score:", result$evaluation$F1))

# Print confusion matrix
conf_matrix <- table(Predicted = result$predictions, 
                    Actual = test_data$label)
print("Confusion Matrix:")
print(conf_matrix)

[1] "Evaluation Metrics:"
[1] "RMSE: 1.75086384209292"
[1] "F1 Score: 0.772667542706964"
[1] "Confusion Matrix:"
         Actual
Predicted   0   1   2   3   4
        1 294  46 158  83   1
        2  49  23  41  99   2
        4  59  30  65  42   0
