In [14]:
# Data Preprocessing
library(tidyverse)
library(caret)

# Import the dataset
dataset <- read.csv('data/raw_data.csv')

# Understand the dimension, first few rows of the dataset, and its structure
dim(dataset)
head(dataset)
str(dataset)

Unnamed: 0_level_0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,⋯,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
Unnamed: 0_level_1,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,⋯,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.9342,48.14165,15.347
2,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,⋯,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.9342,48.14165,15.347
3,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,⋯,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.0048,48.13413,15.436
4,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,⋯,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.5346,48.28521,15.597
5,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,⋯,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.7549,48.2262,15.509
6,6,10872983,K00756.01,Kepler-228 d,CONFIRMED,CANDIDATE,1.0,0,0,0,⋯,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.2861,48.22467,15.714


'data.frame':	9564 obs. of  50 variables:
 $ rowid            : int  1 2 3 4 5 6 7 8 9 10 ...
 $ kepid            : int  10797460 10797460 10811496 10848459 10854555 10872983 10872983 10872983 6721123 10910878 ...
 $ kepoi_name       : chr  "K00752.01" "K00752.02" "K00753.01" "K00754.01" ...
 $ kepler_name      : chr  "Kepler-227 b" "Kepler-227 c" "" "" ...
 $ koi_disposition  : chr  "CONFIRMED" "CONFIRMED" "FALSE POSITIVE" "FALSE POSITIVE" ...
 $ koi_pdisposition : chr  "CANDIDATE" "CANDIDATE" "FALSE POSITIVE" "FALSE POSITIVE" ...
 $ koi_score        : num  1 0.969 0 0 1 1 1 0.992 0 1 ...
 $ koi_fpflag_nt    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ koi_fpflag_ss    : int  0 0 1 1 0 0 0 0 1 0 ...
 $ koi_fpflag_co    : int  0 0 0 0 0 0 0 0 1 0 ...
 $ koi_fpflag_ec    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ koi_period       : num  9.49 54.42 19.9 1.74 2.53 ...
 $ koi_period_err1  : num  2.78e-05 2.48e-04 1.49e-05 2.63e-07 3.76e-06 ...
 $ koi_period_err2  : num  -2.78e-05 -2.48e-04 -1.49e-05 -2.63e-07

In [15]:
# Get a summary of the dataset statistics
summary(dataset)

     rowid          kepid           kepoi_name        kepler_name       
 Min.   :   1   Min.   :  757450   Length:9564        Length:9564       
 1st Qu.:2392   1st Qu.: 5556034   Class :character   Class :character  
 Median :4782   Median : 7906892   Mode  :character   Mode  :character  
 Mean   :4782   Mean   : 7690628                                        
 3rd Qu.:7173   3rd Qu.: 9873067                                        
 Max.   :9564   Max.   :12935144                                        
                                                                        
 koi_disposition    koi_pdisposition     koi_score      koi_fpflag_nt   
 Length:9564        Length:9564        Min.   :0.0000   Min.   :0.0000  
 Class :character   Class :character   1st Qu.:0.0000   1st Qu.:0.0000  
 Mode  :character   Mode  :character   Median :0.3340   Median :0.0000  
                                       Mean   :0.4808   Mean   :0.1882  
                                       3rd Qu.:0.99

In [16]:
# Remove irrelevant columns (ie. ID, names, disposition, and score columns)
dataset <- dataset [, !(names(dataset) %in% c('rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_pdisposition', 'koi_score'))]

In [17]:
# Check for missing values
colSums(is.na(dataset))

In [18]:
# Handle missing values by calculating the median for numerical columns
dataset <- dataset %>% mutate(across(where(is.numeric), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))

In [19]:
# Only keep the confirmed and candidate planets
dataset <- dataset %>% filter(koi_disposition %in% c('CONFIRMED', 'CANDIDATE'))

In [20]:
# Transform target column to binary (1 = CANDIDATE, 0 = CONFIRMED)
dataset$koi_disposition <- ifelse(dataset$koi_disposition == "CANDIDATE", 1, 0)

In [21]:
# Handle categorical variables by creating dummy variables
if("koi_tce_delivname" %in% names(dataset)) {
  # For categorical variable, use mode (most frequent value) instead of mean
  # Calculate mode
  mode_value <- names(sort(table(dataset$koi_tce_delivname), decreasing = TRUE))[1]
  
  # Replace NAs with mode
  dataset$koi_tce_delivname[is.na(dataset$koi_tce_delivname)] <- mode_value
  
  # Create dummy variables for koi_tce_delivname
  dummy_model <- dummyVars(~ koi_tce_delivname, data = dataset, fullRank = TRUE)
  dummy_vars <- predict(dummy_model, dataset)
  
  # Remove original categorical column and add dummy variables
  dataset <- dataset[, names(dataset) != "koi_tce_delivname"]
  dataset <- cbind(dataset, dummy_vars)
}

In [22]:
# Remove columns that are completely NA
dataset <- dataset %>% select(where(~!all(is.na(.))))

In [23]:
# Impute remaining missing values in numeric columns (median imputation)
# Separate target variable
y <- dataset$koi_disposition
X <- dataset[, names(dataset) != "koi_disposition"]

# Impute missing values in X
for(col in names(X)) {
  if(is.numeric(X[[col]]) && any(is.na(X[[col]]))) {
    X[[col]][is.na(X[[col]])] <- median(X[[col]], na.rm = TRUE)
  }
}

# Verify no missing values remain
cat("Missing values remaining in X:", sum(is.na(X)), "\n")

Missing values remaining in X: 0 


In [24]:
# Split into training (70%) and testing (30%) sets
train_indices <- createDataPartition(y, p = 0.70, list = FALSE)

X_train <- X[train_indices, ]
X_test <- X[-train_indices, ]
y_train <- y[train_indices]
y_test <- y[-train_indices]

# Check dimensions
cat("Training set size:", nrow(X_train), "\n")
cat("Test set size:", nrow(X_test), "\n")
cat("Training target distribution:\n")
print(table(y_train))
cat("Test target distribution:\n")
print(table(y_test))

Training set size: 3179 
Test set size: 1362 
Training target distribution:
y_train
   0    1 
1610 1569 
Test target distribution:
y_test
  0   1 
683 679 


In [25]:
# Scale the data using StandardScaler (center and scale)
# Learn scaling parameters from training data only
preProc_scale <- preProcess(X_train, method = c("center", "scale"))

# Apply scaling to both training and test sets
X_train_scaled <- predict(preProc_scale, X_train)
X_test_scaled <- predict(preProc_scale, X_test)

# Verify scaling worked (mean should be ~0, sd should be ~1 for training data)
cat("\nSample column statistics after scaling (training data):\n")
cat("Mean of first numeric column:", mean(X_train_scaled[[1]]), "\n")
cat("SD of first numeric column:", sd(X_train_scaled[[1]]), "\n")

# Final datasets ready for modeling
cat("\n=== Preprocessing Complete ===\n")
cat("X_train_scaled: ", nrow(X_train_scaled), "rows x", ncol(X_train_scaled), "columns\n")
cat("X_test_scaled: ", nrow(X_test_scaled), "rows x", ncol(X_test_scaled), "columns\n")
cat("y_train: ", length(y_train), "values\n")
cat("y_test: ", length(y_test), "values\n")


Sample column statistics after scaling (training data):
Mean of first numeric column: 5.43907e-18 
SD of first numeric column: 1 

=== Preprocessing Complete ===
X_train_scaled:  3179 rows x 43 columns
X_test_scaled:  1362 rows x 43 columns
y_train:  3179 values
y_test:  1362 values


In [26]:
# Save processed data to CSV files
write.csv(cbind(X_train_scaled, koi_disposition = y_train), 'data/processed_train_data.csv', row.names = FALSE)
write.csv(cbind(X_test_scaled, koi_disposition = y_test), 'data/processed_test_data.csv', row.names = FALSE)

In [27]:
# Save training data
write.csv(X_train_scaled, "data/X_train_scaled.csv", row.names = FALSE)
write.csv(X_test_scaled, "data/X_test_scaled.csv", row.names = FALSE)

# Save target variables
write.csv(data.frame(koi_disposition = y_train), "data/y_train.csv", row.names = FALSE)
write.csv(data.frame(koi_disposition = y_test), "data/y_test.csv", row.names = FALSE)