In [None]:
library(ggplot2)
library(GGally)
library(rpart)
library(e1071)



In [None]:
# Load the Boston housing CSV; the first column of the file is used as the
# row names of the resulting data frame.
data <- read.csv(
  file = '/kaggle/input/boston-house-price-data/boston.csv',
  row.names = 1
)


**Crime Rate (CRIM): The average number of crimes per person in a town.**

**Residential Zone (ZN): The proportion of land designated for large residential lots over 25000 sq ft.**

**Business Proportion (INDUS): The proportion of non-retail business space in the town.**

**Proximity to River (CHAS): Whether or not the property is near the Charles River (1 for yes, 0 for no).**

**Air Quality (NOX): Concentration of harmful nitric oxides in the air.**

**Average Rooms (RM): The average number of rooms per house in the town.**

**Age of Houses (AGE): Proportion of owner-occupied houses that were built before 1940.**

**Distance to Employment (DIS): Average distance to five major employment centers in Boston.**

**Highway Accessibility (RAD): How easily accessible the town is to radial highways.**

**Property Tax (TAX): The property tax rate per $10,000 valuation.**

**Student-Teacher Ratio (PTRATIO): The ratio of students to teachers in the town's schools.**

**Ethnicity Factor (B): A derived factor influenced by the proportion of Black residents in the town.**

**Lower Status Population (LSTAT): Percentage of the population with lower socioeconomic status.**

**Median Home Price (PRICE): The median value of homes that are occupied by their owners, measured in thousands of dollars.**


original research paper: https://deepblue.lib.umich.edu/bitstream/handle/2027.42/22636/0000186.pdf?sequence=1&isAllowed=y

In [None]:
# Quick sanity report: dataset size, column names, and whether the data
# contains missing values or duplicated rows.
shape <- c(nrow(data), ncol(data))
n_rows <- shape[1]
n_cols <- shape[2]
cat("Shape of data:", n_rows, "rows and", n_cols, "columns.\n")

column_names <- names(data)
cat("\n", "Column names of data:", column_names, "\n")

# is.na() flags both NA and NaN entries.
any_na <- sum(is.na(data)) > 0
cat("\n", "Any NaN values in data:", any_na, "\n")

# duplicated() marks rows that repeat an earlier row.
any_duplicates <- sum(duplicated(data)) > 0
cat("\n", "Any duplicated values in data:", any_duplicates, "\n")


In [None]:
# Print both headings, then the compact per-column structure of the data.
cat(
  "Basic information about the data:\n",
  "Structure of the data:\n",
  sep = ""
)
str(object = data)


In [None]:
# Per-column summary (min, quartiles, mean, max) of the raw data.
cat("Summary statistics of the data:\n", sep = "")
summary(object = data)

In [None]:
# Tabulate, for every column of `data`, the values that fall outside the
# 1.5 * IQR fences around the quartiles.

# Return the elements of `values` below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
iqr_outliers <- function(values) {
  quartiles <- quantile(values, c(0.25, 0.75))
  spread <- quartiles[[2]] - quartiles[[1]]
  fence_low <- quartiles[[1]] - 1.5 * spread
  fence_high <- quartiles[[2]] + 1.5 * spread
  values[values < fence_low | values > fence_high]
}

# One single-row data frame per column that has outliers, NULL otherwise.
per_column_rows <- lapply(names(data), function(column_name) {
  flagged <- iqr_outliers(data[[column_name]])
  if (length(flagged) == 0) {
    return(NULL)
  }
  data.frame(
    Column = column_name,
    Outliers = paste(flagged, collapse = ", "),
    stringsAsFactors = FALSE
  )
})

# Start from an empty, correctly typed frame so the result is well-formed
# even when no column has outliers.
empty_outliers <- data.frame(
  Column = character(),
  Outliers = character(),
  stringsAsFactors = FALSE
)
outliers_df <- do.call(
  rbind,
  c(list(empty_outliers), Filter(Negate(is.null), per_column_rows))
)

# Show the table, or a short message when nothing was flagged.
if (nrow(outliers_df) > 0) {
  library(knitr)
  knitr::kable(outliers_df, caption = "Outliers in the Dataset")
} else {
  cat("No outliers found in any column.\n")
}


In [None]:
# Drop every row of `data` that has an IQR outlier in at least one column.

# Per-column quartiles and the 1.5 * IQR fences derived from them.
Q1 <- apply(data, 2, quantile, probs = 0.25)
Q3 <- apply(data, 2, quantile, probs = 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Compare each column against its own fences, then flag the rows in which
# at least one cell lies outside them.
values <- as.matrix(data)
below_fence <- sweep(values, 2, lower_bound, FUN = "<")
above_fence <- sweep(values, 2, upper_bound, FUN = ">")
outlier_rows <- rowSums(below_fence | above_fence) > 0

# Keep only the rows without any flagged cell.
clean_data <- data[!outlier_rows, ]

# Report the size before and after the removal.
cat("Original dataset dimensions:", dim(data), "\n")
cat("Cleaned dataset dimensions:", dim(clean_data), "\n")


In [None]:
# Preview the first six rows of the raw data.
cat("First few rows of the data:\n", sep = "")
head(data, n = 6L)


# DATA TRANSFORMATION TECHNIQUES


 # Standardization

In [None]:

# Standardization function

# standardize <- function(x) {
#   (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
# }

# standardized_data <- as.data.frame(lapply(data, function(x) {
#   if(is.numeric(x)) {
#     standardize(x)
#   } else {
#     x
#   }
# }))
# standardized_data$CHAS <- data$CHAS
#Outliers=238

# Normalization


In [None]:

# # Normalization function
# normalize <- function(x) {
#   (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
# }

# # Apply normalization to numeric columns in the data frame
# normalized_data <- as.data.frame(lapply(data, function(x) {
#   if(is.numeric(x)) {
#     normalize(x)
#   } else {
#     x
#   }
# }))
# normalized_data$CHAS <- data$CHAS
# Outliers=238



# Log Transformation

In [None]:
# Build a log-scaled copy of the data. CHAS is copied unchanged; every other
# column is replaced by its natural log, except that entries equal to 0
# (where log() would give -Inf) are stored as the placeholder 0.00000001.
log_transformed_data <- data.frame(CHAS = data$CHAS)
for (column in setdiff(names(data), 'CHAS')) {
  raw_values <- data[, column]
  logged_values <- log(raw_values)
  logged_values[which(raw_values == 0)] <- 0.00000001
  log_transformed_data[, column] <- logged_values
}
# Outliers=174 (count noted by the original author for this transformation)


# Winsorization

In [None]:
# winsorize <- function(x, winsor_value) {
#   # Calculate the winsorized_data values
#   winsorized_data_values <- ifelse(x > quantile(x, 1 - winsor_value),
#                             quantile(x, 1 - winsor_value),
#                             ifelse(x < quantile(x, winsor_value),
#                                    quantile(x, winsor_value),
#                                    x))
#   return(winsorized_data_values)
# }

# # Winsorize the data
# winsorized_data <- data.frame(CHAS = data$CHAS)
# for (column in names(data)) {
#   if (column != 'CHAS') {
#     winsorized_data[, column] <- winsorize(data[, column], winsor_value = 0.05)
#   }
# }
#outliers=229

In [None]:
# Preview the first six rows of the log-transformed data.
cat("First few rows of the data:\n", sep = "")
head(log_transformed_data, n = 6L)

In [None]:

# Re-run the 1.5 * IQR outlier scan on the log-transformed data and collect
# the flagged values per column in `outliers_df`.
outliers_df <- data.frame(Column = character(), Outliers = character(), stringsAsFactors = FALSE)

for (col in names(log_transformed_data)) {
  column_values <- log_transformed_data[[col]]

  # Quartiles and inter-quartile range of the transformed column
  Q1 <- quantile(column_values, 0.25)
  Q3 <- quantile(column_values, 0.75)
  IQR <- Q3 - Q1

  # Fences beyond which a value counts as an outlier
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR

  # Fix: the mask must be built from the transformed values. It previously
  # compared the raw `data[[col]]` against fences computed on the log scale,
  # which flagged the wrong elements.
  outliers <- column_values[column_values < lower_bound | column_values > upper_bound]

  # Record the column only when it actually has outliers
  if (length(outliers) > 0) {
    outliers_df <- rbind(outliers_df, data.frame(Column = col, Outliers = paste(outliers, collapse = ", "), stringsAsFactors = FALSE))
  }
}



In [None]:
# Split the log-transformed data into rows without any IQR outlier
# (`clean_data`) and rows with at least one (`outliers_in_data`).

# Per-column quartiles and inter-quartile range
Q1 <- apply(log_transformed_data, 2, quantile, probs = 0.25)
Q3 <- apply(log_transformed_data, 2, quantile, probs = 0.75)
IQR <- Q3 - Q1

# Per-column fences for the 1.5 * IQR rule
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# A row is an outlier row when any of its cells lies outside that column's
# fences (the bounds are in the same column order as the row vector).
# NOTE(review): a column whose IQR is 0 (CHAS is a 0/1 flag and may be one)
# flags every value different from its quartile - confirm this is intended.
outlier_rows <- apply(log_transformed_data, 1, function(row) any(row < lower_bound | row > upper_bound))

# Partition the rows
clean_data <- log_transformed_data[!outlier_rows, ]
outliers_in_data <- log_transformed_data[outlier_rows, ]

# Report the sizes; the label typo "Outiers" is corrected here.
cat("Original dataset dimensions:", dim(log_transformed_data), "\n")
cat("Cleaned dataset dimensions:", dim(clean_data), "\n")
cat("Outliers available:",dim(outliers_in_data),"\n")

# **Exploratory Data Analysis**


# Univariate

In [None]:
library(RColorBrewer)

# Draw one 50-bin histogram per column of the log-transformed data,
# skipping B, RAD, CHAS and ZN. Fill colours are drawn at random from the
# "Set1" palette and reused cyclically when there are more columns than
# colours.
color_palette <- brewer.pal(9, "Set1")
selected_columns <- setdiff(
  names(log_transformed_data),
  c('B', 'RAD', 'CHAS','ZN')
)

num_columns <- length(selected_columns)
num_colors <- min(num_columns, length(color_palette))
sampled_colors <- sample(color_palette, num_colors)

# Text sizes shared by every histogram.
histogram_theme <- theme(
  plot.title = element_text(size = 20, hjust = 0.5),
  axis.title = element_text(size = 18),
  axis.text = element_text(size = 15)
)

for (i in seq_along(selected_columns)) {
  column <- selected_columns[i]
  # The modulo keeps the index inside 1..num_colors.
  fill_color <- sampled_colors[i %% num_colors + 1]

  histogram <- ggplot(log_transformed_data, aes(x = .data[[column]])) +
    geom_histogram(color = "black", fill = fill_color, alpha = 0.7, bins = 50) +
    labs(
      title = paste("Histogram of", column),
      x = column,
      y = "Frequency"
    ) +
    histogram_theme

  # ggplot objects are not auto-printed inside a loop.
  print(histogram)
}

In [None]:
# Bar chart of how many rows have ZN above / not above a tiny threshold.
# The threshold sits just above the 0.00000001 placeholder that the
# log-transformation step stores for zero entries.
threshold <- 0.000001
zn_above_threshold <- log_transformed_data$ZN > threshold
log_transformed_data$ZN_group <- ifelse(zn_above_threshold, "ZN > 0", "ZN <= 0")

# Frequency table of the two groups as a two-column data frame.
data_zn <- setNames(
  as.data.frame(table(log_transformed_data$ZN_group)),
  c("ZN_group", "Count")
)

zn_fill_colors <- c("ZN > 0" = "purple", "ZN <= 0" = "lightblue")
zn_text_theme <- theme(
  plot.title = element_text(size = 25, hjust = 0.5),
  axis.title = element_text(size = 20),
  axis.text = element_text(size = 20)
)

ggplot(data_zn, aes(x = ZN_group, y = Count, fill = ZN_group)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = zn_fill_colors) +
  labs(title = "ZN Grouping", x = "ZN Group", y = "Frequency") +
  theme_bw() +
  zn_text_theme

In [None]:
# Remove the helper ZN_group column again now that the chart is drawn.
log_transformed_data <- subset(
  log_transformed_data,
  select = -ZN_group
)

In [None]:
# List the distinct RAD values present in the log-transformed data.
rad_values <- unique(log_transformed_data[["RAD"]])
print(rad_values)


In [None]:
# Pie chart of the RAD distribution: a single stacked bar (every row shares
# the same x value) wrapped into polar coordinates.
options(repr.plot.width = 10, repr.plot.height = 10)

total_houses <- nrow(log_transformed_data)

# Title/legend text sizes, with axes, border and grid removed so only the
# pie itself remains.
pie_theme <- theme(
  plot.title = element_text(size = 25, hjust = 0.5),
  legend.title = element_text(size = 15),
  legend.text = element_text(size = 12),
  axis.title = element_blank(),
  axis.text = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank()
)

ggplot(
  log_transformed_data,
  aes(x = factor(total_houses), fill = factor(RAD))
) +
  geom_bar(width = 0.5, color = "black") +
  coord_polar(theta = "y") +
  labs(
    title = "Accessibility to Highways",
    fill = "Accessibility to Highways",
    x = "Number of Houses (Count)",
    y = NULL
  ) +
  theme_bw() +
  pie_theme

In [None]:
# Total number of missing cells in the log-transformed data.
missing_total <- sum(is.na(log_transformed_data))
missing_total


In [None]:
# Bar chart of how many rows are / are not next to the Charles River.
data_chas <- setNames(
  as.data.frame(table(log_transformed_data$CHAS)),
  c("CHAS", "Count")
)

# Relabel the 0/1 flag for display.
data_chas$CHAS <- factor(
  data_chas$CHAS,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

chas_fill_colors <- c("No" = "brown", "Yes" = "salmon")
chas_text_theme <- theme(
  plot.title = element_text(size = 25, hjust = 0.5),
  axis.title = element_text(size = 20),
  axis.text = element_text(size = 20)
)

ggplot(data_chas, aes(x = CHAS, y = Count, fill = CHAS)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = chas_fill_colors) +
  labs(
    title = "Next to Charles River?",
    x = "Property Located Next to the River?",
    y = "Number of Houses"
  ) +
  theme_bw() +
  chas_text_theme

In [None]:
# Number of rows whose (log-transformed) B value is below 5.
low_b_count <- sum(log_transformed_data[["B"]] < 5)
low_b_count


In [None]:
# Label every row by whether its (log-transformed) B value is below 5.
b_is_low <- log_transformed_data[["B"]] < 5
log_transformed_data[["B_group"]] <- ifelse(b_is_low, "B < 5", "B >= 5")

In [None]:
# Bar chart of the number of rows in each B group.
data_b <- setNames(
  as.data.frame(table(log_transformed_data$B_group)),
  c("B_group", "Count")
)

b_fill_colors <- c("B < 5" = "green", "B >= 5" = "lightgreen")
b_text_theme <- theme(
  plot.title = element_text(size = 25, hjust = 0.5),
  axis.title = element_text(size = 20),
  axis.text = element_text(size = 20)
)

ggplot(data_b, aes(x = B_group, y = Count, fill = B_group)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = b_fill_colors) +
  labs(
    title = "Ethnicity Factor",
    x = "B Group",
    y = "Number of Houses"
  ) +
  theme_bw() +
  b_text_theme

# Bivariate

In [None]:
# Scatter plot of log(RM) against log(PRICE).
# The cell previously contained this whole body twice, which recomputed the
# same columns and drew the identical plot a second time; the duplicate is
# removed.

# Recompute both columns directly from the raw data so they hold plain logs.
log_transformed_data$RM <- log(data$RM)
log_transformed_data$PRICE <- log(data$PRICE)

plot(log_transformed_data$RM, log_transformed_data$PRICE,
     xlab = "Rooms",
     ylab = "Median Home Price (PRICE)",
     main = "Average number of rooms vs. Median Home Price",
     col = "purple")

In [None]:
# Scatter plot of LSTAT against PRICE from the log-transformed data.
# A two-column data frame passed to plot() is drawn as first column (x)
# versus second column (y), with the column names as axis labels.
columns <- c("LSTAT", "PRICE")

plot(
  log_transformed_data[, columns],
  main = "Plot of Selected Variables",
  col = "brown"
)

In [None]:
# Scatter plot of DIS against PRICE from the log-transformed data.
columns <- c("DIS", "PRICE")

plot(
  log_transformed_data[, columns],
  main = "Plot of Selected Variables",
  col = "darkgreen"
)

In [None]:
# Scatter plot of DIS against LSTAT from the log-transformed data.
columns <- c("DIS", "LSTAT")

plot(
  log_transformed_data[, columns],
  main = "Plot of Selected Variables",
  col = "darkblue"
)

In [None]:
# Violin plot of PRICE for each distinct RAD value (log-transformed data).
rad_price_aes <- aes(x = factor(RAD), y = PRICE, fill = factor(RAD))

ggplot(log_transformed_data, rad_price_aes) +
  geom_violin(alpha = 0.5) +
  labs(
    title = "Violin Plot of Median Home Prices by Highway Accessibility   ",
    x = "Highway Accessibility (RAD)",
    y = "Median Home Price"
  )

In [None]:
# Box plot of PRICE for the two B groups, one fill colour per box.
palette <- c("#E5D8BD", "#B8A38A")

ggplot(log_transformed_data, aes(x = factor(B_group), y = PRICE)) +
  geom_boxplot(fill = palette) +
  labs(
    title = "Box plot of Median Home Prices by Ethnicity Factor",
    x = "Ethnicity Factor (B)",
    y = "Median Home Price"
  )

# Multivariate

In [None]:
library(ggplot2)
library(reshape2)

# Heatmap of the pairwise correlations between the raw (untransformed)
# variables, with each coefficient printed in its tile.
correlation_columns <- c(
  "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
  "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "PRICE"
)
correlation_matrix <- cor(data[, correlation_columns])

# melt() reshapes the matrix to long form: Var1, Var2, value.
correlation_long <- melt(correlation_matrix)

ggplot(
  data = correlation_long,
  aes(Var1, Var2, fill = value, label = round(value, 2))
) +
  geom_tile(color = "white") +
  geom_text(color = "black", size = 3) +
  scale_fill_gradient2(
    low = "lightblue",
    high = "darkred",
    mid = "white",
    midpoint = 0,
    limit = c(-1,1),
    space = "Lab",
    name = "Correlation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1))

In [None]:
# Pairwise plot matrix for a subset of the raw variables.
variables <- c("INDUS", "NOX", "RM", "DIS", "LSTAT", "PRICE")
pair_data <- data[, variables]

ggpairs(pair_data)

# **CHAS Variances**

In [None]:
# Compare the variance of PRICE between the CHAS = 0 and CHAS = 1 rows.
chas_flag <- log_transformed_data$CHAS
group0 <- log_transformed_data$PRICE[chas_flag == 0]
group1 <- log_transformed_data$PRICE[chas_flag == 1]

# Sample variance of each group
variance_group0 <- var(group0)
variance_group1 <- var(group1)

# Report both values
cat("Variance for group with CHAS=0:", variance_group0, "\n")
cat("Variance for group with CHAS=1:", variance_group1, "\n")


# **Welch's t-test**

In [None]:
# Welch two-sample t-test (unequal variances) of PRICE between the
# CHAS = 0 and CHAS = 1 rows.
chas_flag <- log_transformed_data$CHAS
group0 <- log_transformed_data$PRICE[chas_flag == 0]
group1 <- log_transformed_data$PRICE[chas_flag == 1]

# var.equal = FALSE selects the Welch variant.
t_test_result <- t.test(x = group0, y = group1, var.equal = FALSE)

# Show the test statistic, degrees of freedom, p-value and interval.
print(t_test_result)


In [None]:
# Two-sided 5% critical t value for the Welch test stored in `t_test_result`.
# The degrees of freedom are read from the test object instead of being typed
# in by hand (previously 39.733), so the value cannot drift out of sync when
# the data or the transformation changes.
qt(p = 0.05 / 2, df = t_test_result$parameter[["df"]], lower.tail = FALSE)

# **Student's t-test**

In [None]:
# Student two-sample t-test (pooled variance) of PRICE between the
# CHAS = 0 and CHAS = 1 rows.
chas_flag <- log_transformed_data$CHAS
group0 <- log_transformed_data$PRICE[chas_flag == 0]
group1 <- log_transformed_data$PRICE[chas_flag == 1]

# var.equal = TRUE pools the two group variances.
t_test_result <- t.test(x = group0, y = group1, var.equal = TRUE)

# Show the test statistic, degrees of freedom, p-value and interval.
print(t_test_result)

In [None]:
# Two-sided 5% critical t value for the Student test stored in
# `t_test_result`. The degrees of freedom are read from the test object
# instead of being typed in by hand (previously 504), so the value stays
# correct if the number of rows changes.
qt(p = 0.05 / 2, df = t_test_result$parameter[["df"]], lower.tail = FALSE)

# **RAD Variances**

# **Wilcoxon Rank Sum Test**



In [None]:
# Mean and variance of PRICE for the CHAS = 0 and CHAS = 1 rows; only the
# means are printed here.
chas_flag <- log_transformed_data$CHAS
group0 <- log_transformed_data$PRICE[chas_flag == 0]
group1 <- log_transformed_data$PRICE[chas_flag == 1]

mean0 <- mean(group0)
variance0 <- var(group0)
mean1 <- mean(group1)
variance1 <- var(group1)

cat("Mean for group with CHAS=0:", mean0, "\n")
cat("Mean for group with CHAS=1:", mean1, "\n")


In [None]:
# Side-by-side box plots of PRICE for the two CHAS groups.
boxplot(
  group0,
  group1,
  names = c('Chas0','Chas1'),
  main = 'Median House Price Grouped by CHAS',
  ylab = 'Median House Price'
)

In [None]:
# Wilcoxon rank-sum test comparing PRICE between the two CHAS groups.
wilcox.test(x = group0, y = group1)


The p-value is less than 0.005, so we reject the null hypothesis: the PRICE distributions of the two CHAS groups are not the same.


# **Data Splitting**

In [None]:
# Build the predictor matrix X and response Y from the log-transformed data,
# then split them 70/30 into training and test sets.
predictor_names <- c(
  "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
  "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"
)
X <- log_transformed_data[, predictor_names]
Y <- log_transformed_data$PRICE

# Fixed seed so the same rows are drawn on every run.
set.seed(123)
n_obs <- nrow(log_transformed_data)

# seq_len() is safe for an empty frame (1:0 would yield c(1, 0)), and the
# training size is floored explicitly instead of relying on sample()
# truncating a fractional size. The drawn indices are unchanged.
train_index <- sample(seq_len(n_obs), size = floor(0.7 * n_obs))

X_train <- X[train_index, ]
Y_train <- Y[train_index]
X_test <- X[-train_index, ]
Y_test <- Y[-train_index]

# **Linear regression**

In [None]:
# Fit the linear regression model
model <- lm(Y_train ~ ., data = cbind(Y_train, X_train))

summary(model)

In [None]:
# Make predictions
predictions <- predict(model, newdata = X_test)

In [None]:
# Test-set error metrics for `predictions`, all computed from one residual
# vector.
lm_errors <- Y_test - predictions

# Mean Absolute Error
mae <- mean(abs(lm_errors))
print(paste("MAE:", mae))

# Mean Squared Error
mse <- mean(lm_errors^2)
print(paste("MSE:", mse))

# Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("RMSE:", rmse))

# R-squared: 1 - residual sum of squares / total sum of squares
total_sum_squares <- sum((Y_test - mean(Y_test))^2)
rsquared <- 1 - sum(lm_errors^2) / total_sum_squares
print(paste("R-squared:", rsquared))

In [None]:
# Create a data frame with predicted and actual values
comparison_df <- data.frame(Predicted = predictions, Actual = Y_test)

# Print the first few rows of the dataframe
head(comparison_df)

# **Linear Regression with Cross Validation**


In [None]:
library(caret)

# 10-fold cross-validation settings for caret.
train_control <- trainControl(method = "cv", number = 10)

# Linear regression fitted through caret with the resampling scheme above.
model_lr <- train(
  Y_train ~ .,
  data = data.frame(X_train, Y_train),
  method = "lm",
  trControl = train_control
)

# Predict the held-out test rows.
predictions_lr <- predict(model_lr, newdata = data.frame(X_test))

# Test-set MSE, followed by caret's RMSE / R-squared / MAE summary.
lr_errors <- Y_test - predictions_lr
mse_lr <- mean(lr_errors^2)
print(mse_lr)
postResample(pred = predictions_lr, obs = Y_test)

# **Lasso Regression**

In [None]:
# Install and load the glmnet package
library(glmnet)

# Convert the data into a format suitable for glmnet
x_train <- as.matrix(X_train)
x_test <- as.matrix(X_test)

# Fit the Lasso regression model
lasso_model <- glmnet(x_train, Y_train, alpha = 1)

# Make predictions on the test set
lasso_pred <- predict(lasso_model, s = 0.01, newx = x_test)

In [None]:
# Test-set error metrics for `lasso_pred`, all computed from one residual
# object.
lasso_errors <- Y_test - lasso_pred

# Mean Absolute Error
mae <- mean(abs(lasso_errors))
print(paste("MAE:", mae))

# Mean Squared Error
mse <- mean(lasso_errors^2)
print(paste("MSE:", mse))

# Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("RMSE:", rmse))

# R-squared: 1 - residual sum of squares / total sum of squares
total_sum_squares <- sum((Y_test - mean(Y_test))^2)
rsquared <- 1 - sum(lasso_errors^2) / total_sum_squares
print(paste("R-squared:", rsquared))

# **Lasso Regression with Cross Validation**


In [None]:
library(glmnet)

# Choose the lasso penalty by cross-validation, refit at that penalty and
# score the refit on the test set.
lasso_x_train <- as.matrix(X_train)
lasso_x_test <- as.matrix(X_test)

set.seed(123) # fixed fold assignment
cv_model_lasso <- cv.glmnet(lasso_x_train, Y_train, alpha = 1)

# lambda.min is the penalty with the lowest cross-validated error.
best_lambda <- cv_model_lasso$lambda.min
lasso_model <- glmnet(lasso_x_train, Y_train, alpha = 1, lambda = best_lambda)

# Test-set predictions at the selected penalty.
predictions_lasso <- predict(lasso_model, s = best_lambda, newx = lasso_x_test)

# Test-set MSE, followed by caret's RMSE / R-squared / MAE summary.
mse_lasso <- mean((Y_test - predictions_lasso)^2)
mse_lasso
postResample(predictions_lasso, Y_test)

# Decision Tree

In [None]:
# Load the necessary library
library(rpart)

# Fit the decision tree model
fit <- rpart(Y_train ~ ., data = data.frame(X_train, Y_train), method = "anova")

# Print the decision tree model
print(fit)

# Plot the decision tree
plot(fit)
text(fit, use.n = TRUE)

# Predict on the test set
predictions <- predict(fit, newdata = data.frame(X_test))




In [None]:
# Test-set error metrics for the tree's `predictions`, all computed from
# one residual vector.
tree_errors <- Y_test - predictions

# Mean Absolute Error
mae <- mean(abs(tree_errors))
print(paste("MAE:", mae))

# Mean Squared Error
mse <- mean(tree_errors^2)
print(paste("MSE:", mse))

# Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("RMSE:", rmse))

# R-squared: 1 - residual sum of squares / total sum of squares
total_sum_squares <- sum((Y_test - mean(Y_test))^2)
rsquared <- 1 - sum(tree_errors^2) / total_sum_squares
print(paste("R-squared:", rsquared))

# **Decision Tree with Cross Validation**

In [None]:
# Requires X_train, Y_train, X_test and Y_test from the data-splitting cell.

# 10-fold cross-validation settings for caret.
train_control <- trainControl(method = "cv", number = 10)

# Decision tree (rpart) fitted through caret with the resampling scheme
# above.
model_dt <- train(
  Y_train ~ .,
  data = data.frame(X_train, Y_train),
  method = "rpart",
  trControl = train_control
)

# Predict the held-out test rows.
predictions_dt <- predict(model_dt, newdata = data.frame(X_test))

# caret's RMSE / R-squared / MAE summary on the test set.
postResample(pred = predictions_dt, obs = Y_test)

# SVM

In [None]:
# Load the necessary library
library(e1071)

# Fit the SVM model for regression
svm_model <- svm(Y_train ~ ., data = data.frame(X_train, Y_train))

# Print the SVM model details
print(svm_model)

# Predict on the test set using the SVM model
svm_predictions <- predict(svm_model, newdata = data.frame(X_test))

In [None]:
# Test-set error metrics for `svm_predictions`, all computed from one
# residual vector.
svm_errors <- Y_test - svm_predictions

# Mean Absolute Error
mae <- mean(abs(svm_errors))
print(paste("MAE:", mae))

# Mean Squared Error
mse <- mean(svm_errors^2)
print(paste("MSE:", mse))

# Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("RMSE:", rmse))

# R-squared: 1 - residual sum of squares / total sum of squares
total_sum_squares <- sum((Y_test - mean(Y_test))^2)
rsquared <- 1 - sum(svm_errors^2) / total_sum_squares
print(paste("R-squared:", rsquared))

# **SVR with Cross Validation**

In [None]:
# 10-fold cross-validation settings for caret.
train_control <- trainControl(method = "cv", number = 10)

# Radial-kernel SVM fitted through caret with the resampling scheme above.
model_svm <- train(
  Y_train ~ .,
  data = data.frame(X_train, Y_train),
  method = "svmRadial",
  trControl = train_control
)

# Predict the held-out test rows.
predictions_svm <- predict(model_svm, newdata = data.frame(X_test))

# caret's RMSE / R-squared / MAE summary on the test set.
postResample(pred = predictions_svm, obs = Y_test)

# KNN

In [None]:
# Load the necessary library
library(kknn)

# Scale the data
scaled_X_train <- scale(X_train)
scaled_X_test <- scale(X_test)

# Fit the KNN model
set.seed(123) # for reproducibility
knn_fit <- train.kknn(Y_train ~ ., data = data.frame(scaled_X_train, Y_train), kmax = 20)

# Find the best k value
best_k <- knn_fit$best.parameters$k

# Predict on the test set using the KNN model
knn_predictions <- predict(knn_fit, newdata = data.frame(scaled_X_test))

In [None]:
# Test-set error metrics for `knn_predictions`, all computed from one
# residual vector.
knn_errors <- Y_test - knn_predictions

# Mean Absolute Error
mae <- mean(abs(knn_errors))
print(paste("MAE:", mae))

# Mean Squared Error
mse <- mean(knn_errors^2)
print(paste("MSE:", mse))

# Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("RMSE:", rmse))

# R-squared: 1 - residual sum of squares / total sum of squares
total_sum_squares <- sum((Y_test - mean(Y_test))^2)
rsquared <- 1 - sum(knn_errors^2) / total_sum_squares
print(paste("R-squared:", rsquared))

# **KNN with Cross Validation**


In [None]:
# 10-fold cross-validation settings for caret.
train_control <- trainControl(method = "cv", number = 10)

# KNN fitted through caret. KNN is distance-based, so the predictors are
# centred and scaled; without this step the columns with the largest
# numeric range dominate the distance. caret estimates the scaling on the
# training data and re-applies it automatically inside predict().
model_knn <- train(
  Y_train ~ .,
  data = data.frame(X_train, Y_train),
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = train_control
)

# Predict the held-out test rows.
predictions_knn <- predict(model_knn, newdata = data.frame(X_test))

# caret's RMSE / R-squared / MAE summary on the test set.
postResample(predictions_knn, Y_test)

# Stacking Ensemble


In [None]:
# Combine predictions with actual labels for the test set
stacked_predictions_test <- data.frame(Y_test = Y_test,
                                       knn = predict(knn_fit, newdata = data.frame(scaled_X_test)),
                                       svm = predict(svm_model, newdata = data.frame(X_test)),
                                       tree = predict(fit, newdata = data.frame(X_test)),
                                       lasso = predict(lasso_model, s = 0.01, newx = x_test))

# Train a meta-model on the combined predictions
meta_model <- lm(Y_test ~ ., data = stacked_predictions_test)

# Make predictions on the test set using the meta-model
meta_predictions <- predict(meta_model, newdata = stacked_predictions_test)




In [None]:
# Test-set error metrics for the stacking meta-model, all computed from one
# residual vector.
meta_actual <- stacked_predictions_test$Y_test
meta_errors <- meta_actual - meta_predictions

# Mean Absolute Error
meta_mae <- mean(abs(meta_errors))
print(paste("Stacking Ensemble - Meta-model MAE:", meta_mae))

# Mean Squared Error
meta_mse <- mean(meta_errors^2)
print(paste("Stacking Ensemble - Meta-model MSE:", meta_mse))

# Root Mean Squared Error
meta_rmse <- sqrt(meta_mse)
print(paste("Stacking Ensemble - Meta-model RMSE:", meta_rmse))

# R-squared: 1 - residual sum of squares / total sum of squares
meta_total_sum_squares <- sum((meta_actual - mean(meta_actual))^2)
meta_rsquared <- 1 - sum(meta_errors^2) / meta_total_sum_squares
print(paste("Stacking Ensemble - Meta-model R-squared:", meta_rsquared))
