# In [None]:
# Project: Student Performance Prediction using Multiple Linear Regression

# 1. Load Required Libraries

# Install only the packages that are not already available. Calling
# install.packages() unconditionally would reinstall every package (and need
# network access) each time the script is run.
required_pkgs <- c("readxl", "ggplot2", "olsrr", "car", "MASS", "e1071",
                   "dplyr", "lmtest", "caret", "Metrics")
is_installed <- vapply(required_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(readxl)
library(ggplot2)
library(olsrr)
library(car)
library(MASS)
library(e1071)
library(dplyr)
library(lmtest)
library(caret)
library(Metrics)

# 2. Load and Explore the Data

# Read the CSV from the current working directory, then print the column
# structure and the per-column summary.
df <- utils::read.csv(file = "student-behaviour.csv")
utils::str(df)
summary(df)


# 3. Data Cleaning & Preparation

# Drop every row that holds an infinite value in any column. The per-column
# flags are OR-ed together with Reduce() instead of being stacked with
# sapply(): sapply() returns a plain vector (one-row data) or a list (zero-row
# data) rather than a matrix, which makes rowSums() fail on those inputs.
has_inf <- Reduce(`|`, lapply(df, is.infinite), rep(FALSE, nrow(df)))
df <- df[!has_inf, , drop = FALSE]

# Drop rows with a missing value in any column.
df <- na.omit(df)

# Transform skewed features
df$salary.expectation <- sqrt(df$salary.expectation)
df$Height.CM. <- sqrt(df$Height.CM.)

# Shift the target by |min| + 1 before taking the log so the argument of log()
# is always >= 1, even if some marks are zero or negative. From here on
# `college.mark` is on this shifted-log scale.
# NOTE(review): the shift is added even when every mark is already positive,
# and it is computed from the full data set (before any train/test split).
lowest_mark <- min(df$college.mark, na.rm = TRUE)
df$college.mark <- log(df$college.mark + abs(lowest_mark) + 1)

# 4. Train-Test Split

# Seed the RNG so the partition is reproducible. createDataPartition() is
# asked for p = 0.8 of the rows based on college.mark; those rows become the
# training set and the remaining rows become the test set.
set.seed(123)
split <- caret::createDataPartition(
  y = df$college.mark,
  p = 0.8,
  list = FALSE
)
train <- df[split, ]
test <- df[-split, ]


# 5. Model Training and Cross-Validation

# Resampling scheme: 5-fold cross-validation.
cv_control <- caret::trainControl(method = "cv", number = 5)

# Linear regression of college.mark on all remaining columns of the training
# set. The namespace prefix makes it explicit that the caret function is
# called, since `train` is also the name of the training data frame.
model <- caret::train(
  college.mark ~ .,
  data = train,
  method = "lm",
  trControl = cv_control
)
summary(model)


# 6. Prediction and Evaluation

# Score the held-out rows with the fitted model and keep the observed target
# values next to them. Both vectors are on the scale of test$college.mark.
predictions <- predict(model, newdata = test)
actual <- test[["college.mark"]]

# Metrics
# Root-mean-squared error and mean absolute error of the test predictions.
rmse_val <- Metrics::rmse(actual = actual, predicted = predictions)
mae_val <- Metrics::mae(actual = actual, predicted = predictions)

cat("RMSE:", rmse_val, "\n")
cat("MAE:", mae_val, "\n")


# 7. ggplot2 Visualizations

# Residual Plot
# Residuals are observed minus predicted values for the test rows, plotted
# against the predictions with a dashed reference line at zero.
residuals_df <- data.frame(
  Fitted = predictions,
  Residuals = actual - predictions
)
residual_plot <- ggplot(data = residuals_df,
                        mapping = aes(x = Fitted, y = Residuals)) +
  geom_point(color = "steelblue") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs Fitted", x = "Fitted Values", y = "Residuals") +
  theme_minimal()
residual_plot

# Actual vs Predicted
# Points lying on the dashed identity line are predicted exactly. Both axes
# are on the log-transformed college.mark scale (the target is transformed
# during data preparation), so the axis labels state that explicitly, in line
# with the boxplot labels further down.
comparison_df <- data.frame(Actual = actual, Predicted = predictions)
ggplot(comparison_df, aes(x = Actual, y = Predicted)) +
  geom_point(color = "darkgreen") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Actual vs Predicted Values",
    x = "Actual College Mark (log-transformed)",
    y = "Predicted College Mark (log-transformed)"
  ) +
  theme_minimal()


# 8. Q-Q Plots

# Residuals Q-Q plot
# Compute the test-set residuals once and reuse them for the points and the
# reference line.
test_residuals <- actual - predictions
qqnorm(test_residuals, main = "Q-Q Plot of Residuals")
qqline(test_residuals, col = "red")

# Actual values Q-Q plot
# Same diagnostic for the observed target values of the test rows.
qqnorm(actual, main = "Q-Q Plot of Actual College Marks")
qqline(actual, col = "blue")

# 9. Boxplots by Key Categorical Features

# Boxplot of the (log-transformed) college mark for each level of one column.
#   data   data frame holding `college.mark` and the grouping column
#   group  name of the grouping column, given as a string
#   label  human-readable name used in the title and as the x-axis label
#   fill   fill colour of the boxes
# Returns the ggplot object. The calls below are top-level expressions, so
# each plot is drawn exactly as a bare ggplot() expression would be.
plot_mark_by <- function(data, group, label, fill) {
  # .data[[...]] looks the column up by its string name inside aes().
  ggplot(data, aes(x = .data[[group]], y = .data[["college.mark"]])) +
    geom_boxplot(fill = fill) +
    labs(
      title = paste("College Marks by", label),
      x = label,
      y = "College Mark (log-transformed)"
    ) +
    theme_minimal()
}

# Boxplot: College Marks by Gender
plot_mark_by(df, "Gender", "Gender", "lightblue")

# Boxplot: College Marks by Certification Course
plot_mark_by(df, "Certification.Course", "Certification Course", "lightgreen")

# Boxplot: College Marks by Department
plot_mark_by(df, "Department", "Department", "orange")

# Boxplot: College Marks by Stress Level
plot_mark_by(df, "Stress.Level", "Stress Level", "salmon")
