In [None]:
#instalação de pacotes necessários
#install.packages("e1071")
#install.packages("caret")
#install.packages("mlbench")
#install.packages("mice")
#install.packages("Metrics")
#install.packages("randomForest")
library(caret)
library(mlbench)
library(mice)
library(Metrics)
library("randomForest")


In [3]:
#carregamento da base
dados <- read.csv("databases/9 - Admissao - Dados.csv")
head(dados)

Unnamed: 0_level_0,num,GRE.Score,TOEFL.Score,University.Rating,SOP,LOR,CGPA,Research,ChanceOfAdmit
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
1,1,337,118,4,4.5,4.5,9.65,1,0.92
2,2,324,107,4,4.0,4.5,8.87,1,0.76
3,3,316,104,3,3.0,3.5,8.0,1,0.72
4,4,322,110,3,3.5,2.5,8.67,1,0.8
5,5,314,103,2,2.0,3.0,8.21,0,0.65
6,6,330,115,5,4.5,3.0,9.34,1,0.9


In [4]:
#retirando coluna de id e separando em bases de treino e teste
dados$num <- NULL
set.seed(2034)
indice <- createDataPartition(dados$ChanceOfAdmit, p = 0.8, list = FALSE)

treino <- dados[indice, ]
teste <- dados[-indice, ]

In [5]:
#funcoes das metricas usadas

#rmse(observado, predito)
#mae(observado, predito)
#cor(observado, predito, method = "pearson")

syx <- function(observado, predito, p) {
  return(sqrt(sum((observado - predito)^2) / (length(observado) - p)))
}

r2 <- function(observado, predito){
  return(1 - (sum((predito - observado)^2) / sum((observado - mean(observado))^2)))
}

### KNN
---

In [6]:
#treinando o modelo usando KNN e mostrando as metricas
set.seed(2034)
tune_grid <- expand.grid(k = c(1, 3, 5, 7, 9, 10))
knn <- train(ChanceOfAdmit ~ ., data = treino, method = "knn", tuneGrid = tune_grid)
knn

#fazendo a predicaõ na base de teste
predito_knn <- predict(knn, teste)

#calculando as metricas
r2_valor <- r2(teste$ChanceOfAdmit, predito_knn)
syx_valor <- syx(teste$ChanceOfAdmit, predito_knn, ncol(teste) - 1)
pearson_valor <- cor(teste$ChanceOfAdmit, predito_knn, method = "pearson")
rmse_valor <- rmse(teste$ChanceOfAdmit, predito_knn)
mae_valor <- mae(teste$ChanceOfAdmit, predito_knn)

#apresentando as metricas
cat("R²:", r2_valor, "\n")
cat("Syx:", syx_valor, "\n")
cat("Pearson:", pearson_valor, "\n")
cat("RMSE:", rmse_valor, "\n")
cat("MAE:", mae_valor, "\n")


k-Nearest Neighbors 

402 samples
  7 predictor

No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 402, 402, 402, 402, 402, 402, ... 
Resampling results across tuning parameters:

  k   RMSE        Rsquared   MAE       
   1  0.09539771  0.5844600  0.07053102
   3  0.08613130  0.6404006  0.06502697
   5  0.08163155  0.6708952  0.06178498
   7  0.07852178  0.6925019  0.05967449
   9  0.07717264  0.7011436  0.05863885
  10  0.07658834  0.7054456  0.05824290

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was k = 10.

R²: 0.7337591 
Syx: 0.07730253 
Pearson: 0.8578415 
RMSE: 0.07449058 
MAE: 0.05307143 


### RNA com hold-out e cross-validation
---

In [7]:
#treinando o modelo usando RNA com hold-out e mostrando as metricas
set.seed(2034)
rna <- train(ChanceOfAdmit ~ ., data = treino, method = "nnet", linout = TRUE, trace = FALSE)
rna

#fazendo a predicaõ na base de teste
predito_rna <- predict(rna, teste)

#calculando as metricas
r2_valor <- r2(teste$ChanceOfAdmit, predito_rna)
syx_valor <- syx(teste$ChanceOfAdmit, predito_rna, ncol(teste) - 1)
pearson_valor <- cor(teste$ChanceOfAdmit, predito_rna, method = "pearson")
rmse_valor <- rmse(teste$ChanceOfAdmit, predito_rna)
mae_valor <- mae(teste$ChanceOfAdmit, predito_rna)

#apresentando as metricas
cat("R²:", r2_valor, "\n")
cat("Syx:", syx_valor, "\n")
cat("Pearson:", pearson_valor, "\n")
cat("RMSE:", rmse_valor, "\n")
cat("MAE:", mae_valor, "\n")

"There were missing values in resampled performance measures."


Neural Network 

402 samples
  7 predictor

No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 402, 402, 402, 402, 402, 402, ... 
Resampling results across tuning parameters:

  size  decay  RMSE        Rsquared    MAE       
  1     0e+00  0.14032458  0.02877529  0.11324158
  1     1e-04  0.12408230  0.78594186  0.09917273
  1     1e-01  0.07300487  0.73193439  0.05481015
  3     0e+00  0.13406649  0.52679406  0.10776591
  3     1e-04  0.09129913  0.58344842  0.07041705
  3     1e-01  0.07117772  0.74566238  0.05330257
  5     0e+00  0.14032462         NaN  0.11324162
  5     1e-04  0.09803328  0.60635345  0.07592078
  5     1e-01  0.07050542  0.74927552  0.05261724

RMSE was used to select the optimal model using the smallest value.
The final values used for the model were size = 5 and decay = 0.1.

R²: 0.7644812 
Syx: 0.07270581 
Pearson: 0.8902279 
RMSE: 0.07006107 
MAE: 0.05395539 


In [8]:
#treinando o modelo usando RNA com crossvalidation e mostrando as metricas
set.seed(2034)
ctrl <- trainControl(method = "cv", number = 10)
rna <- train(ChanceOfAdmit ~ ., data = treino, method = "nnet", linout = TRUE, trace = FALSE, trControl = ctrl)
rna

#fazendo a predicaõ na base de teste
predito_rna <- predict(rna, teste)

#calculando as metricas
r2_valor <- r2(teste$ChanceOfAdmit, predito_rna)
syx_valor <- syx(teste$ChanceOfAdmit, predito_rna, ncol(teste) - 1)
pearson_valor <- cor(teste$ChanceOfAdmit, predito_rna, method = "pearson")
rmse_valor <- rmse(teste$ChanceOfAdmit, predito_rna)
mae_valor <- mae(teste$ChanceOfAdmit, predito_rna)

#apresentando as metricas
cat("R²:", r2_valor, "\n")
cat("Syx:", syx_valor, "\n")
cat("Pearson:", pearson_valor, "\n")
cat("RMSE:", rmse_valor, "\n")
cat("MAE:", mae_valor, "\n")

"There were missing values in resampled performance measures."


Neural Network 

402 samples
  7 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 362, 362, 362, 362, 362, 362, ... 
Resampling results across tuning parameters:

  size  decay  RMSE        Rsquared    MAE       
  1     0e+00  0.14001171  0.03448521  0.11348967
  1     1e-04  0.11612518  0.80740444  0.09170667
  1     1e-01  0.07183904  0.74291857  0.05423172
  3     0e+00  0.13242029  0.77579711  0.10617168
  3     1e-04  0.10104476  0.48931921  0.07937919
  3     1e-01  0.06870575  0.76634456  0.05200679
  5     0e+00  0.14001171         NaN  0.11348968
  5     1e-04  0.10870790  0.55039676  0.08647590
  5     1e-01  0.06726759  0.77536286  0.05057755

RMSE was used to select the optimal model using the smallest value.
The final values used for the model were size = 5 and decay = 0.1.

R²: 0.7835564 
Syx: 0.06969935 
Pearson: 0.8994475 
RMSE: 0.06716397 
MAE: 0.05079213 


### SVM com hold-out e crossvalidation
---

In [9]:
#treinando o modelo usando svm com hold-out e mostrando as metricas
set.seed(2034)
tune_grid <- expand.grid(C = c(1, 2, 10, 50, 100), sigma = c(.021, .015, 0.2))

svm <- train(ChanceOfAdmit ~ ., data = treino, method = "svmRadial", tuneGrid = tune_grid)
svm

#fazendo a predicaõ na base de teste
predito_svm <- predict(svm, teste)

#calculando as metricas
r2_valor <- r2(teste$ChanceOfAdmit, predito_svm)
syx_valor <- syx(teste$ChanceOfAdmit, predito_svm, ncol(teste) - 1)
pearson_valor <- cor(teste$ChanceOfAdmit, predito_svm, method = "pearson")
rmse_valor <- rmse(teste$ChanceOfAdmit, predito_svm)
mae_valor <- mae(teste$ChanceOfAdmit, predito_svm)

#apresentando as metricas
cat("R²:", r2_valor, "\n")
cat("Syx:", syx_valor, "\n")
cat("Pearson:", pearson_valor, "\n")
cat("RMSE:", rmse_valor, "\n")
cat("MAE:", mae_valor, "\n")

Support Vector Machines with Radial Basis Function Kernel 

402 samples
  7 predictor

No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 402, 402, 402, 402, 402, 402, ... 
Resampling results across tuning parameters:

  C    sigma  RMSE        Rsquared   MAE       
    1  0.015  0.06346808  0.8018751  0.04486103
    1  0.021  0.06345164  0.8015610  0.04485037
    1  0.200  0.06875863  0.7632965  0.04861218
    2  0.015  0.06313846  0.8032184  0.04461294
    2  0.021  0.06332093  0.8017887  0.04468783
    2  0.200  0.07095756  0.7481746  0.05014748
   10  0.015  0.06345806  0.8001990  0.04474193
   10  0.021  0.06394312  0.7971513  0.04486709
   10  0.200  0.08122793  0.6840885  0.05753888
   50  0.015  0.06477869  0.7914230  0.04543526
   50  0.021  0.06579458  0.7847843  0.04604196
   50  0.200  0.09723968  0.5889766  0.06949631
  100  0.015  0.06545685  0.7867745  0.04583958
  100  0.021  0.06698799  0.7770994  0.04700667
  100  0.200  0.10576636  0.54337

R²: 0.8308444 
Syx: 0.06161685 
Pearson: 0.9138469 
RMSE: 0.05937548 
MAE: 0.03855493 


In [10]:
#treinando o modelo usando svm com crossvalidation e mostrando as metricas
set.seed(2034)
tune_grid <- expand.grid(C = c(1, 2, 10, 50, 100), sigma = c(.021, .015, 0.2))
ctrl <- trainControl(method = "cv", number = 10)

svm <- train(ChanceOfAdmit ~ ., data = treino, method = "svmRadial", tuneGrid = tune_grid, trControl = ctrl)
svm

#fazendo a predicaõ na base de teste
predito_svm <- predict(svm, teste)

#calculando as metricas
r2_valor <- r2(teste$ChanceOfAdmit, predito_svm)
syx_valor <- syx(teste$ChanceOfAdmit, predito_svm, ncol(teste) - 1)
pearson_valor <- cor(teste$ChanceOfAdmit, predito_svm, method = "pearson")
rmse_valor <- rmse(teste$ChanceOfAdmit, predito_svm)
mae_valor <- mae(teste$ChanceOfAdmit, predito_svm)

#apresentando as metricas
cat("R²:", r2_valor, "\n")
cat("Syx:", syx_valor, "\n")
cat("Pearson:", pearson_valor, "\n")
cat("RMSE:", rmse_valor, "\n")
cat("MAE:", mae_valor, "\n")

Support Vector Machines with Radial Basis Function Kernel 

402 samples
  7 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 362, 362, 362, 362, 362, 362, ... 
Resampling results across tuning parameters:

  C    sigma  RMSE        Rsquared   MAE       
    1  0.015  0.06145847  0.8149835  0.04361310
    1  0.021  0.06128694  0.8160212  0.04338731
    1  0.200  0.06256773  0.8077601  0.04431178
    2  0.015  0.06094677  0.8176612  0.04305197
    2  0.021  0.06068989  0.8192443  0.04280753
    2  0.200  0.06368395  0.7998517  0.04491995
   10  0.015  0.06036552  0.8207167  0.04282773
   10  0.021  0.06054174  0.8198753  0.04271420
   10  0.200  0.06995668  0.7589076  0.04962527
   50  0.015  0.06030815  0.8201683  0.04229341
   50  0.021  0.06055137  0.8180904  0.04221130
   50  0.200  0.08620734  0.6696158  0.06251552
  100  0.015  0.06051617  0.8182944  0.04221556
  100  0.021  0.06133946  0.8138582  0.04270760
  100  0.200  0.09999119  0.59

R²: 0.8208342 
Syx: 0.06341381 
Pearson: 0.9071802 
RMSE: 0.06110708 
MAE: 0.0412044 


### Random Forest com hold-out e Crossvalidation
---

In [11]:
#treinando o modelo usando svm com hold-out e mostrando as metricas
set.seed(2034)

rf <- train(ChanceOfAdmit ~ ., data = treino, method = "rf")
rf

#fazendo a predicaõ na base de teste
predito_rf <- predict(rf, teste)

#calculando as metricas
r2_valor <- r2(teste$ChanceOfAdmit, predito_rf)
syx_valor <- syx(teste$ChanceOfAdmit, predito_rf, ncol(teste) - 1)
pearson_valor <- cor(teste$ChanceOfAdmit, predito_rf, method = "pearson")
rmse_valor <- rmse(teste$ChanceOfAdmit, predito_rf)
mae_valor <- mae(teste$ChanceOfAdmit, predito_rf)

#apresentando as metricas
cat("R²:", r2_valor, "\n")
cat("Syx:", syx_valor, "\n")
cat("Pearson:", pearson_valor, "\n")
cat("RMSE:", rmse_valor, "\n")
cat("MAE:", mae_valor, "\n")

Random Forest 

402 samples
  7 predictor

No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 402, 402, 402, 402, 402, 402, ... 
Resampling results across tuning parameters:

  mtry  RMSE        Rsquared   MAE       
  2     0.06486753  0.7879264  0.04685562
  4     0.06547994  0.7838329  0.04727385
  7     0.06778651  0.7696270  0.04896324

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 2.

R²: 0.8136367 
Syx: 0.06467502 
Pearson: 0.9061176 
RMSE: 0.06232241 
MAE: 0.04263619 


In [12]:
#treinando o modelo usando svm com crossvalidarion e mostrando as metricas
set.seed(2034)
ctrl <- trainControl(method = "cv", number = 10)
rf <- train(ChanceOfAdmit ~ ., data = treino, method = "rf", trControl = ctrl)
rf

#fazendo a predicaõ na base de teste
predito_rf <- predict(rf, teste)

#calculando as metricas
r2_valor <- r2(teste$ChanceOfAdmit, predito_rf)
syx_valor <- syx(teste$ChanceOfAdmit, predito_rf, ncol(teste) - 1)
pearson_valor <- cor(teste$ChanceOfAdmit, predito_rf, method = "pearson")
rmse_valor <- rmse(teste$ChanceOfAdmit, predito_rf)
mae_valor <- mae(teste$ChanceOfAdmit, predito_rf)

#apresentando as metricas
cat("R²:", r2_valor, "\n")
cat("Syx:", syx_valor, "\n")
cat("Pearson:", pearson_valor, "\n")
cat("RMSE:", rmse_valor, "\n")
cat("MAE:", mae_valor, "\n")

Random Forest 

402 samples
  7 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 362, 362, 362, 362, 362, 362, ... 
Resampling results across tuning parameters:

  mtry  RMSE        Rsquared   MAE       
  2     0.06165044  0.8107474  0.04433226
  4     0.06181797  0.8097311  0.04443413
  7     0.06376245  0.7981134  0.04580971

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 2.

R²: 0.8115637 
Syx: 0.06503372 
Pearson: 0.9051472 
RMSE: 0.06266806 
MAE: 0.0428898 
