In [1]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = "../input")

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2.[31m9000[39m     [32m✔[39m [34mpurrr  [39m 0.3.4     
[32m✔[39m [34mtibble [39m 3.0.2          [32m✔[39m [34mdplyr  [39m 1.0.0     
[32m✔[39m [34mtidyr  [39m 1.1.0          [32m✔[39m [34mstringr[39m 1.4.0     
[32m✔[39m [34mreadr  [39m 1.3.1          [32m✔[39m [34mforcats[39m 0.5.0     

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [2]:
#####  Importing the training dataset and the validating dataset(without 'Target')  #####


data_origin <- read.csv('../input/titanic/train.csv')
data_train <- data_origin
data_validate <- read.csv('../input/titanic/test.csv')
data_to_clean <- list(data_train, data_validate)
data_to_clean_names <- list('data_train', 'data_Validate')

#####  Obtaining info about datasets  #####

for (i in 1:2){
  print(paste('Summary of the ',data_to_clean_names[i]))
  print(summary(data.frame(data_to_clean[i])))
}

# Checking if there is any empty or 'NaN' data
for (dataset in data_to_clean){
  print(data.frame(colSums(is.na(dataset) | dataset=='')))
}

#####  Replacing empty data with median and mode  #####

# Creating function to find mode()
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# Dealing with training set
data_train$Age <- ifelse(is.na(data_train$Age) | data_train$Age=='', 
                        ave(data_train$Age, FUN = function(x) median(x, na.rm = TRUE)),
                        data_train$Age);
data_train$Fare <- ifelse(is.na(data_train$Fare) | data_train$Fare=='', 
                         ave(data_train$Fare, FUN = function(x) median(x, na.rm = TRUE)),
                         data_train$Fare); 
data_train$Cabin <- ifelse(is.na(data_train$Cabin) | data_train$Cabin=='', 
                          0, 
                          1)
data_train$Embarked <- ifelse(is.na(data_train$Embarked) | data_train$Embarked=='', 
                             Mode(data_train$Embarked),
                             data_train$Embarked)

# Dealing with validating set
data_validate$Age <- ifelse(is.na(data_validate$Age) | data_validate$Age=='', 
                           ave(data_validate$Age, FUN = function(x) median(x, na.rm = TRUE)),
                           data_validate$Age);
data_validate$Fare <- ifelse(is.na(data_validate$Fare) | data_validate$Fare=='', 
                            ave(data_validate$Fare, FUN = function(x) median(x, na.rm = TRUE)),
                            data_validate$Fare); 
data_validate$Cabin <- ifelse(is.na(data_validate$Cabin) | data_validate$Cabin=='', 
                            0, 
                            1)
data_validate$Embarked <- ifelse(is.na(data_validate$Embarked) | data_validate$Embarked=='', 
                                ode(data_validate$Embarked),
                                data_validate$Embarked)

#####  Creating 'Title' category  #####

pattern <- ' ([a-zA-z]{2,})\\. '  # Text pattern to get title from the "Name" column

# Dealing with Training set
data_train$Title <- ''
for (i in 1:length(data_train$Name)) {
  m <- regexpr(pattern, data_train[i,'Name'], perl=FALSE, fixed=FALSE)
  data_train[i,'Title'] <- regmatches(data_train[i,'Name'], m)
}

# Dealing with Validating set
data_validate$Title <- ''
for (i in 1:length(data_validate$Name)) {
  m <- regexpr(pattern, data_validate[i,'Name'], perl=FALSE, fixed=FALSE)
  data_validate[i,'Title'] <- regmatches(data_validate[i,'Name'], m)
}

#####  Encoding Categorical data
title_set <- unique(c(data_train$Title,data_validate$Title))
title_labels <- seq(from = 1,to = length(title_set), by = 1)

data_to_clean <- list(data_train, data_validate)  # Collecting adjasted datasets

data_to_clean <- lapply(data_to_clean, function(df){
  df$Title <- factor(df$Title,
                     levels = title_set,
                     labels = title_labels)
  df$Sex <- factor(df$Sex,
                   levels = c('male', 'female'),
                   labels = c(0, 1))
  ### return ###
  df 
})
data_train <- data.frame(data_to_clean[1])  # Cleaned training data.frame
data_validate <- data.frame(data_to_clean[2])  # Cleaned validating data.frame

##### Droping Unnecessary variables  #####
data_train <- subset(data_train,TRUE, select=-c(PassengerId,Name,Ticket))
data_validate <- subset(data_validate,TRUE, select=-c(PassengerId,Name,Ticket))

##### Splitting Training dataset on train set and test set #####
library(caTools)
set.seed(0)
split = sample.split(data_train$Survived, SplitRatio = 0.8)
training_set <- subset(data_train, split == TRUE)
test_set <- subset(data_train, split == FALSE)


##### Feature-Scaling #####
# Data type checking (factor data type will be ignired for scaling process)
print(sapply(training_set,class))
print(sapply(data_validate,class))

# Scaling
training_set[-c(1,3,10)] <- scale(training_set[-c(1,3,10)])
test_set[-c(1,3,10)] <- scale(test_set[-c(1,3,10)])
data_validate[-c(2,9)] <- scale(data_validate[-c(2,9)])


[1] "Summary of the  data_train"
  PassengerId       Survived          Pclass     
 Min.   :  1.0   Min.   :0.0000   Min.   :1.000  
 1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000  
 Median :446.0   Median :0.0000   Median :3.000  
 Mean   :446.0   Mean   :0.3838   Mean   :2.309  
 3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000  
 Max.   :891.0   Max.   :1.0000   Max.   :3.000  
                                                 
                                    Name         Sex           Age       
 Abbing, Mr. Anthony                  :  1   female:314   Min.   : 0.42  
 Abbott, Mr. Rossmore Edward          :  1   male  :577   1st Qu.:20.12  
 Abbott, Mrs. Stanton (Rosa Hunt)     :  1                Median :28.00  
 Abelson, Mr. Samuel                  :  1                Mean   :29.70  
 Abelson, Mrs. Samuel (Hannah Wizosky):  1                3rd Qu.:38.00  
 Adahl, Mr. Mauritz Nils Martin       :  1                Max.   :80.00  
 (Other)                              :885       

In [3]:
#Define function
mypack <- function(package){
  new.package <- package[!(package %in% installed.packages()[, "Package"])]
  if (length(new.package)) 
    install.packages(new.package, dependencies = TRUE)
  sapply(package, require, character.only = TRUE)
}

packages = c('xgboost','e1071','rpart','ada','class','caret','randomForest')
mypack(packages)

Loading required package: xgboost


Attaching package: ‘xgboost’


The following object is masked from ‘package:dplyr’:

    slice


Loading required package: e1071

Loading required package: rpart

Loading required package: ada

Loading required package: class

Loading required package: caret

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift


The following object is masked from ‘package:httr’:

    progress


Loading required package: randomForest

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:dplyr’:

    combine


The following object is masked from ‘package:ggplot2’:

    margin




In [4]:
##########  Machine Learning Algorithm (MLA) Selection  ##########
#install.packages('Boruta', repos="https://cran.rstudio.com/",type = "source") ##only for first time run the script
library(xgboost)
library(e1071)
library(rpart)
library(ada)
library(class)
library(caret)
library(randomForest)

# Creating data.frame("MLA","Accuracy") for collecting the results of the predictions
MLA_accuracy <- data.frame(MLA = character(), Accuracy = double(),stringsAsFactors=FALSE)
y_test <- as.factor(test_set$Survived)  # Need factor format for confusionMatrix() function

# Checking if all Features are independent and necessary
library(Boruta)
glm_boruta <- Boruta(Survived ~.,data = training_set, doTrace=0)
roughFixMod <- TentativeRoughFix(glm_boruta)
print(attStats(roughFixMod)[order(-attStats(roughFixMod)$meanImp),])  # Descending order by meanImp

# Logistic Regression
class_glm <- glm(formula = Survived ~.,
                family = binomial,
                data = training_set)
p_pred_glm <- predict(class_glm, type = 'response', newdata = test_set[-1])
y_pred_glm <- ifelse(p_pred_glm > 0.5, 1, 0)
y_pred_glm <- as.factor(y_pred_glm)
cm_glm <- confusionMatrix(y_pred_glm, y_test)
MLA_accuracy[nrow(MLA_accuracy)+1,] <- list('LogReg',cm_glm$overall['Accuracy'])  # To "MLA-Accuracy"

# XGBoost
class_xgb <- xgboost(data = as.matrix(sapply(training_set[-1],as.numeric)), 
                    label = as.numeric(training_set$Survived),
                    nrounds = 10)
y_pred_xgb <- predict(class_xgb, newdata = as.matrix(sapply(test_set[-1],as.numeric)))
y_pred_xgb <- ifelse(y_pred_xgb>=0.5, 1, 0)
y_pred_xgb <- as.factor(y_pred_xgb)
cm_xgb <- confusionMatrix(y_pred_xgb,y_test)
MLA_accuracy[nrow(MLA_accuracy)+1,] <- list('XGBoost',cm_xgb$overall['Accuracy'])  # To "MLA-Accuracy"

# SVM
class_svm <- svm(formula = Survived ~.,
                data = training_set,
                type = 'C-classification')
y_pred_svm <- predict(class_svm, newdata = test_set[-1])
y_pred_svm <- as.factor(y_pred_svm)
cm_svm = confusionMatrix(y_pred_svm, y_test)
MLA_accuracy[nrow(MLA_accuracy)+1,] <- list('SVM',cm_svm$overall['Accuracy'])  # To "MLA-Accuracy"

# Naive Bayes
class_nb <- naiveBayes(as.factor(Survived) ~ .,
                      training_set)
y_pred_nb <- predict(class_nb, as.data.frame(test_set[-1]), type='class')
cm_nb <- confusionMatrix(y_pred_nb,y_test)
MLA_accuracy[nrow(MLA_accuracy)+1,] <- list('NB',cm_nb$overall['Accuracy'])  # To "MLA-Accuracy"

# Decision Trees(CART)
class_dtree <- rpart(formula = Survived ~.,
                    data = training_set,
                    method = 'class')
y_pred_dtree <- predict(class_dtree, newdata = test_set[-1], type='class')
cm_dtree <- confusionMatrix(y_pred_dtree,y_test)
MLA_accuracy[nrow(MLA_accuracy)+1,] <- list('DTree',cm_dtree$overall['Accuracy'])  # To "MLA-Accuracy"

# AdaBoost
class_ada <- ada(formula = Survived ~.,
            data = training_set)
y_pred_ada <- predict(class_ada, newdata = test_set[-1])
cm_ada <- confusionMatrix(y_pred_ada,y_test)
MLA_accuracy[nrow(MLA_accuracy)+1,] <- list('ADA',cm_ada$overall['Accuracy'])  # To "MLA-Accuracy"

# KNN
y_pred_knn <- knn(train = training_set[,-1],
                 test = test_set[,-1], 
                 cl = training_set[,1],
                 k = 5)
cm_knn <- confusionMatrix(y_pred_knn,y_test)
MLA_accuracy[nrow(MLA_accuracy)+1,] <- list('KNN',cm_knn$overall['Accuracy'])  # To "MLA-Accuracy"

# Random Forest
class_rforest <- randomForest(formula = Survived ~.,
                             data = training_set,
                             ntree = 100)
y_pred_rforest <- predict(class_rforest, newdata = test_set, type = 'response')
y_pred_rforest <- as.factor(ifelse(y_pred_rforest>0.5, 1, 0))
cm_rforest <- confusionMatrix(y_pred_rforest,y_test)
MLA_accuracy[nrow(MLA_accuracy)+1,] <- list('RForest',cm_rforest$overall['Accuracy'])  # To "MLA-Accuracy"

# Accuracy results:
print("MLA Accuracy Results:")
print(MLA_accuracy[order(-MLA_accuracy$Accuracy), ])

“There are no Tentative attributes! Returning original object.”


           meanImp medianImp    minImp    maxImp normHits  decision
Sex      29.973448 29.899924 28.681728 32.792260        1 Confirmed
Pclass   29.014857 29.095628 26.398744 31.593829        1 Confirmed
Title    26.586996 26.388754 25.149277 29.036027        1 Confirmed
Fare     25.003639 25.213584 23.566298 25.916207        1 Confirmed
SibSp    15.663664 15.983752 13.147619 17.254656        1 Confirmed
Age      14.916491 14.864870 12.581940 17.811070        1 Confirmed
Cabin    14.410920 13.866709 12.522547 17.726039        1 Confirmed
Embarked  9.633698  9.351308  6.862977 12.679042        1 Confirmed
Parch     6.798006  6.556574  5.416862  7.870987        1 Confirmed
[1]	train-rmse:0.411122 
[2]	train-rmse:0.356199 
[3]	train-rmse:0.316868 
[4]	train-rmse:0.292126 
[5]	train-rmse:0.278784 
[6]	train-rmse:0.271560 
[7]	train-rmse:0.259003 
[8]	train-rmse:0.244513 
[9]	train-rmse:0.240038 
[10]	train-rmse:0.233490 


“The response has five or fewer unique values.  Are you sure you want to do regression?”


[1] "MLA Accuracy Results:"
      MLA  Accuracy
6     ADA 0.7865169
1  LogReg 0.7696629
5   DTree 0.7584270
3     SVM 0.7528090
8 RForest 0.7528090
7     KNN 0.7471910
2 XGBoost 0.7359551
4      NB 0.7359551


In [11]:
# Too heavy for a fast run on Kaggle's CPUs


# Tunning the MLA parameters 
fitControl = trainControl(
  method = 'LOOCV',
  number = 5,
  #repeats = 10,
  savePredictions = 'all',
  #classProbs = TRUE
)
TrainData <- training_set
TrainData$Title <- make.names(TrainData$Title)
TrainData$Sex <- make.names(TrainData$Sex)
TrainData$Survived <- make.names(TrainData$Survived)

gmbGrid = expand.grid(
  iter = c(45,50,55),
  nu=0.1,
  maxdepth = c(2,3,4,5)
)

#ML_methods = c('ada', 'xgbLinear')

gbmFit1 <- train(Survived~., TrainData, 
                 method = 'ada', 
                 trControl = fitControl
                 ## This last option is actually one
                 ## for gbm() that passes through
                 #verbose = FALSE
                 #tuneGrid = gmbGrid
                 )

gbmFit1




ERROR: Error in parse(text = x, srcfile = src): <text>:3:3: unexpected INCOMPLETE_STRING
35: 
36: 
      ^


In [None]:
# Next we'll need to try Yandex's ML product - Catboost