In [1]:
# Data processing script
#
# Working directory: the relative file paths used by the read/write calls in
# this script are resolved against it.
wd.path <- "~/in"
setwd(wd.path)
# Fix the random number generator seed for this session
set.seed(1234)
library(dplyr)
library(data.table)            # to get fread
# library(foreach)
# library(caret)
# library(reshape2)
library(rms)
library(WeightedROC)

# Statistical mode (most frequent value) of a vector.
# Adapted from: https://stackoverflow.com/questions/2547402/is-there-a-built-in-function-for-finding-the-mode
#
# x     - vector to summarise
# na.rm - when TRUE, NA entries are removed before counting; when FALSE, NA
#         is counted like any other value and can be returned as the mode
#
# Returns the value with the highest count; on ties, the value that appears
# first in x wins (which.max returns the first maximum).
Mode <- function(x, na.rm = FALSE) {
  values <- if (na.rm) x[!is.na(x)] else x

  distinct <- unique(values)
  # Position of each element within `distinct`, then occurrences per position
  counts <- tabulate(match(values, distinct))
  distinct[which.max(counts)]
}

# Fill missing values column by column and return only the imputed columns.
#
# imputeToData   - data frame on which the imputation is performed
# imputeFromData - data frame from which the fill values are calculated
# FUN            - summary function called as FUN(column, na.rm = TRUE),
#                  e.g. median, mean or Mode
# missingCols    - names of the columns to impute
# suffix         - suffix appended to every returned column name
#
# Returns a data frame holding the columns of imputeToData that are listed in
# missingCols, with their NA entries replaced by FUN of the same-named column
# in imputeFromData, and with `suffix` appended to each name. Columns that do
# not exist in imputeFromData are returned unfilled.
imputationFunction <- function(imputeToData, imputeFromData, FUN, missingCols, suffix) {
  # Plain data-frame semantics for column indexing, whatever was passed in
  imputeToData <- as.data.frame(imputeToData)
  imputeFromData <- as.data.frame(imputeFromData)

  # Keep the original column order; drop = FALSE keeps a data frame even when
  # only a single column is selected
  targetCols <- names(imputeToData)[names(imputeToData) %in% missingCols]
  imputeToData <- imputeToData[, targetCols, drop = FALSE]

  for (colName in targetCols) {
    if (!colName %in% names(imputeFromData)) {
      next
    }
    # Compute the fill value on the column itself so that each column keeps
    # its own type (apply() would coerce the whole frame to one matrix type)
    fillValue <- FUN(imputeFromData[[colName]], na.rm = TRUE)
    isMissing <- is.na(imputeToData[[colName]])
    imputeToData[[colName]][isMissing] <- fillValue
  }

  names(imputeToData) <- paste0(names(imputeToData), suffix)
  imputeToData
}


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

Loading required package: Hmisc
Loading required package: lattice
Loading required package: survival
Loading required package: Formula
Loading required package: ggplot2

Attaching package: ‘Hmisc’

The following objects are masked from ‘package:dplyr’:

    combine, src, summarize

The following objects are masked from ‘package:base’:

    format.pval, round.POSIXt, trunc.POSIXt, units

Loading required package: SparseM

Attaching package: ‘SparseM’

The following object is masked from ‘package:base’:

    backsolve



# Initial Data Processing: join in Weight and Fold

In [4]:
# Find out the memory usage of each object in the global environment, listed
# from smallest to largest.
# Sizes are sorted as byte counts and only formatted afterwards; sorting the
# formatted strings would order them alphabetically ("10 Mb" before "2 Kb").
# local() keeps the helper variable out of the workspace being measured.
local({
  obj_bytes <- vapply(
    ls(envir = globalenv()),
    # Look the name up in the global environment explicitly so that an object
    # that happens to share the name of this function's argument is still
    # measured correctly
    function(obj_name) as.numeric(object.size(get(obj_name, envir = globalenv()))),
    numeric(1)
  )
  vapply(
    sort(obj_bytes),
    function(n_bytes) format(structure(n_bytes, class = "object_size"), units = "auto"),
    character(1)
  )
})

In [3]:
# Load the pre-processed train/test tables and time the read
print(Sys.time())
train <- fread("train_modified.csv") # applications train data
test <- fread("test_modified.csv") # applications test data
print(Sys.time())

# storing column names for later
saveNames <- names(train)

# columns with missing values
# anyNA() is evaluated per column; apply() would first coerce the whole table
# into a single matrix just to count the NAs
missingCols <- names(train)[vapply(train, anyNA, logical(1))]

[1] "2018-08-28 12:58:46 UTC"
Read 307511 rows and 3205 (of 3205) columns from 4.639 GB file in 00:01:53
Read 48744 rows and 3202 (of 3202) columns from 0.751 GB file in 00:00:15
[1] "2018-08-28 13:00:53 UTC"


Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,2216381,118.4,3886542,207.6,2637877,140.9
Vcells,972410063,7419.0,1227364503,9364.1,981527564,7488.5


In [None]:
# Show the names of the columns that contain at least one missing value
print(missingCols)

In [None]:
# Peek at the first few values of the Weights and fold columns of train
print(head(train$Weights))
print(head(train$fold))
# Both columns are already there, so there is no need to load the weights separately

# Prepare Data Imputation

In [None]:
# Mechanically create a formula for imputation: TARGET on the left-hand side
# and every other column of train as a plain additive term on the right.
M <- train

col_set <- names(M)
col_set <- col_set[!col_set %in% "TARGET"]

# Every column enters as "+<name>" regardless of its type, so no per-column
# class check is needed and the string can be assembled in one vectorised step
string_formula <- paste0("TARGET~", paste0("+", col_set, collapse = ""))
requested_formula <- as.formula(string_formula)
requested_formula

In [None]:
# Summarise the missing-value patterns of train and plot them.
# NOTE(review): naclus/naplot presumably come from Hmisc (attached via rms);
# 'na per var' selects which of naplot's displays is drawn - confirm against
# the Hmisc documentation.
na.patterns <- naclus(train)
naplot(na.patterns, 'na per var')

In [None]:
# Manually convert some of the 'Unknown' levels to NA
# Every cell equal to the string 'XNA' or 'Unknown' is overwritten with NA,
# then droplevels() discards factor levels that no longer have observations.
# NOTE(review): the comparison is evaluated over the whole table at once, which
# is memory-hungry on a table of this size - confirm it fits before running.
train_imp = train
train_imp[train_imp == 'XNA' | train_imp == 'Unknown'] <- NA
train_imp = droplevels(train_imp)

# Code example to see the use of dropLevels, dropping factor level with no observation
# train2 <- subset(train_imp, select= c(CODE_GENDER, NAME_FAMILY_STATUS))
# str(train2)
# train2 = droplevels(train2)
# str(train2)

In [None]:
# Small sample for experimenting: the first five rows of train, restricted to
# the listed application columns, with the same 'XNA'/'Unknown' -> NA cleanup
# and unused factor levels dropped.
# Note that the rows are taken from train, not from train_imp.
train_imp_small <- train[1:5, c('TARGET','NAME_CONTRACT_TYPE','CODE_GENDER',
    'FLAG_OWN_CAR','FLAG_OWN_REALTY','CNT_CHILDREN','AMT_INCOME_TOTAL',
    'AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE','REGION_POPULATION_RELATIVE','DAYS_BIRTH',
    'DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH','OWN_CAR_AGE',
    'FLAG_MOBIL','FLAG_EMP_PHONE')]
train_imp_small[train_imp_small == 'XNA' | train_imp_small == 'Unknown'] <- NA
train_imp_small = droplevels(train_imp_small)

In [None]:
# Multiple imputation on train_imp with aregImpute, using the variables listed
# in the one-sided formula. FLAG_OWN_CAR and NAME_INCOME_TYPE are commented
# out of the formula and therefore not used.
# NOTE(review): n.impute=10 presumably requests ten imputations, nk=4 the
# number of spline knots and pr=FALSE suppresses progress printing - confirm
# against the Hmisc aregImpute documentation.
set.seed(17) # so can reproduce random aspects
mi <- aregImpute(~ TARGET + NAME_CONTRACT_TYPE + CODE_GENDER + 
    #FLAG_OWN_CAR + 
                 FLAG_OWN_REALTY + CNT_CHILDREN + AMT_INCOME_TOTAL + 
    AMT_CREDIT + AMT_ANNUITY + AMT_GOODS_PRICE + NAME_TYPE_SUITE + 
    #NAME_INCOME_TYPE + 
                 NAME_EDUCATION_TYPE + NAME_FAMILY_STATUS + 
    NAME_HOUSING_TYPE + REGION_POPULATION_RELATIVE + DAYS_BIRTH + 
    DAYS_EMPLOYED + DAYS_REGISTRATION + DAYS_ID_PUBLISH + OWN_CAR_AGE + 
    FLAG_MOBIL + FLAG_EMP_PHONE + FLAG_WORK_PHONE + FLAG_CONT_MOBILE + 
    FLAG_PHONE + FLAG_EMAIL + OCCUPATION_TYPE + CNT_FAM_MEMBERS, data=train_imp, n.impute=10, nk=4, pr=FALSE)

In [None]:
 + 
    REGION_RATING_CLIENT + REGION_RATING_CLIENT_W_CITY + WEEKDAY_APPR_PROCESS_START + 
    HOUR_APPR_PROCESS_START + REG_REGION_NOT_LIVE_REGION + REG_REGION_NOT_WORK_REGION + 
    LIVE_REGION_NOT_WORK_REGION + REG_CITY_NOT_LIVE_CITY + REG_CITY_NOT_WORK_CITY + 
    LIVE_CITY_NOT_WORK_CITY + ORGANIZATION_TYPE + EXT_SOURCE_1 + 
    EXT_SOURCE_2 + EXT_SOURCE_3 + APARTMENTS_AVG + BASEMENTAREA_AVG + 
    YEARS_BEGINEXPLUATATION_AVG + YEARS_BUILD_AVG + COMMONAREA_AVG + 
    ELEVATORS_AVG + ENTRANCES_AVG + FLOORSMAX_AVG + FLOORSMIN_AVG + 
    LANDAREA_AVG + LIVINGAPARTMENTS_AVG + LIVINGAREA_AVG + NONLIVINGAPARTMENTS_AVG + 
    NONLIVINGAREA_AVG + APARTMENTS_MODE + BASEMENTAREA_MODE + 
    YEARS_BEGINEXPLUATATION_MODE + YEARS_BUILD_MODE + COMMONAREA_MODE + 
    ELEVATORS_MODE + ENTRANCES_MODE + FLOORSMAX_MODE + FLOORSMIN_MODE + 
    LANDAREA_MODE + LIVINGAPARTMENTS_MODE + LIVINGAREA_MODE + 
    NONLIVINGAPARTMENTS_MODE + NONLIVINGAREA_MODE + APARTMENTS_MEDI + 
    BASEMENTAREA_MEDI + YEARS_BEGINEXPLUATATION_MEDI + YEARS_BUILD_MEDI + 
    COMMONAREA_MEDI + ELEVATORS_MEDI + ENTRANCES_MEDI + FLOORSMAX_MEDI + 
    FLOORSMIN_MEDI + LANDAREA_MEDI + LIVINGAPARTMENTS_MEDI + 
    LIVINGAREA_MEDI + NONLIVINGAPARTMENTS_MEDI + NONLIVINGAREA_MEDI + 
    FONDKAPREMONT_MODE + HOUSETYPE_MODE + TOTALAREA_MODE + WALLSMATERIAL_MODE + 
    EMERGENCYSTATE_MODE + OBS_30_CNT_SOCIAL_CIRCLE + DEF_30_CNT_SOCIAL_CIRCLE + 
    OBS_60_CNT_SOCIAL_CIRCLE + DEF_60_CNT_SOCIAL_CIRCLE + DAYS_LAST_PHONE_CHANGE + 
    FLAG_DOCUMENT_2 + FLAG_DOCUMENT_3 + FLAG_DOCUMENT_4 + FLAG_DOCUMENT_5 + 
    FLAG_DOCUMENT_6 + FLAG_DOCUMENT_7 + FLAG_DOCUMENT_8 + FLAG_DOCUMENT_9 + 
    FLAG_DOCUMENT_10 + FLAG_DOCUMENT_11 + FLAG_DOCUMENT_12 + 
    FLAG_DOCUMENT_13 + FLAG_DOCUMENT_14 + FLAG_DOCUMENT_15 + 
    FLAG_DOCUMENT_16 + FLAG_DOCUMENT_17 + FLAG_DOCUMENT_18 + 
    FLAG_DOCUMENT_19 + FLAG_DOCUMENT_20 + FLAG_DOCUMENT_21 + 
    AMT_REQ_CREDIT_BUREAU_HOUR + AMT_REQ_CREDIT_BUREAU_DAY + 
    AMT_REQ_CREDIT_BUREAU_WEEK + AMT_REQ_CREDIT_BUREAU_MON + 
    AMT_REQ_CREDIT_BUREAU_QRT + AMT_REQ_CREDIT_BUREAU_YEAR

In [None]:
# Read the raw application data and build median-imputed copies of every
# column that has missing values in train.
test <- read.csv("application_test.csv") # applications test data
train <- read.csv("application_train.csv") # applications train data

# storing column names for later
saveNames <- names(train)

# columns with missing values
# anyNA() is evaluated per column; apply() would first coerce the whole data
# frame into a single matrix just to count the NAs
missingCols <- names(train)[vapply(train, anyNA, logical(1))]

# train imputation: medians computed on train, columns suffixed '.trmedian'
medianDatatr <- imputationFunction(
  imputeToData = train, imputeFromData = train, FUN = median,
  missingCols = missingCols, suffix = ".trmedian"
)

# test imputation: medians computed on test itself, columns suffixed '.tstmedian'
medianDatatst.tst <- imputationFunction(
  imputeToData = test, imputeFromData = test, FUN = median,
  missingCols = missingCols, suffix = ".tstmedian"
)

## Initial Transformation to get to the Imputed Dataset

In [None]:
# Append the imputed columns to train/test, join the weights and fold
# assignment onto train by SK_ID_CURR, and persist both tables.
weights <- read.csv("Weights_and_fold.csv")
print("Weights loaded")

train <- left_join(cbind(train, medianDatatr), weights, by = "SK_ID_CURR")
write.csv(train, "application_train_imp.csv")

test <- cbind(test, medianDatatst.tst)
write.csv(test, "application_test_imp.csv")

# The intermediate objects are no longer needed once the files are written
rm(medianDatatr, medianDatatst.tst, weights)

In [None]:
# Replace the Weights column of the saved train table with the V2 weights.
weights <- read.csv("V2_weight_R_code(Alex)/Train_Weights_V2.csv")
train <- read.csv("application_train_imp.csv") # applications train data

# Drop the old Weights and the X column, join the new weights by SK_ID_CURR,
# then drop the X column again after the join
train <- select(
  left_join(
    select(train, -Weights, -X),
    weights,
    by = "SK_ID_CURR"
  ),
  -X
)
write.csv(train, "application_train_imp.csv")

## Start Here for Subsequent Runs, read in the processed dataset

In [None]:
# Reload the imputed tables written by the earlier cells
test <- read.csv('application_test_imp.csv') # applications test data
train <- read.csv('application_train_imp.csv') # applications train data

In [None]:
# Mechanically create a formula for rms modeling: TARGET on the left-hand
# side; on the right, columns whose class is exactly "numeric" enter as a
# 3-knot restricted cubic spline, rcs(<name>,3), and all other columns enter
# as plain terms.
M <- train

col_set <- names(M)
col_set <- col_set[!col_set %in% "TARGET"]

# identical() yields a single TRUE/FALSE per column even when a column carries
# more than one class, which a scalar `if (class(x) == "numeric")` cannot handle
is_numeric_col <- vapply(
  col_set,
  function(col_name) identical(class(M[[col_name]]), "numeric"),
  logical(1)
)
formula_terms <- ifelse(
  is_numeric_col,
  paste0("+rcs(", col_set, ",3)"),
  paste0("+", col_set)
)

string_formula <- paste0("TARGET~", paste(formula_terms, collapse = ""))
requested_formula <- as.formula(string_formula)
requested_formula

### Base Model + AMT_CREDIT (Model FF_rms_2)

In [None]:
# Perform 5 fold cross validation of the base model plus AMT_CREDIT.
# For each fold i the rows with fold == i are held out, a weighted lrm is
# fitted on the remaining rows and the held-out rows are scored.
# Per fold results: model[[i]] (fitted model), mod_anova[[i]] (anova table),
# test_roc[[i]] (weighted AUROC on the held-out rows).
model <- list()
mod_anova <- list()
test_roc <- list()

# The formula does not depend on the fold, so it is built once. It is written
# as a formula literal, hence as.formula() returns it unchanged; both names are
# kept defined as before.
string_formula <- TARGET ~ NAME_CONTRACT_TYPE + CODE_GENDER + FLAG_OWN_CAR + FLAG_OWN_REALTY +
    rcs(EXT_SOURCE_1.trmedian, 4) + rcs(EXT_SOURCE_2.trmedian, 4) +
    rcs(EXT_SOURCE_3.trmedian, 4) + rcs(AMT_CREDIT, 3)
requested_formula <- as.formula(string_formula)

for (i in 1:5) {
    # which() skips rows whose fold is NA instead of turning them into all-NA rows
    testData <- train[which(train$fold == i), ]
    trainData <- train[which(train$fold != i), ]
    a_model <- lrm(requested_formula, data = trainData, weights = Weights,
                   tol = 1e-10, x = TRUE, y = TRUE)

    # diagnostic plots
    ## Variable Importance
    mod_anova[[i]] <- anova(a_model)

    ## Validation Statistics
    print(validate(a_model, B = 10))
    model[[i]] <- a_model

    ## Calibration Curve
    # plot(calibrate(a_model, B=10))

    # use a fitted model to score a dataset, convert score to probability
    test_scored <- cbind(testData, predict(a_model, testData, se.fit = TRUE))
    # plogis() is the inverse logit; unlike exp(x)/(1+exp(x)) it does not
    # overflow to NaN for large linear predictors
    test_scored$pred <- plogis(test_scored$linear.predictors)

    # Weighted AUROC
    tp.fp <- WeightedROC(test_scored$pred, test_scored$TARGET, test_scored$Weights)
    test_roc[[i]] <- WeightedAUC(tp.fp)
    print(test_roc[[i]])

    # Unweighted AUROC
    tp.fp <- WeightedROC(test_scored$pred, test_scored$TARGET)
    print(WeightedAUC(tp.fp))
}

In [None]:
# Perform 5 fold cross validation of the base model (without AMT_CREDIT).
# For each fold i the rows with fold == i are held out, a weighted lrm is
# fitted on the remaining rows and the held-out rows are scored.
# Per fold results: model[[i]] (fitted model), mod_anova[[i]] (anova table),
# test_roc[[i]] (weighted AUROC on the held-out rows).
model <- list()
mod_anova <- list()
test_roc <- list()

# The formula does not depend on the fold, so it is built once. It is written
# as a formula literal, hence as.formula() returns it unchanged; both names are
# kept defined as before.
string_formula <- TARGET ~ NAME_CONTRACT_TYPE + CODE_GENDER + FLAG_OWN_CAR + FLAG_OWN_REALTY +
    rcs(EXT_SOURCE_1.trmedian, 4) + rcs(EXT_SOURCE_2.trmedian, 4) +
    rcs(EXT_SOURCE_3.trmedian, 4)
requested_formula <- as.formula(string_formula)

for (i in 1:5) {
    # which() skips rows whose fold is NA instead of turning them into all-NA rows
    testData <- train[which(train$fold == i), ]
    trainData <- train[which(train$fold != i), ]
    a_model <- lrm(requested_formula, data = trainData, weights = Weights,
                   tol = 1e-10, x = TRUE, y = TRUE)

    # diagnostic plots
    ## Variable Importance
    mod_anova[[i]] <- anova(a_model)

    ## Validation Statistics
    print(validate(a_model, B = 10))
    model[[i]] <- a_model

    ## Calibration Curve
    # plot(calibrate(a_model, B=10))

    # use a fitted model to score a dataset, convert score to probability
    test_scored <- cbind(testData, predict(a_model, testData, se.fit = TRUE))
    # plogis() is the inverse logit; unlike exp(x)/(1+exp(x)) it does not
    # overflow to NaN for large linear predictors
    test_scored$pred <- plogis(test_scored$linear.predictors)

    # Weighted AUROC
    tp.fp <- WeightedROC(test_scored$pred, test_scored$TARGET, test_scored$Weights)
    test_roc[[i]] <- WeightedAUC(tp.fp)
    print(test_roc[[i]])

    # Unweighted AUROC
    tp.fp <- WeightedROC(test_scored$pred, test_scored$TARGET)
    print(WeightedAUC(tp.fp))
}

In [None]:
# Investigate why adding AMT_CREDIT and AMT_INCOME_TOTAL to the model leads to
# a singular information matrix: first look at their distributions ...
train_samp_col <- select(train, AMT_CREDIT, AMT_INCOME_TOTAL)
hist(train_samp_col)
# ... the values look unremarkable

# ... then at their correlation with the imputed EXT_SOURCE predictors
cor(trainData[, c(
  "EXT_SOURCE_1.trmedian",
  "EXT_SOURCE_2.trmedian",
  "EXT_SOURCE_3.trmedian",
  "AMT_CREDIT",
  "AMT_INCOME_TOTAL"
)])

### Train FF_rms_2 on the full dataset and create submission

In [None]:
# Fit the base model plus AMT_CREDIT on the full training table.
# The formula is written as a formula literal, hence as.formula() returns it
# unchanged; both names are kept defined as before.
string_formula <- TARGET ~ NAME_CONTRACT_TYPE + CODE_GENDER + FLAG_OWN_CAR + FLAG_OWN_REALTY +
    rcs(EXT_SOURCE_1.trmedian, 4) + rcs(EXT_SOURCE_2.trmedian, 4) +
    rcs(EXT_SOURCE_3.trmedian, 4) + rcs(AMT_CREDIT, 3)
requested_formula <- as.formula(string_formula)
# The weights argument is spelled out in full instead of relying on partial
# argument matching
a_model <- lrm(requested_formula, data = train, weights = Weights,
               tol = 1e-10, x = TRUE, y = TRUE)

# Diagnostic Plots
## Variable Importance
plot(anova(a_model))
## Partial Effect Plot
# Predict() looks up the data distribution summary named in options(datadist=)
dd <- datadist(train)
options(datadist = "dd")
ggplot(Predict(a_model), sepdiscrete = "vertical", vnames = "names")

In [None]:
# Print the fitted model
a_model

In [None]:
# Copy the test-set imputed EXT_SOURCE columns (".tstmedian") to the
# ".trmedian" names, one column per EXT_SOURCE index
for (ext_col in paste0("EXT_SOURCE_", 1:3)) {
  test[[paste0(ext_col, ".trmedian")]] <- test[[paste0(ext_col, ".tstmedian")]]
}

In [None]:
# Score the test table with the fitted model and write the submission file.
test_scored <- cbind(test, predict(a_model, test, se.fit = TRUE))
# Convert the linear predictor to a probability. plogis() is the inverse
# logit; unlike exp(x)/(1+exp(x)) it does not overflow to NaN for large
# linear predictors.
test_scored$TARGET <- plogis(test_scored$linear.predictors)
submission <- test_scored %>% select(SK_ID_CURR, TARGET)
# NOTE(review): write.csv also writes the row names as an unnamed first
# column - confirm whether the consumer of this file expects exactly the two
# columns SK_ID_CURR and TARGET (row.names = FALSE).
write.csv(submission, "test_scored.csv")