# Loading Packages

In [None]:
library(dplyr)
library(reshape)
library(data.table)
library(data.table)
library(formattable)
library(gridExtra)
library(funModeling)
library(IRdisplay)

# Loading Data

In [None]:
# Directory that holds the raw input files.
input_dir <- "../input/"
# Every file below input_dir, searched recursively and returned with its path.
csv_files <- list.files(input_dir, recursive = TRUE, full.names = TRUE)
# Keep only paths that end in ".csv" (any letter case). The dot is escaped and
# the pattern is anchored so that names merely containing the letters "csv"
# somewhere in the path are not picked up.
csv_files <- csv_files[grepl("\\.csv$", csv_files, ignore.case = TRUE)]
csv_files

In [None]:
# Fail early with a clear message when the search found no CSV file;
# otherwise read.csv() would be handed NA and abort with an obscure error.
if (length(csv_files) == 0) {
  stop("No CSV files were found, so there is nothing to load.", call. = FALSE)
}
# Load the first CSV found, keeping text columns as character vectors.
data <- read.csv(csv_files[1], stringsAsFactors = FALSE)

# Exploratory Data Analysis

## Structure & Dimension

In [None]:
# Report the number of rows and columns, then preview the first rows.
dim(data)
head(data)

## Profiling Data Input

One of the first steps when analyzing a new dataset is to check whether it contains missing values (NA in R) and to identify the data type of each column.

The **df_status** function from funModeling helps by reporting these numbers both as absolute counts and as percentages. It also reports statistics on infinite values and zeros.

* **q_zeros:** quantity of zeros (p_zeros: in percent)
* **q_inf:** quantity of infinite values (p_inf: in percent)
* **q_na:** quantity of NA (p_na: in percent)
* **type:** factor or numeric
* **unique:** quantity of unique values

In [None]:
# Per-column profile: zeros, NAs, infinite values, type and unique counts.
df_status(data)

## Missing Value Analysis

In [None]:
#' Plot the number and share of missing values per feature
#'
#' Counts the NA values in every column of `data`, assigns each feature to a
#' group by its missing share (below 5% "Good", below 40% "OK", below 80%
#' "Bad", otherwise "Remove") and prints a horizontal bar chart whose bars are
#' labelled with the missing percentage.
#'
#' @param data A data.table, or any object `data.table()` can convert.
#' @param title Optional plot title.
#' @param ggtheme Complete ggplot2 theme to apply. Defaults to
#'   `theme_linedraw()`, the look this function has always produced.
#'   Previously a hard-coded `theme_linedraw()` was added after `ggtheme` and
#'   silently replaced whatever theme was passed in.
#' @param theme_config Named list of arguments forwarded to `theme()`.
#' @return Invisibly, the per-feature summary with the columns `feature`,
#'   `num_missing`, `pct_missing` and `group`. Its class is set to the class
#'   of the original `data` when that was not a data.table.
plot_missing <- function(data, title = NULL, ggtheme = theme_linedraw(), theme_config = list("legend.position" = c("bottom"))) {
  ## Declare variables first to pass R CMD check
  feature <- num_missing <- pct_missing <- group <- NULL
  ## Remember what was passed in so the result can carry the same class
  is_data_table <- is.data.table(data)
  data_class <- class(data)
  if (!is_data_table) data <- data.table(data)
  ## Count missing values per column; vapply guarantees an integer vector
  missing_value <- data.table(
    "feature" = names(data),
    "num_missing" = vapply(
      data,
      function(x) sum(is.na(x)),
      integer(1),
      USE.NAMES = FALSE
    )
  )
  ## Order the factor levels so the bars are sorted by missing count
  missing_value[, feature := factor(feature, levels = feature[order(-rank(num_missing))])]
  missing_value[, pct_missing := num_missing / nrow(data)]
  missing_value[pct_missing < 0.05, group := "Good"]
  missing_value[pct_missing >= 0.05 & pct_missing < 0.4, group := "OK"]
  missing_value[pct_missing >= 0.4 & pct_missing < 0.8, group := "Bad"]
  missing_value[pct_missing >= 0.8, group := "Remove"][]
  ## Set data class back to original
  if (!is_data_table) class(missing_value) <- data_class
  ## Create ggplot object (aes() replaces the deprecated aes_string())
  output <- ggplot(missing_value, aes(x = feature, y = num_missing, fill = group)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = paste0(round(100 * pct_missing, 2), "%"))) +
    scale_fill_manual("Group", values = c("Good" = "#1a9641", "OK" = "#a6d96a", "Bad" = "#fdae61", "Remove" = "#d7191c"), breaks = c("Good", "OK", "Bad", "Remove")) +
    scale_y_continuous(labels = comma) +
    coord_flip() +
    xlab("Features") + ylab("Number of missing rows") +
    ggtitle(title) +
    ggtheme +
    do.call(theme, theme_config)
  ## Print plot
  print(output)
  ## Return the summary without printing it
  invisible(missing_value)
}

In [None]:
# Plot the count and share of missing values for every column of the data.
plot_missing(data)

## Categorical Feature Analysis

In [None]:
#' Convert every column to a factor and keep the low-cardinality ones
#'
#' @param df A data.frame or tibble.
#' @param max_levels Largest number of factor levels a column may have and
#'   still be kept. Defaults to 50, the threshold the function is named after.
#' @return `df` with every column converted by `as.factor()` and every column
#'   that has more than `max_levels` levels removed.
getDataFrameWith50Categories <- function(df, max_levels = 50L) {
  # Nothing to convert or filter when there are no columns.
  if (ncol(df) == 0L) {
    return(df)
  }
  factor_df <- df
  factor_df[] <- lapply(df, as.factor)
  # Count levels column by column. The previous `df[, name]` lookup returned a
  # one-column data frame for tibbles, whose levels() is NULL, so no column
  # was ever dropped for such inputs.
  keep <- vapply(factor_df, nlevels, integer(1)) <= max_levels
  factor_df[keep]
}
                           
# Factor-encoded copy of the data, restricted to columns with at most 50 levels.
categoricalData <- getDataFrameWith50Categories(data)

### Describe Categorical Features

In [None]:
# Summarise each categorical column.
# NOTE(review): describe() is presumably Hmisc's, attached through funModeling - confirm.
describe(categoricalData)

### Categorical Features Plotting

In [None]:
#' Frequency tables (and optional bar charts) for categorical variables
#'
#' Runs `freq_logic()` for one or several variables. When `input` is NA the
#' variables are chosen automatically: a plain vector is wrapped into a
#' one-column data frame, otherwise every column that `df_status()` reports as
#' "factor" or "character" is used.
#'
#' @param data A data frame, or a single vector.
#' @param input Name(s) of the variable(s) to analyse; NA selects automatically.
#' @param str_input Deprecated alias of `input`.
#' @param plot Whether `freq_logic()` should draw the bar chart.
#' @param na.rm Whether NA values are left out of the frequency table.
#' @param path_out Optional output directory forwarded to `freq_logic()`; NA
#'   when omitted.
#' @return For a single variable, the frequency table returned by
#'   `freq_logic()`. For several variables each table is printed and NULL is
#'   returned invisibly.
freq <- function(data, input=NA, str_input=NA, plot=TRUE, na.rm=FALSE, path_out)
{
  if (!missing(str_input)) {
    input <- str_input
    .Deprecated(msg="Parameter 'str_input' will be deprecated, please use 'input' insted (only name changed, not its functionality)")
  }

  if (missing(path_out)) path_out <- NA

  ## If input is NA then it runs for all variables in case it is not a single
  ## vector. anyNA() replaces the former `sum(is.na(input)>0)`, whose closing
  ## parenthesis was misplaced and which only worked through numeric coercion.
  if (anyNA(input)) {
    if (mode(data) %in% c("logical", "numeric", "complex", "character")) {
      ## A single vector: wrap it so it can be addressed by name
      data <- data.frame(var = data)
      input <- "var"
    } else {
      ## Keep all categorical variables
      data <- data.frame(data)
      status <- df_status(data, print_results = FALSE)
      input <- status[status$type %in% c("factor", "character"), 'variable']
      if (length(input) == 0) {
        stop("None of the input variables are factor nor character")
      }
    }
  }

  if (length(input) == 1) {
    display_markdown(paste("<h1><center>Categorical Feature : ",input,"</h1></center>"))
    res <- freq_logic(data = data, input = input, plot = plot, na.rm = na.rm, path_out = path_out)
    return(res)
  }

  ## seq_along() is a no-op for an empty `input`, unlike 1:0
  for (i in seq_along(input)) {
    res <- freq_logic(data = data, input = input[i], plot = plot, na.rm = na.rm, path_out = path_out)
    print(paste("Categorical Feature :", input[i]))
    print(res)
    cat("", sep = "\n")
  }

  invisible(NULL)
}

#' Frequency table and optional bar chart for one variable
#'
#' @param data A data frame.
#' @param input Name of the column to tabulate.
#' @param plot Whether to draw the bar chart. The chart is skipped, with a
#'   warning, for variables with 200 or more categories.
#' @param na.rm If FALSE, NA is counted as its own category.
#' @param path_out Directory in which the chart is saved as
#'   `<input>.jpeg`; when NA the chart is drawn on the current device instead.
#' @return A data frame sorted by decreasing frequency with the columns
#'   `<input>` (category as character), `frequency`, `percentage` and
#'   `cumulative_perc`.
freq_logic <- function(data, input, plot, na.rm, path_out)
{
  # Set only when the plot is skipped for a high-cardinality variable. A local
  # sentinel replaces exists("message_high_card"), which also searched the
  # enclosing environments and could pick up an unrelated global variable.
  message_high_card <- NULL

  if (!na.rm) {
    # exclude = NULL keeps the NA cases as a category of their own
    counts <- table(factor(data[[input]], exclude = NULL))
  } else {
    counts <- table(data[[input]])
  }

  tbl <- data.frame(counts)
  names(tbl) <- c("category", "frequency")
  # Most frequent first; order() is stable, so ties keep their table order
  tbl <- tbl[order(-tbl$frequency), , drop = FALSE]
  rownames(tbl) <- NULL

  tbl$percentage <- round(100 * tbl$frequency / sum(tbl$frequency), 2)
  tbl$cumulative_perc <- cumsum(tbl$percentage)
  # Rounding can leave the running total slightly off; pin the last row to 100
  tbl$cumulative_perc[length(tbl$cumulative_perc)] <- 100.00

  ## Pick font sizes that stay readable for the number of categories
  uq <- nrow(tbl)
  if (uq <= 10) {
    letter_size <- 3
    axis_size <- 12
  } else if (uq <= 20) {
    letter_size <- 2.5
    axis_size <- 10
  } else {
    letter_size <- 2
    axis_size <- 8
  }

  if (plot) {
    tbl_plot <- tbl
    tbl_plot$label <- sprintf('%s (%s%%)', tbl_plot$frequency, tbl_plot$percentage)
    # Order the levels by share so the bars are sorted after coord_flip()
    tbl_plot$category <- factor(tbl_plot$category, levels = tbl_plot$category[order(tbl_plot$percentage)])

    if (nrow(tbl_plot) < 200) {
      # Columns are referenced by name inside aes(); `tbl_plot$...` inside
      # aes() is discouraged by ggplot2 and triggers warnings.
      p <- ggplot(tbl_plot, aes(x = category, y = frequency, fill = category, label = label)) +
        geom_bar(stat = 'identity') + coord_flip() + theme_bw() +
        theme(
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          legend.title = element_blank(),
          plot.title = element_text(vjust = 2),
          axis.ticks.y = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.x = element_blank(),
          axis.text.y = element_text(size = axis_size),
          axis.title.x = element_text(size = 12, margin = margin(10, 0, 0, 0)),
          axis.title.y = element_text(size = 14, margin = margin(0, 10, 0, 0))
        ) + ylab("Frequency / (Percentage %)") + xlab(input) +
        geom_text(color = "#151515", size = letter_size, hjust = -.06) +
        # "none" replaces the deprecated `fill = F`
        guides(fill = "none") +
        scale_y_continuous(expand = c(0, 0), limits = c(0, max(tbl_plot$frequency) * 1.2))

      ## Save plot
      if (!is.na(path_out)) {
        dir.create(path_out, showWarnings = FALSE)

        if (dir.exists(path_out)) {
          jpeg(sprintf("%s/%s.jpeg", path_out, input), width = 12.25, height = 6.25, units = "in", res = 200, quality = 90)
          # Close the device even when rendering fails so it is not leaked
          tryCatch(plot(p), finally = dev.off())
        } else {
          warning(sprintf("The directory '%s' doesn't exists.", path_out))
        }
      } else {
        plot(p)
      }
    } else {
      message_high_card <- sprintf("Skipping plot for variable '%s' (more than 200 categories)", input)
    }
  }

  colnames(tbl)[1] <- input
  tbl[[input]] <- as.character(tbl[[input]])

  if (!is.null(message_high_card)) warning(message_high_card)

  return(tbl)
}

In [None]:
# Frequency table and bar chart for every categorical column.
freq(categoricalData)

## Numerical Feature Analysis

In [None]:
#' Keep only the numeric columns of a data frame
#'
#' @param df A data.frame or tibble.
#' @return `df` restricted to the columns for which `is.numeric()` is TRUE
#'   (integer and double), in their original order.
getNumericalDF <- function(df) {
  # Test each column directly. The previous `df[, name]` lookup returned a
  # one-column data frame for tibbles, which is never numeric, so every
  # column was dropped for such inputs.
  is_num <- vapply(df, is.numeric, logical(1))
  df[is_num]
}

In [None]:
# Keep only the numeric columns of the data.
numericalData <- getNumericalDF(data)

* **variable:** variable name

* **mean:** the well-known mean or average

* **std_dev:** standard deviation, a measure of dispersion or spread around the mean value. A value around 0 means almost no variation (so the variable behaves almost like a constant); at the other end, it is harder to say what counts as high, but the higher the standard deviation, the greater the spread. Chaos may look like an infinite standard deviation. The unit is the same as that of the mean, so the two can be compared.

* **variation_coef:** variation coefficient = std_dev / mean. Because std_dev is an absolute number, it is useful to have an indicator that expresses it in relative terms by comparing it against the mean. A value of 0.22 indicates that the std_dev is 22% of the mean. If it were close to 0, the variable would tend to be more concentrated around the mean. If we compare two classifiers, we may prefer the one with the lower std_dev and variation_coef on its accuracy.

* **p_01, p_05, p_25, p_50, p_75, p_95, p_99:** Percentiles at 1%, 5%, 25%, and so on. Later on in this chapter is a complete review about percentiles.

In [None]:
# Numeric profile (mean, std_dev, variation_coef, percentiles) per column.
profiling_num(numericalData)

### Numerical Feature Plotting (Histogram)

In [None]:
#' Histograms for every continuous feature
#'
#' Draws one histogram (30 bins) per continuous column, after dropping that
#' column's NA values, and arranges them with `grid.arrange()` in pages of at
#' most 16 plots.
#'
#' @param data A data.table, or any object `data.table()` can convert.
#' @param title Optional title placed above each page.
#' @param ggtheme Complete ggplot2 theme to apply. Defaults to
#'   `theme_linedraw()`, the look this function has always produced.
#'   Previously a hard-coded `theme_linedraw()` was added after `ggtheme` and
#'   silently replaced whatever theme was passed in.
#' @param theme_config Named list of arguments forwarded to `theme()`.
#' @param ... Further arguments forwarded to `geom_histogram()`.
#' @return Called for its side effect of drawing the plots.
plot_histogram <- function(data, title = NULL, ggtheme = theme_linedraw(), theme_config = list(), ...) {
  if (!is.data.table(data)) data <- data.table(data)
  ## Split once and reuse the result for both the check and the data
  split_data <- split_columns(data)
  ## Stop if no continuous features
  if (split_data$num_continuous == 0) stop("No Continuous Features")
  continuous <- split_data$continuous
  p <- ncol(continuous)
  ## Calculate number of pages (16 plots per page)
  pages <- ceiling(p / 16L)
  for (pg in seq_len(pages)) {
    ## Subset data by column
    subset_data <- continuous[, seq.int(16L * pg - 15L, min(p, 16L * pg)), with = FALSE]
    ## Syntactic names are required because aes_string() parses them
    setnames(subset_data, make.names(names(subset_data)))
    ## Number of grid columns needed for four rows: ceiling(ncol / 4)
    n_col <- (ncol(subset_data) + 3L) %/% 4L
    ## Create one ggplot object per feature
    plot_list <- lapply(
      seq_along(subset_data),
      function(j) {
        x <- na.omit(subset_data[, j, with = FALSE])
        ggplot(x, aes_string(x = names(x))) +
          geom_histogram(bins = 30L, ..., fill = '#92b7ef') +
          scale_x_continuous(labels = comma) +
          scale_y_continuous(labels = comma) +
          ylab("Frequency") +
          ggtheme +
          do.call(theme, theme_config)
      }
    )
    ## Print plot objects
    if (pages > 1) {
      suppressWarnings(do.call(grid.arrange, c(plot_list, ncol = n_col, nrow = 4L, top = title, bottom = paste("Page", pg))))
    } else {
      suppressWarnings(do.call(grid.arrange, c(plot_list, top = title)))
    }
  }
}

#' Flag the columns that contain nothing but missing values
#'
#' @param dt A data.table, or any object `data.table()` can convert.
#' @return A logical vector with one element per column, TRUE when every
#'   value of that column is NA. vapply() keeps the result a logical vector
#'   even for input without columns, where sapply() returned an empty list.
.getAllMissing <- function(dt) {
  if (!is.data.table(dt)) dt <- data.table(dt)
  vapply(dt, function(x) all(is.na(x)), logical(1))
}

#' Split a table into discrete and continuous features
#'
#' Columns that consist only of missing values are left out of both parts and
#' merely counted. Of the remaining columns, the numeric ones are returned as
#' continuous and all others as discrete.
#'
#' @param data A data.table, or any object `data.table()` can convert.
#' @return A list with the elements `discrete`, `continuous` (tables carrying
#'   the class of the original `data`), `num_discrete`, `num_continuous` and
#'   `num_all_missing`.
split_columns <- function(data) {
  ## Remember what was passed in so the result can carry the same class
  is_data_table <- is.data.table(data)
  data_class <- class(data)
  if (!is_data_table) data <- data.table(data)
  ## Positions, within `data`, of the columns holding at least one value
  all_missing_ind <- .getAllMissing(data)
  kept <- unname(which(!all_missing_ind))
  ## Which of the kept columns are numeric
  is_num <- vapply(
    data[, kept, with = FALSE],
    is.numeric,
    logical(1),
    USE.NAMES = FALSE
  )
  ## Count number of discrete, continuous and all-missing features
  n_all_missing <- sum(all_missing_ind)
  n_continuous <- sum(is_num)
  n_discrete <- ncol(data) - n_continuous - n_all_missing
  ## Map the flags back to positions in `data` before selecting. The flags are
  ## relative to the kept columns only; using them directly as positions in
  ## the full table picked the wrong columns whenever an all-missing column
  ## preceded other columns.
  continuous <- data[, kept[is_num], with = FALSE]
  discrete <- data[, kept[!is_num], with = FALSE]
  ## Set data class back to original
  if (!is_data_table) class(discrete) <- class(continuous) <- data_class
  list(
    "discrete" = discrete,
    "continuous" = continuous,
    "num_discrete" = n_discrete,
    "num_continuous" = n_continuous,
    "num_all_missing" = n_all_missing
  )
}

In [None]:
# Draw a histogram for every numeric column.
plot_histogram(numericalData)

### Numerical Feature Plotting (Density)

In [None]:
#' Density estimates for every continuous feature
#'
#' Draws one density plot per continuous column, after dropping that column's
#' NA values, and arranges them with `grid.arrange()` in pages of at most 16
#' plots.
#'
#' @param data A data.table, or any object `data.table()` can convert.
#' @param title Optional title placed above each page.
#' @param ggtheme Complete ggplot2 theme to apply. Defaults to
#'   `theme_linedraw()`, the look this function has always produced.
#'   Previously a hard-coded `theme_linedraw()` was added after `ggtheme` and
#'   silently replaced whatever theme was passed in.
#' @param theme_config Named list of arguments forwarded to `theme()`.
#' @param ... Further arguments forwarded to `geom_density()`.
#' @return Called for its side effect of drawing the plots.
plot_density <- function(data, title = NULL, ggtheme = theme_linedraw(), theme_config = list(), ...) {
  if (!is.data.table(data)) data <- data.table(data)
  ## Split once and reuse the result for both the check and the data
  split_data <- split_columns(data)
  ## Stop if no continuous features
  if (split_data$num_continuous == 0) stop("No Continuous Features")
  continuous <- split_data$continuous
  p <- ncol(continuous)
  ## Calculate number of pages (16 plots per page)
  pages <- ceiling(p / 16L)
  for (pg in seq_len(pages)) {
    ## Subset data by column
    subset_data <- continuous[, seq.int(16L * pg - 15L, min(p, 16L * pg)), with = FALSE]
    ## Syntactic names are required because aes_string() parses them
    setnames(subset_data, make.names(names(subset_data)))
    ## Number of grid columns needed for four rows: ceiling(ncol / 4)
    n_col <- (ncol(subset_data) + 3L) %/% 4L
    ## Create one ggplot object per feature
    plot_list <- lapply(
      seq_along(subset_data),
      function(j) {
        x <- na.omit(subset_data[, j, with = FALSE])
        ggplot(x, aes_string(x = names(x))) +
          geom_density(..., fill = "#e2c5e5") +
          scale_x_continuous(labels = comma) +
          scale_y_continuous(labels = percent) +
          ylab("Density") +
          ggtheme +
          do.call(theme, theme_config)
      }
    )
    ## Print plot objects
    if (pages > 1) {
      suppressWarnings(do.call(grid.arrange, c(plot_list, ncol = n_col, nrow = 4L, top = title, bottom = paste("Page", pg))))
    } else {
      suppressWarnings(do.call(grid.arrange, c(plot_list, top = title)))
    }
  }
}

In [None]:
# Draw a density plot for every numeric column.
plot_density(numericalData)