In [None]:
import shlex
import subprocess
import tempfile

In [None]:
# Source of the R script that performs the exploratory factor analysis.
# It is written to a temporary file and executed with Rscript by the cell
# below; the script loads the R packages R.utils and data.table.
# NOTE: this is a regular (non-raw) Python string, so backslash escapes in
# the R code are interpreted by Python before R ever sees them.
r_factor_analysis_code = """
#!/usr/bin/env Rscript

library(R.utils, quietly=TRUE, warn.conflicts=FALSE)
library(data.table)

factorCorrelations <- function(x) {
# Reproduces standard output of efa for saving to csv
# [based on http://tolstoy.newcastle.edu.au/R/devel/05/06/1414.html]
# x: a fitted factanal object that carries a rotation matrix (rotmat).
# Returns the factor correlation matrix as a data frame.
    tmat <- solve(x$rotmat)
    R <- tmat %*% t(tmat)
    rownames(R) <- colnames(R) <- paste0("Factor", seq_len(x$factors))
    return(data.frame(R))
}

factorVariance <- function(x) {
# Reproduces standard output of efa for saving to csv
# [based on http://tolstoy.newcastle.edu.au/R/devel/05/06/1414.html]
# x: the loadings of a fitted factanal object.
# Returns a matrix with the rows "SS loadings" and, unless x carries a
# "covariance" attribute, "Proportion Var" and (for more than one factor)
# "Cumulative Var".
    Lambda <- unclass(x)
    factors <- ncol(Lambda)
    p <- nrow(Lambda)
    vx <- colSums(Lambda^2)
    varex <- rbind("SS loadings" = vx)
    if (is.null(attr(x, "covariance"))) {
        varex <- rbind(varex, "Proportion Var" = vx/p)
        if (factors > 1) {
            varex <- rbind(varex, "Cumulative Var" = cumsum(vx/p))
        }
    }
    # return the table explicitly; relying on the value of the trailing
    # if-statement yields NULL whenever one of the branches is skipped
    return(varex)
}

factorResults <- function(train.efa, fn) {
# Runs factanal with promax rotation and collects the loadings, factor
# correlations and variance table. Returns an empty list if factanal fails.
    efa <- list()
    # a failing factanal is deliberately not fatal: the caller then writes
    # empty output files, which the report treats as "no results"
    tryCatch(efa$analysis <- factanal(train.efa, fn, rotation='promax'), error = function(e) e)
    if (typeof(efa$analysis) == 'list') {
        # drop = FALSE keeps the matrix shape when only one factor is requested
        efa$load <- data.frame(efa$analysis$loadings[, seq_len(fn), drop = FALSE])
        # sort rows in decreasing order by (at most) the first three factors
        sort.keys <- lapply(seq_len(min(3, ncol(efa$load))), function(i) -efa$load[[i]])
        efa$load <- efa$load[do.call(order, sort.keys), , drop = FALSE]
        efa$fcor <- factorCorrelations(efa$analysis)
        efa$fv <- factorVariance(efa$analysis$loadings)
    } else {
        efa <- list()
    }
    return(efa)
}

# a generic function to run exploratory factor analysis (using promax rotation)
# on the given data using the given number of factors, and save the results to disk
# (fig.dir is accepted for interface compatibility but is not used here)
analyzeFactors <- function(train.file, experiment.id, csv.dir, fig.dir, num.factors) {

    # read in the training data into a data frame
    train <- fread(train.file, header=TRUE, stringsAsFactors = FALSE)
    train <- data.frame(train)

    # get all the columns except the id and score columns which
    # we do not want as features
    train.feats <- train[!colnames(train) %in% c('spkitemid', 'sc1')]

    # compute the factor analysis on the data
    efa <- factorResults(train.feats, num.factors)

    # define the output names
    factor_loadings_file <- paste0(csv.dir, '/', experiment.id, '_efa', num.factors, '.csv')
    factor_corr_file <- paste0(csv.dir, '/', experiment.id, '_efa', num.factors, 'fcor.csv')
    factor_variance_file <- paste0(csv.dir, '/', experiment.id, '_efa', num.factors, 'fv.csv')

    # write out the various parts of the factor analysis
    write.csv(efa$load, file=factor_loadings_file)
    write.csv(efa$fcor, file=factor_corr_file)
    write.csv(efa$fv, file=factor_variance_file)
}

args <- R.utils::commandArgs(asValues=TRUE, adhoc=TRUE, trailingOnly=TRUE)

if (!('infile' %in% names(args) && 'expid' %in% names(args) && 'outcsv' %in% names(args) && 'outfig' %in% names(args) && 'nf' %in% names(args))) {
    stop('You need to specify (a) the CSV file containing the features, (b) the experiment id to be used as a prefix to model files, (c) a directory to save the textual outputs, (d) a directory to save figures, and (e) the number of factors. \n Usage: analyzeFactors.R -infile train_file -expid experiment_id -outcsv csv_dir -outfig fig_dir -nf num_factors. \n ')
    }

analyzeFactors(args$infile, args$expid, args$outcsv, args$outfig, args$nf)

"""

In [None]:
# Run the exploratory factor analysis R script on the preprocessed training
# features and render its three CSV outputs (loadings, variance table and
# factor correlations) in the report. The section is skipped when the
# preprocessed features file does not exist.

# check to make sure that the preprocessed train features file exists
efa_input_file = join(output_dir, '{}_train_preprocessed_features.csv'.format(experiment_id))
skip_section = not exists(efa_input_file)
if skip_section:
    display(Markdown("Section skipped because pre-processed training features were not available."))
else:
    display(Markdown('## Factor analysis'))

    # write out the above R code into a temporary script; delete=False keeps
    # the file on disk after it is closed so that Rscript can read it
    with tempfile.NamedTemporaryFile(delete=False, mode='w') as temp_r_script:
        temp_r_script.write(r_factor_analysis_code)
    r_script_path = temp_r_script.name

    # Perform exploratory factor analysis on the features
    # contained in the input file using the given number
    # of factors for e-rater datasets.
    num_factors = 3

    # build the argument vector directly instead of formatting and re-splitting
    # a string, so that paths containing spaces or backslashes survive intact
    cmd = ['Rscript', r_script_path,
           '-infile', efa_input_file,
           '-expid', str(experiment_id),
           '-outcsv', output_dir,
           '-outfig', figure_dir,
           '-nf', str(num_factors)]

    try:
        pipe = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        stdout, stderr = pipe.communicate()
    finally:
        # the script is only needed while R runs; delete it even if
        # launching Rscript fails so that no temporary file is leaked
        os.unlink(r_script_path)

    # raise an exception if the R command didn't work
    stderr_text = stderr.decode('utf-8', errors='replace')
    m = re.search(r'Error in .*', stderr_text, flags=re.S)
    if m:
        r_error = m.group()
        raise Exception('R encountered an error during '
                        'factor analysis: "\n{}"'.format(r_error))
    if pipe.returncode != 0:
        # covers failures that R reports without an "Error in" prefix (e.g.
        # top-level stop() calls); without this check stale output files from
        # an earlier run could be read below
        raise Exception('Rscript exited with status {} during '
                        'factor analysis: "\n{}"'.format(pipe.returncode, stderr_text))

    # the R script names its outputs after the experiment id and number of factors
    efa_prefix = '{}_efa{}'.format(experiment_id, num_factors)
    df_efa3 = pd.read_csv(join(output_dir, '{}.csv'.format(efa_prefix)), index_col=0)
    df_efa3fv = pd.read_csv(join(output_dir, '{}fv.csv'.format(efa_prefix)), index_col=0)
    df_efa3fcor = pd.read_csv(join(output_dir, '{}fcor.csv'.format(efa_prefix)), index_col=0)

    # the R script writes empty files when the factor analysis itself fails
    if df_efa3.empty:
        display(Markdown('The factor analysis did not yield any results. You may have too few features.'))
    else:
        markdown_str = """The following tables show the output of exploratory factor analysis on the training data. The analysis uses maximum likelihood with 
        three factors and oblique rotation (promax), and are computed after standardization and outlier truncation."""
        display(Markdown(markdown_str))
        display(Markdown('#### Factor loadings'))
        display(Markdown('*Note*: loadings with absolute value > 0.1 are marked in bold.'))
        formatter = partial(bold_highlighter, high=0.1, absolute=True)
        display(HTML(df_efa3.to_html(classes=['sortable'], index=True, float_format=formatter, escape=False)))
        display(Markdown('#### Eigenvalues and cumulative variance accounted for by each factor'))
        display(HTML(df_efa3fv.to_html(classes=['sortable'], index=True, float_format=float_format_func)))
        display(Markdown('#### Factor correlations'))
        display(HTML(df_efa3fcor.to_html(classes=['sortable'], index=True, float_format=float_format_func)))