In [1]:
library(tidyverse)
library(magrittr)
library(broom)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.8
✔ tidyr   0.8.2     ✔ stringr 1.3.1
✔ readr   1.2.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract



In [388]:
# Restore previously saved workspace objects from the project's .Rda files.
# NOTE(review): which objects each file defines is not visible here; later
# cells use `df`, `thetas`, `liwcs`, `docuscope_data` and `integ_complx`,
# presumably created by these loads -- confirm against the files.
load('../data/artsengagement.Rda')
load('../data/open_response_data/docuscope_data.Rda')
load('../data/open_response_data/integ_complx.Rda')
load('../data/open_response_data/labeled_topic_prevs.Rda')
load('../data/open_response_data/liwcs.Rda')
# List the objects now present in the workspace.
ls()

In [389]:
# Load the tidied open-response questions.
# NOTE(review): presumably defines `questions` and `question_names`, which the
# next cell reads -- confirm.
load('../data/tidy_questions_best.Rda')

In [390]:
# Keep only the questions that have a topic table in `thetas`, and label each
# kept element with its own entry of `question_names`.
# The labels come from the same mask that selects the elements, so names and
# contents stay paired even when `thetas` is ordered differently from
# `question_names` (copying names(thetas) positionally would mislabel them).
has_thetas <- question_names %in% names(thetas)
questions <- questions[has_thetas]
names(questions) <- question_names[has_thetas]

---

In [391]:
# Fixing and updating topic names
# Each vector below replaces the column names of one topic table positionally:
# the first entry is always the join column 'key', the remaining entries are
# human-readable topic labels, so their order must match the column order.
# NOTE(review): a later cell reloads labeled_topic_prevs.Rda; presumably that
# file already contains these labels (see the commented-out save()) -- confirm.
names(thetas[['barriers']]) <- c('key', 'Competitive Auditions', 'Time_1', 'Cost', 'University Priorities', 'Distance Between Campuses', 'Accessibility', 'Departmental Segregation', 'Low Confidence', 'Unwelcoming Environment', 'Lack of Familiarity', 'No Barriers, Except...', 'Lack of Personal Interest', 'No Barriers', 'University Program Requirements', 'Fit With Major and Course Scarcity', 'Ennui', 'Awareness', 'Time_2', 'Course Load')
names(thetas[['feel']]) <- c('key', 'Culturally Aware', 'Entertained', 'Regretful', 'Wishful', 'Part of Something Bigger', 'Less Stressed and More Expressive', 'Grateful', 'Connected to Others through the Arts', 'Happy', 'Proud', 'Gained Perspective and Appreciation', 'Accomplished & Creative', 'Refreshed', 'Well Rounded')
names(thetas[['sr_development']]) <- c('key', 'Played a Role', 'Gained Skills', 'Became Part of Identity', 'New Social Bonds', 'Cultural Understanding', 'Strengthened Social Bonds', 'Deeper Appreciation', 'Finding Balance in Life', 'Supporting Friends', 'Didn\'t Play a Role')
# Only the second column of this table is renamed.
names(thetas[['behavior2']])[2] <- 'Better Time Management'
names(thetas[['sr_othergrowth']]) <- c('key', 'More Open Minded', 'Personal Skills & Development', 'Self Exploration', 'Connect with People in New Ways', 'Exposure to New Experiences', 'Cultural Appreciation', 'Spiritually, Emotionally, Mentally, Physically', 'More Self-Aware', 'Creativity, Expression and Communication', 'Understanding and Accepting Others', 'Expanded World View')
names(thetas[['sr_role']]) <- c('key', 'New People & New Skills', 'Influenced Career Path', 'More Involved', 'Entertainment', 'Unable to Participate Much', 'Personal Growth', 'Break from Classes', 'Personal Enrichment', 'Fun Social Events', 'Small Role', 'Learning', 'Broadened World View', 'Meet New Friends')
# The last two tables are addressed by position (7 and 8) rather than by name.
names(thetas[[7]]) <- c('key', 'Unknown / None', 'Lifelong Participation', 'Attending Events 1', 'In Free Time', 'Attending Events 2', 'Misc. Participation', 'Taking Classes 1', 'Attending Events 3', 'Grad School', 'Through a Career', 'Creating', 'Through Work', 'Unsure but Hopeful', 'Community Groups', 'Through Life', 'Playing Instruments', 'Encouraging Children', 'Taking Classes 2')
names(thetas[[8]]) <- c('key', 'No Impact 1', 'Career Tradeoff Concerns', 'Art as a Career', 'Large Role', 'Undecided on Career', 'No Impact 2', 'Want Art to be Part of Future', 'Some Impact', 'Small Role', 'Looking for Creative Expression in Non-Artistic Career')



In [392]:
#save(x=thetas, file='../data/open_response_data/labeled_topic_prevs.Rda')

In [393]:
# Reload the labelled topic prevalences from disk.
# NOTE(review): presumably this defines `thetas` and therefore replaces the
# in-memory copy -- confirm.
load('../data/open_response_data/labeled_topic_prevs.Rda')

----

Rename the theta variables and remove punctuation from their names

In [394]:
# Normalise the column names of every topic table in `thetas`:
#   1. drop '.', '/' and apostrophes and spell '&' as 'and';
#   2. collapse double spaces (single pass), then turn spaces into underscores;
#   3. prefix each name with '<question>..topic..';
#   4. reset the first column's name to the plain join key 'key'.
for (i in seq_along(thetas)) {
    names(thetas[[i]]) <- names(thetas[[i]]) %>%
        gsub(pattern = '[.]', replacement = '') %>%
        gsub(pattern = "&", replacement = 'and') %>%
        gsub(pattern = "/", replacement = '') %>%
        gsub(pattern = "'", replacement = '') %>%
        gsub(pattern = "  ", replacement = ' ') %>%
        gsub(pattern = " ", replacement = '_') %>%
        vapply(
            function(col) paste(c(names(thetas)[i], 'topic', col), collapse = '..'),
            character(1),
            USE.NAMES = FALSE
        ) %>%
        replace(1, 'key')
}

`df$key[!(df$key %in% thetas[[1]]$key)] %>% length == thetas[[1]] %>% nrow`

If this is FALSE, the question was asked in multiple years and the responses were merged for stm. In that case:
1. Remove from the tidy questions the documents that stm removed,
1. Merge the tidy questions with theta,
1. Find the first and last response (in df order) of each year to mark the year boundaries,
1. Using those boundaries, split the responses into years by row index

In [395]:
library(stm)
# Load the saved STM settings.
# NOTE(review): presumably defines `stm_settings`, which later cells index per
# question -- confirm.
load('../data/stm_settings.Rda')
# Load the tidied questions again.
# NOTE(review): presumably this replaces the filtered in-memory `questions`
# with the full list -- confirm.
load('../data/tidy_questions_best.Rda')

In [396]:
# Label the elements of `questions` with `question_names` (paired by position).
names(questions) <- question_names

In [397]:
# Collect the topic tables of "multi-year" questions: a question counts as
# multi-year when more than one character column of `df` has a name ending
# with the question name.
#
# The selection is done with a logical mask so every kept table keeps its own
# name. (Appending to the list and naming the new element at index `i` of
# `thetas` only works when the multi-year questions are the leading elements
# of `thetas`; otherwise the names are misplaced or the assignment errors.)
char_cols <- df[vapply(df, is.character, logical(1))]
n_matching_cols <- vapply(
    names(thetas),
    function(question) ncol(select(char_cols, ends_with(question))),
    integer(1)
)
mthetas <- thetas[n_matching_cols > 1]

In [398]:
# Show which questions were identified as multi-year.
names(mthetas)

----

- df
- find the year cutoffs from the number of responses in each year
- remove responses, in this order:
    1. responses missing from the questions df
    1. procs\$removed
    1. docs\$removed
    - update the year cutoffs by keeping track of how many docs are removed from each year
- split according to the year cutoffs
- merge(..., by="key", all=T), once per year
- done

---
loop below

In [399]:
# Shift cumulative year boundaries after rows have been dropped.
#
# Arguments:
#   yr_parts  numeric vector of cumulative row counts; year i occupies rows
#             (yr_parts[i] + 1):yr_parts[i + 1] of the stacked responses.
#   idxs      1-based row indices that are being removed (may be empty).
#
# Returns yr_parts with every boundary lowered by the number of removed rows
# at or before it. A removed row that sits exactly on a boundary is the last
# row of its year, so it must shrink that boundary too -- hence `<=`, not `<`.
update_parts <- function(yr_parts, idxs) {
    removed_up_to <- vapply(
        yr_parts,
        function(boundary) sum(idxs <= boundary),
        integer(1)
    )
    yr_parts - removed_up_to
}

In [400]:
# Drop the rows at positions `idxs` from a data frame.
#
# Arguments:
#   df_qs  data frame (or tibble) of stacked responses.
#   idxs   1-based row indices to remove (may be empty).
#
# Returns df_qs without those rows, always as a data frame.
remove_idxs <- function(df_qs, idxs) {
    # x[-integer(0), ] selects *no* rows, so an empty removal set must be a
    # no-op rather than wiping the table.
    if (length(idxs) == 0) {
        return(df_qs)
    }
    # drop = FALSE keeps a one-column data.frame from collapsing to a vector.
    df_qs[-idxs, , drop = FALSE]
}

----


In [402]:
# Restrict each per-question list to the multi-year questions, arranged in the
# same order as `mthetas`.
# The loops further down pair these lists with results of the topic loop
# (`all_yr_parts[[q]]`, `all_to_remove[[q]]`) and with `stm_settings[[q]]`
# purely by position, so selecting by name in `mthetas` order keeps element q
# of every list on the same question. When a list already follows that order
# the result is identical to a plain `%in%` filter.
# NOTE(review): if a list lacks one of the multi-year questions, positions
# still shift -- the loops' 'error!' checks are the only guard.
liwcs <- liwcs[names(mthetas)[names(mthetas) %in% names(liwcs)]]
docuscope_data <- docuscope_data[names(mthetas)[names(mthetas) %in% names(docuscope_data)]]
integ_complx <- integ_complx[names(mthetas)[names(mthetas) %in% names(integ_complx)]]
stm_settings <- stm_settings[names(mthetas)[names(mthetas) %in% names(stm_settings)]]

### MThetas

In [403]:
# Split the topic prevalences of every multi-year question into one table per
# year column of df. Accumulators filled by the loop:
#   to_merge       list of per-year data frames, merged into df in a later cell
#   all_to_remove  per question: list(to_remove, procs$docs.removed, docs$docs.removed)
#   all_yr_parts   per question: cumulative year boundaries *before* any removal
to_merge <- c()
all_to_remove <- c()
all_yr_parts <- c()
for(q in seq(mthetas)) {
    # character columns of df whose names end with this question's name
    # (presumably one column per survey year -- confirm)
    q_yr_names <- select(df[sapply(df,is.character)], ends_with(names(mthetas)[q])) %>% names

    # cumulative counts of non-NA responses per column, starting at 0:
    # rows (yr_parts[m]+1):yr_parts[m+1] of the stacked table belong to column m
    yr_parts <- c(0)
    for(m in seq(q_yr_names)) {
        yr_parts %<>% c(yr_parts[length(yr_parts)] + length(na.omit(df[[q_yr_names[m]]])))
    }
    all_yr_parts %<>% c(list(yr_parts))

    # stack the non-missing (response, key) pairs of every year column, in column order
    # NOTE(review): 2:length(...) assumes at least two year columns
    df_qs <- df %>% select(q_yr_names[1], key) %>% na.omit() %>% rename(var=q_yr_names[1])
    for(o in 2:length(q_yr_names)) {
        df_qs %<>% bind_rows(df %>% select(q_yr_names[o], key) %>% na.omit() %>% rename(var=q_yr_names[o]))
    }
    # one "<key>_ _<response>" string per stacked row
    c_rsps <- c() # combined responses (w key) (unique unless same response between years)
    for(o in seq(nrow(df_qs))) {
       c_rsps %<>% c(paste0(df_qs[[o,2]], '_ _', df_qs[[o,1]]))
    }

    # the same "<column 2>_ _<column 1>" string for every row of this question's `questions` table
    r_rsps <- c()
    for(o in seq(nrow(questions[[names(mthetas)[q]]]))) {
       r_rsps %<>% c(paste0(questions[[names(mthetas)[q]]][[o,2]], '_ _', questions[[names(mthetas)[q]]][[o,1]]))
    }
    # row positions of stacked responses that have no counterpart in `questions`
    #which(c_rsps %in% c_rsps[!(c_rsps %in% r_rsps)])
    to_remove <- which(c_rsps %in% c_rsps[!(c_rsps %in% r_rsps)])

    # inspect missing data
    #temp <- questions[[names(mthetas)[q]]]
    #names(temp) <- names(df_qs)
    #setdiff(df_qs, temp)
    #---
    # in bound rows df of barriers, but not in questions (exact same code used to make both) (updated df since?)
    ## df_qs[!(df_qs$var %in% questions[[names(mthetas)[q]]][[1]]),] %>% rownames %>% as.numeric
    ## doesnt account for key

    # drop those rows and shift the year boundaries accordingly
    if(length(to_remove) > 0) {
        yr_parts %<>% update_parts(to_remove)
        df_qs %<>% remove_idxs(to_remove)
    }

    #---
    # re-run the stm preprocessing with the saved settings to learn which
    # documents it drops; `i` is the index into `questions` stored in the settings
    # NOTE(review): `i` is reused as the loop variable of the per-year loop below
    i <- stm_settings[[q]]$i
    lower <- stm_settings[[q]]$lowerThresh
    customstops <- stm_settings[[q]]$customStops
    procs <- textProcessor(documents = questions[[i]][[1]], 
                            metadata = questions[[i]][2],
                            verbose=F,
                            customstopwords = customstops
                           )
    docs <- prepDocuments(documents = procs$documents,
                          vocab = procs$vocab,
                          meta = procs$meta,
                          lower.thresh = lower,
                         verbose=F)
    
    # for q > 3 the removed documents are recomputed by key: every stacked row
    # whose key is absent from the processed metadata
    if(q > 3) {# after 3, no overlapping responses (and probs ran into on 4)
        procs$docs.removed <- which(!(df_qs$key %in% procs$meta$key))
    }
    
    # drop the documents removed by textProcessor()
    if(length(procs$docs.removed) > 0) {
        yr_parts %<>% update_parts(procs$docs.removed)
        df_qs %<>% remove_idxs(procs$docs.removed)
    }
    
    if(q > 3)
        docs$docs.removed <- which(!(df_qs$key %in% docs$meta$key))
    
    # drop the documents removed by prepDocuments()
    if(length(docs$docs.removed) > 0) {
        yr_parts %<>% update_parts(docs$docs.removed)
        df_qs %<>% remove_idxs(docs$docs.removed)
    }
    
    # sanity check: same keys and row count as the topic table, and the last
    # boundary equals the row count; on failure print the question index and stop looping
    if (setdiff(df_qs$key, mthetas[[q]]$key) %>% length != 0 ||
    nrow(df_qs) != nrow(mthetas[[q]]) || yr_parts[length(yr_parts)] != nrow(df_qs)) {
        print('error!')
        print(q)
        break
    }

    # attach the topic columns: side by side (row order) for the first three
    # questions, joined on key for the rest
    # NOTE(review): merge() sorts its result on the `by` column by default, so
    # slicing by yr_parts afterwards presumably relies on keys sorting by year -- confirm
    if(q <= 3) {
        df_qs <- bind_cols(df_qs %>% select(-var), mthetas[[q]] %>% select(-key))
    } else {
        df_qs <- merge(df_qs %>% select(-var), mthetas[[q]], by='key', all=T)
    }

    # cut the table into one piece per year column and prefix every column name
    # with the first three characters of that year column's name (skipped for
    # q == 3, i == 1); the first column is always renamed back to 'key'
    # NOTE(review): (a+1):b counts downwards if a year has no rows left
    for(i in seq(q_yr_names)) {
        temp <- df_qs[(yr_parts[i]+1):yr_parts[i+1], ]
        if (!(q == 3 & i == 1))
            names(temp) %<>% sapply(function(x)(paste0(substr(q_yr_names[i],1,3), x)))
        names(temp)[1] <- 'key'
        rownames(temp) <- c()
        to_merge %<>% c(list(temp))
        #df <- merge(df, temp, by='key', all=T) # messes up subsequent q/mtheta loop iterations
    }                
    # remember the three removal sets of this question for the later loops
    all_to_remove %<>% c(list((c(list(to_remove),
                             list(procs$docs.removed),
                             list(docs$docs.removed)))))  
}

----
### liwcs

In [404]:
# renaming
# Normalise the column names of every LIWC table:
#   1. drop '.', '/' and apostrophes and spell '&' as 'and';
#   2. collapse double spaces (single pass), then turn spaces into underscores;
#   3. prefix each name with '<question>..LIWC..';
#   4. reset the first column's name to the plain join key 'key'.
for (i in seq_along(liwcs)) {
    names(liwcs[[i]]) <- names(liwcs[[i]]) %>%
        gsub(pattern = '[.]', replacement = '') %>%
        gsub(pattern = "&", replacement = 'and') %>%
        gsub(pattern = "/", replacement = '') %>%
        gsub(pattern = "'", replacement = '') %>%
        gsub(pattern = "  ", replacement = ' ') %>%
        gsub(pattern = " ", replacement = '_') %>%
        vapply(
            function(col) paste(c(names(liwcs)[i], 'LIWC', col), collapse = '..'),
            character(1),
            USE.NAMES = FALSE
        ) %>%
        replace(1, 'key')
}

In [405]:
# Split the LIWC scores of every multi-year question into one table per year
# column of df and queue the pieces in `to_merge`.
# Relies on `all_yr_parts` / `all_to_remove` from the topic loop, paired with
# `liwcs` by position.
for (q in seq_along(liwcs)) {
    # character columns of df whose names end with this question's name
    # (presumably one column per survey year -- confirm)
    q_yr_names <- names(select(df[vapply(df, is.character, logical(1))],
                               ends_with(names(liwcs)[q])))

    # stack the non-missing (response, key) pairs of every year column
    df_qs <- df %>% select(q_yr_names[1], key) %>% na.omit() %>% rename(var = q_yr_names[1])
    # seq_along(...)[-1] is empty for a single column, unlike 2:length(...)
    for (o in seq_along(q_yr_names)[-1]) {
        df_qs <- bind_rows(
            df_qs,
            df %>% select(q_yr_names[o], key) %>% na.omit() %>% rename(var = q_yr_names[o])
        )
    }

    # start from the unreduced year boundaries and drop only the rows that were
    # missing from `questions` (first removal set of the topic loop)
    yr_parts <- all_yr_parts[[q]]
    to_remove <- all_to_remove[[q]][[1]]
    if (length(to_remove) > 0) {
        yr_parts <- update_parts(yr_parts, to_remove)
        df_qs <- remove_idxs(df_qs, to_remove)
    }

    # sanity check: same keys and row count as the LIWC table, and the last
    # boundary equals the row count; on failure report the question and stop
    if (length(setdiff(df_qs$key, liwcs[[q]]$key)) != 0 ||
        nrow(df_qs) != nrow(liwcs[[q]]) ||
        yr_parts[length(yr_parts)] != nrow(df_qs)) {
        print('error!')
        print(q)
        break
    }

    # attach the LIWC columns: side by side for the first three questions,
    # joined on key for the rest (hard-coded split; reason not visible here)
    # NOTE(review): merge() sorts its result on `key` by default, so slicing by
    # yr_parts afterwards presumably relies on keys sorting by year -- confirm
    if (q <= 3) {
        df_qs <- bind_cols(select(df_qs, -var), select(liwcs[[q]], -key))
    } else {
        df_qs <- merge(select(df_qs, -var), liwcs[[q]], by = 'key', all = TRUE)
    }

    for (i in seq_along(q_yr_names)) {
        # seq_len() gives an empty range when a year has no rows left, whereas
        # (a + 1):b would count downwards and pick rows of other years
        yr_rows <- yr_parts[i] + seq_len(yr_parts[i + 1] - yr_parts[i])
        temp <- df_qs[yr_rows, ]
        # prefix columns with the first three characters of the year column
        # name (skipped for q == 3, i == 1)
        if (!(q == 3 && i == 1)) {
            names(temp) <- paste0(substr(q_yr_names[i], 1, 3), names(temp))
        }
        names(temp)[1] <- 'key'
        rownames(temp) <- NULL
        to_merge <- c(to_merge, list(temp))
        #df <- merge(df, temp, by='key', all=T) # messes up subsequent q/mtheta loop iterations
    }
}

----
### docuscope_data

In [406]:
# Normalise the column names of every DocuScope table:
#   1. drop '.', '/' and apostrophes and spell '&' as 'and';
#   2. collapse double spaces (single pass), then turn spaces into underscores;
#   3. prefix each name with '<question>..docuscope..';
#   4. reset the first column's name to the plain join key 'key'.
for (i in seq_along(docuscope_data)) {
    names(docuscope_data[[i]]) <- names(docuscope_data[[i]]) %>%
        gsub(pattern = '[.]', replacement = '') %>%
        gsub(pattern = "&", replacement = 'and') %>%
        gsub(pattern = "/", replacement = '') %>%
        gsub(pattern = "'", replacement = '') %>%
        gsub(pattern = "  ", replacement = ' ') %>%
        gsub(pattern = " ", replacement = '_') %>%
        vapply(
            function(col) paste(c(names(docuscope_data)[i], 'docuscope', col), collapse = '..'),
            character(1),
            USE.NAMES = FALSE
        ) %>%
        replace(1, 'key')
}

In [407]:
# Split the DocuScope scores of every multi-year question into one table per
# year column of df and queue the pieces in `to_merge`.
# Relies on `all_yr_parts` / `all_to_remove` from the topic loop, paired with
# `docuscope_data` by position.
for (q in seq_along(docuscope_data)) {
    # character columns of df whose names end with this question's name
    # (presumably one column per survey year -- confirm)
    q_yr_names <- names(select(df[vapply(df, is.character, logical(1))],
                               ends_with(names(docuscope_data)[q])))

    # stack the non-missing (response, key) pairs of every year column
    df_qs <- df %>% select(q_yr_names[1], key) %>% na.omit() %>% rename(var = q_yr_names[1])
    # seq_along(...)[-1] is empty for a single column, unlike 2:length(...)
    for (o in seq_along(q_yr_names)[-1]) {
        df_qs <- bind_rows(
            df_qs,
            df %>% select(q_yr_names[o], key) %>% na.omit() %>% rename(var = q_yr_names[o])
        )
    }

    # start from the unreduced year boundaries and drop only the rows that were
    # missing from `questions` (first removal set of the topic loop)
    yr_parts <- all_yr_parts[[q]]
    to_remove <- all_to_remove[[q]][[1]]
    if (length(to_remove) > 0) {
        yr_parts <- update_parts(yr_parts, to_remove)
        df_qs <- remove_idxs(df_qs, to_remove)
    }

    # sanity check: same keys and row count as the DocuScope table, and the
    # last boundary equals the row count; on failure report the question and stop
    if (length(setdiff(df_qs$key, docuscope_data[[q]]$key)) != 0 ||
        nrow(df_qs) != nrow(docuscope_data[[q]]) ||
        yr_parts[length(yr_parts)] != nrow(df_qs)) {
        print('error!')
        print(q)
        break
    }

    # attach the DocuScope columns: side by side for the first three questions,
    # joined on key for the rest (hard-coded split; reason not visible here)
    # NOTE(review): merge() sorts its result on `key` by default, so slicing by
    # yr_parts afterwards presumably relies on keys sorting by year -- confirm
    if (q <= 3) {
        df_qs <- bind_cols(select(df_qs, -var), select(docuscope_data[[q]], -key))
    } else {
        df_qs <- merge(select(df_qs, -var), docuscope_data[[q]], by = 'key', all = TRUE)
    }

    for (i in seq_along(q_yr_names)) {
        # seq_len() gives an empty range when a year has no rows left, whereas
        # (a + 1):b would count downwards and pick rows of other years
        yr_rows <- yr_parts[i] + seq_len(yr_parts[i + 1] - yr_parts[i])
        temp <- df_qs[yr_rows, ]
        # prefix columns with the first three characters of the year column
        # name (skipped for q == 3, i == 1)
        if (!(q == 3 && i == 1)) {
            names(temp) <- paste0(substr(q_yr_names[i], 1, 3), names(temp))
        }
        names(temp)[1] <- 'key'
        rownames(temp) <- NULL
        to_merge <- c(to_merge, list(temp))
        #df <- merge(df, temp, by='key', all=T) # messes up subsequent q/mtheta loop iterations
    }
}

----
### AIC

In [408]:
# Normalise the column names of every integrative-complexity table:
#   1. drop '.', '/' and apostrophes and spell '&' as 'and';
#   2. collapse double spaces (single pass), then turn spaces into underscores;
#   3. prefix each name with '<question>..AIC..';
#   4. reset the first column's name to the plain join key 'key'.
for (i in seq_along(integ_complx)) {
    names(integ_complx[[i]]) <- names(integ_complx[[i]]) %>%
        gsub(pattern = '[.]', replacement = '') %>%
        gsub(pattern = "&", replacement = 'and') %>%
        gsub(pattern = "/", replacement = '') %>%
        gsub(pattern = "'", replacement = '') %>%
        gsub(pattern = "  ", replacement = ' ') %>%
        gsub(pattern = " ", replacement = '_') %>%
        vapply(
            function(col) paste(c(names(integ_complx)[i], 'AIC', col), collapse = '..'),
            character(1),
            USE.NAMES = FALSE
        ) %>%
        replace(1, 'key')
}

In [409]:
# Split the integrative-complexity scores of every multi-year question into
# one table per year column of df and queue the pieces in `to_merge`.
# Relies on `all_yr_parts` / `all_to_remove` from the topic loop, paired with
# `integ_complx` by position.
for (q in seq_along(integ_complx)) {
    # character columns of df whose names end with this question's name
    # (presumably one column per survey year -- confirm)
    q_yr_names <- names(select(df[vapply(df, is.character, logical(1))],
                               ends_with(names(integ_complx)[q])))

    # stack the non-missing (response, key) pairs of every year column
    df_qs <- df %>% select(q_yr_names[1], key) %>% na.omit() %>% rename(var = q_yr_names[1])
    # seq_along(...)[-1] is empty for a single column, unlike 2:length(...)
    for (o in seq_along(q_yr_names)[-1]) {
        df_qs <- bind_rows(
            df_qs,
            df %>% select(q_yr_names[o], key) %>% na.omit() %>% rename(var = q_yr_names[o])
        )
    }

    # start from the unreduced year boundaries and the rows that were missing
    # from `questions` (first removal set of the topic loop)
    yr_parts <- all_yr_parts[[q]]
    to_remove <- all_to_remove[[q]][[1]]

    # for the first question the removal set is instead every stacked row whose
    # key is absent from the integrative-complexity table
    if (q == 1) { # not sure whats going on here, but I checked the data and it aligns
        to_remove <- which(df_qs$key %in% setdiff(df_qs$key, integ_complx[[q]]$key))
    }

    if (length(to_remove) > 0) {
        yr_parts <- update_parts(yr_parts, to_remove)
        df_qs <- remove_idxs(df_qs, to_remove)
    }

    # sanity check: same keys and row count as the score table, and the last
    # boundary equals the row count; on failure report the question and stop
    if (length(setdiff(df_qs$key, integ_complx[[q]]$key)) != 0 ||
        nrow(df_qs) != nrow(integ_complx[[q]]) ||
        yr_parts[length(yr_parts)] != nrow(df_qs)) {
        print('error!')
        print(q)
        break
    }

    # attach the score columns: side by side for the first three questions,
    # joined on key for the rest (hard-coded split; reason not visible here)
    # NOTE(review): merge() sorts its result on `key` by default, so slicing by
    # yr_parts afterwards presumably relies on keys sorting by year -- confirm
    if (q <= 3) {
        df_qs <- bind_cols(select(df_qs, -var), select(integ_complx[[q]], -key))
    } else {
        df_qs <- merge(select(df_qs, -var), integ_complx[[q]], by = 'key', all = TRUE)
    }

    for (i in seq_along(q_yr_names)) {
        # seq_len() gives an empty range when a year has no rows left, whereas
        # (a + 1):b would count downwards and pick rows of other years
        yr_rows <- yr_parts[i] + seq_len(yr_parts[i + 1] - yr_parts[i])
        temp <- df_qs[yr_rows, ]
        # prefix columns with the first three characters of the year column
        # name (skipped for q == 3, i == 1)
        if (!(q == 3 && i == 1)) {
            names(temp) <- paste0(substr(q_yr_names[i], 1, 3), names(temp))
        }
        names(temp)[1] <- 'key'
        rownames(temp) <- NULL
        to_merge <- c(to_merge, list(temp))
        #df <- merge(df, temp, by='key', all=T) # messes up subsequent q/mtheta loop iterations
    }
}

# Merging back into df

In [410]:
# Fold every queued per-year table into a copy of df with a full outer join on
# `key`. A join that increases the row count means the piece introduced keys
# that were not present before; in that case print the index of the offending
# piece and stop merging.
merged_df <- df
for (i in seq_along(to_merge)) {
    temp_row <- nrow(merged_df)
    merged_df <- merge(merged_df, to_merge[[i]], by = 'key', all = TRUE)
    if (nrow(merged_df) > temp_row) {
        print(i)
        break
    }
}

In [412]:
# Read the "nearly complete" arts-engagement table from CSV.
# NOTE(review): the next cell takes from it only the columns that df lacks --
# presumably variables derived elsewhere; confirm what the file contains.
nearly_df <- read.csv('../data/nearly_complete_artsengagement.csv')

In [413]:
# Add the columns that exist in nearly_df but not in df, joined on `key` with
# a full outer join. The first of those extra columns is skipped via [-1].
# NOTE(review): presumably that first column is the row-name column written by
# write.csv -- confirm.
merged_df <- merge(
    merged_df,
    select(nearly_df, setdiff(names(nearly_df), names(df))[-1], key),
    by = 'key',
    all = TRUE
)

In [441]:
# Put the rows back in the order in which their keys appear in df; keys that
# do not occur in df get an NA rank and are therefore sorted to the end.
key_rank <- match(merged_df$key, df$key)
merged_df <- merged_df[order(key_rank), ]

In [447]:
# Discard the row names left over from reordering so they run 1..n again.
rownames(merged_df) <- NULL

In [449]:
# From here on `df` is the merged table.
df <- merged_df

In [450]:
# Export the merged table as CSV (write.csv's default row.names = TRUE also
# writes the row names as an extra first column).
write.csv(df, file='../data/merged_artsengagement.csv')

In [451]:
# Save the merged table as an R data file; load() will restore it under the name `df`.
save(df, file='../data/merged_artsengagement.rda')