### File Preparation

In [1]:
#set working directory
import os

#the path below is relative, so only change directory while we are not yet inside
#UnformattedLatexTables; otherwise re-running this cell in the same kernel would fail
if os.path.basename(os.getcwd()) != "UnformattedLatexTables":
    os.chdir("../Output_run1/UnformattedLatexTables")
cwd = os.getcwd()

#create the folder for the formatted tables if it does not exist yet
newpath = r'../FormattedLatexTables'
os.makedirs(newpath, exist_ok=True)

#picks out .tex files in working directory
#sorted, because os.listdir returns entries in arbitrary order and the cells below
#pick "the first matching file" from these lists
allFiles = sorted(os.listdir(cwd))

texFiles = [f for f in allFiles if f.startswith("dfMain2") and f.endswith(".tex")]
#print(texFiles) #uncomment to see file list

#set covariate list to compile

covariates = ["nNoun", "nVerb", "nAdj", "nAdv", "Conc", "SemD", "SubCD", "MemCD",
              "logWF", "nLet", "nSyll", "pld20", "old20", "posUni", "posBi"]

### Simple Regression

In [2]:
#builds one collated .tex file each for accuracy (Acc) and reaction time (RT)
#reads the individual .tex file of each simple regression and compiles them together
#adds additional formatting: greys out rows with insignificant effects; "header" for each covariate; adds 1 column to the left

#current output needs to be split into multiple pages manually as needed, or converted into longtable


#simple-regression files are the ones without a "+" in their name
AccSimple = [name for name in texFiles if "Acc_" in name and "+" not in name] #accuracy files
RTSimple = [name for name in texFiles if "RT_" in name and "+" not in name] #reaction time files

def compileSimple(fileList, covList, pg): #takes list of files for dependent variable, list of covariates to include
    r"""Collate the simple-regression tables of several covariates into one .tex file.

    For every covariate in covList, the first file in fileList whose name contains
    the covariate is read. The lines after its "Task & Dataset" header line and
    before its "\end{tabular}" line are copied with one extra column added on the
    left; lines containing "\hline" are skipped and every "NA" is blanked. A line
    with "&   \" near its end (empty last cell) is preceded by a grey \rowstyle
    line (per the original comment: rows with no significant effect).

    fileList -- names of the input .tex files, opened relative to the working directory
    covList  -- covariate names to include, in output order
    pg       -- page number appended to the output file name; 0 means no suffix

    Writes ../FormattedLatexTables/<prefix>_simpleCollated<pg>.tex, where <prefix>
    is the part of the first file name before its first underscore.

    Raises IndexError if fileList is empty or if no file name contains a covariate.
    """
    if not fileList:
        raise IndexError("compileSimple: fileList is empty")

    pgNo = "" if pg == 0 else str(pg)

    #get beginning of file names
    filePrefix = fileList[0].split('_')[0]

    #resolve every input file before the output is opened, so that a missing
    #covariate does not leave a half-written table behind
    covFiles = []
    for cov in covList:
        matches = [f for f in fileList if cov in f]
        if not matches:
            raise IndexError("compileSimple: no file found for covariate " + repr(cov))
        covFiles.append((cov, matches[0])) #assumes that there is only one occurrence in list

    with open('../FormattedLatexTables/'+filePrefix+'_simpleCollated'+pgNo+'.tex',"w") as collatedFile:
        collatedFile.write('\\begin{table}[H]\n')
        collatedFile.write('\\begingroup\\normalsize\n')
        collatedFile.write('\\hspace*{-1.5cm}\n')
        collatedFile.write('\\begin{tabular}{=p{0cm}lllr+l+r+r+r+r+r+l}\n')
        collatedFile.write('\\hline\n')

        for cov, fName in covFiles:
            collatedFile.write('\\multicolumn{11}{l}{\\textit{Predictor: ' + cov +'}}\\\\\n')

            with open(fName) as covFile:
                writeLine = False

                for line in covFile:
                    if line.startswith("\\end{tabular}"): #stop writing lines from covFile
                        writeLine = False
                        collatedFile.write("\\hline\n")

                    if writeLine:
                        if "&   \\" in line[-10:-1]:
                            collatedFile.write('\\rowstyle{\\leavevmode\\color{gray}}\n') #makes lines with no significance grey
                        if "\\hline" not in line:
                            collatedFile.write("&  " + line.replace("NA", "")) #needs to be formatted with an extra column

                    #checked last, so the header line itself is not copied
                    if line.startswith("Task & Dataset"): #start writing lines from covFile
                        writeLine = True

        collatedFile.write('\\end{tabular}\n')
        collatedFile.write('\\endgroup\n')
        collatedFile.write('\\end{table}\n')
        
#run compile function: one collated table per dependent variable, without page suffix
for simpleFiles in (RTSimple, AccSimple):
    compileSimple(simpleFiles, covariates, 0)

In [3]:
#run compile function to generate two covariates per page
#pages are numbered from 1; the last page holds a single covariate when the list length is odd
pgcount = 1
for i in range(0, len(covariates), 2):
    covPair = covariates[i:i+2]
    compileSimple(RTSimple, covPair, pgcount)
    compileSimple(AccSimple, covPair, pgcount)
    pgcount += 1

### Multiple Regression

In [4]:
#replaces the results for the AP datasets with those of the models that exclude nAdv from the regression
#replaces any occurrence of 999 or 999.00 with ""; replaces any occurrence of NA with ""
#adds additional formatting: greys out rows with insignificant effects; "header" for each task; removes task column
#current output needs to be split into multiple pages manually as needed, or converted into longtable

#multiple-regression files have a "+" in their name; files with "RC" in the name are excluded here
AccMulti = [name for name in texFiles if "Acc_" in name and "+" in name and "RC" not in name] #accuracy files
RTMulti = [name for name in texFiles if "RT_" in name and "+" in name and "RC" not in name] #reaction time files


def formatMulti(fileList):
    r"""Merge the two multiple-regression tables of one dependent variable into one .tex file.

    fileList must contain a file without "nAdv" in its name and a file with it;
    the first of each kind is used. Rows are copied

    * from the file without "nAdv": starting at the first row whose second cell
      contains "AP\_full\_e", stopping in front of the first row whose second
      cell contains "BLP";
    * from the file with "nAdv": starting at the first row whose second cell
      contains "BLP", stopping in front of the first row that contains both "&"
      and "hline".

    Every copied row loses its first (task) column. When that column holds a
    purely alphabetic task name, a "Task: ..." \multicolumn heading is written
    first. A row with "&   \" near its end (empty last cell) is preceded by a
    grey \rowstyle line. Cells that consist of the placeholders NA, 999 or 999.00
    are blanked; numbers that merely contain 999 (e.g. 1999) are left untouched.

    Writes ../FormattedLatexTables/<prefix>_multiple.tex, where <prefix> is the
    part of the first file name before its first underscore.

    Raises IndexError if fileList is empty or one of the two files is missing.
    """
    import re #local import, so the function does not depend on cell execution order

    if not fileList:
        raise IndexError("formatMulti: fileList is empty")

    #get beginning of file names
    filePrefix = fileList[0].split('_')[0]

    shortFiles = [f for f in fileList if "nAdv" not in f]
    fullFiles = [f for f in fileList if "nAdv" in f]
    if not shortFiles or not fullFiles:
        raise IndexError("formatMulti: need one file without and one file with 'nAdv' in its name")
    shortFile = shortFiles[0] #get AP datasets from this file
    fullFile = fullFiles[0] #get megastudies datasets from this file

    #placeholder values as whole tokens only: not preceded or followed by a word character or "."
    placeholder = re.compile(r'(?<![\w.])(?:NA|999(?:\.0+)?)(?![\w.])')

    def copyRows(srcName, outFile, startMarker, isStop):
        #copies the rows between the start marker and the stop condition from srcName to outFile
        with open(srcName) as srcFile:
            writeLine = False

            for line in srcFile:
                splitLine = line.split("&")

                if len(splitLine) > 1:
                    if startMarker in splitLine[1]: #start writing lines
                        writeLine = True
                    if isStop(line, splitLine): #stop writing lines
                        break

                if writeLine:
                    if splitLine[0].strip().isalpha(): #i.e. there is a task heading
                        outFile.write('\\multicolumn{10}{l}{\\textit{Task: ' + splitLine[0] +'}}\\\\\n')
                    if "&   \\" in line[-10:-1]:
                        outFile.write('\\rowstyle{\\leavevmode\\color{gray}}\n') #makes lines with no significance grey
                    outFile.write("& " + placeholder.sub("", line[line.find('&')+1:])) #write line from second column onwards

    with open('../FormattedLatexTables/'+filePrefix+'_multiple.tex',"w") as collatedFile:
        collatedFile.write('\\begin{table}[H]\n')
        collatedFile.write('\\begingroup\\normalsize\n')
        collatedFile.write('\\begin{tabular}{=p{0cm}llr+l+r+r+r+r+r+l}\n')
        collatedFile.write('\\hline\n')
        collatedFile.write(' & Dataset & df & adj.r.sq & Predictor & b & SE & VIF & t & p &  \\\\ \n')

        #AP datasets: from AP\_full\_e up to the first BLP row
        copyRows(shortFile, collatedFile, "AP\\_full\\_e",
                 lambda line, cells: "BLP" in cells[1])

        #megastudy datasets: from BLP up to the first row that carries an hline
        copyRows(fullFile, collatedFile, "BLP",
                 lambda line, cells: "hline" in line)

        collatedFile.write('\\hline\n')
        collatedFile.write('\\end{tabular}\n')
        collatedFile.write('\\endgroup\n')
        collatedFile.write('\\end{table}\n')
        
#run format function once per dependent variable
for multiFiles in (RTMulti, AccMulti):
    formatMulti(multiFiles)

### PCA Loadings

In [13]:
#reads loadings from ../PCA/PCAloadings_rotated_dfMain2.txt and prints to .tex
#assumes four components in the order of Lexical, Semantic, Sublexical and Part-of-Speech

#covariates = ["nNoun", "nVerb", "nAdj", "nAdv", "logWF", "nLet", "nSyll", "pld20", "old20", "posUni",\
#              "posBi", "SemD", "Conc", "SubCD", "MemCD"]
import re

#width of the longest covariate name: the start of each line is compared against the covariate list
covLongest = max(len(cov) for cov in covariates)

with open('../FormattedLatexTables/PCAloadings_rotated_dfMain2.tex',"w") as loadingsFile:
    loadingsFile.write('\\begin{table}[H]\n')
    loadingsFile.write('\\centering\n')
    loadingsFile.write('\\caption{Loadings of principal component analysis on covariates with cutoff at .3} \\label{tab:dfMain2loadings}\n')
    loadingsFile.write('\\begingroup\\normalsize\n')
    loadingsFile.write('\\begin{tabular}{lrrrr}\n')
    loadingsFile.write('\\hline\n')
    loadingsFile.write('  & Component1 & Component2 & Component3 & Component 4\\\\\n')
    loadingsFile.write('  & (Lexical) & (Semantic) & (Sublexical) & (Part-of-Speech)\\\\\n')

    with open('../PCA/PCAloadings_rotated_dfMain2.txt') as pcaFile:
        writeLine = False
        col_start = None #reset, so a value left over from an earlier run of this cell is never reused

        for line in pcaFile:

            if "loadings with cutoff at 0.3:" in line: #take loadings from under this heading
                writeLine = True

            if writeLine:
                if "RC" in line:
                    col_start = [m.start()-1 for m in re.finditer('RC', line)] #find character in line where column starts
                if line[0:covLongest].strip() in covariates:
                    if col_start is None:
                        raise ValueError("loadings row found before the RC column header line")
                    delim_line = list(line) #find index to add "&" to delimit columns
                    for ind in col_start:
                        if ind < len(delim_line): #guards against a row that is shorter than the header line
                            delim_line[ind] = "&"
                    loadingsFile.write(''.join(delim_line) + '\\\\\n')

    loadingsFile.write('\\hline\n')
    loadingsFile.write('\\end{tabular}\n')
    loadingsFile.write('\\endgroup\n')
    loadingsFile.write('\\end{table}\n')


### PCA Regressions

In [6]:
#replaces any occurrence of NA with ""
#adds additional formatting: greys out rows with insignificant effects; "header" for each task; removes task column
#current output needs to be split into multiple pages manually as needed, or converted into longtable

#PCA regression files have both a "+" and "RC" in their name
AccPCA = [name for name in texFiles if "Acc_" in name and "+" in name and "RC" in name] #accuracy files
RTPCA = [name for name in texFiles if "RT_" in name and "+" in name and "RC" in name] #reaction time files


def formatPCA(fileList):
    r"""Reformat the PCA regression table in fileList[0] into a .tex file.

    Rows are copied starting at the first row whose second cell contains
    "AP\_full\_e". Every input line containing "\hline" writes an "\hline" to the
    output and pauses copying until the start marker is seen again.

    Every copied row loses its first (task) column. When that column holds a
    purely alphabetic task name, a "Task: ..." \multicolumn heading is written
    first. A row with "&   \" near its end (empty last cell) is preceded by a
    grey \rowstyle line. Cells that consist of the placeholders NA, 999 or 999.00
    are blanked; numbers that merely contain 999 (e.g. 2999) are left untouched.

    Writes ../FormattedLatexTables/<prefix>_PCAmulti.tex, where <prefix> is the
    part of the first file name before its first underscore.

    Raises IndexError if fileList is empty.
    """
    import re #local import, so the function does not depend on cell execution order

    if not fileList:
        raise IndexError("formatPCA: fileList is empty")

    filePrefix = fileList[0].split('_')[0]

    #placeholder values as whole tokens only: not preceded or followed by a word character or "."
    placeholder = re.compile(r'(?<![\w.])(?:NA|999(?:\.0+)?)(?![\w.])')

    with open('../FormattedLatexTables/'+filePrefix+'_PCAmulti.tex',"w") as pcaformattedFile:
        pcaformattedFile.write('\\begin{table}[H]\n')
        pcaformattedFile.write('\\begingroup\\normalsize\n')
        pcaformattedFile.write('\\begin{tabular}{=p{0cm}llr+l+r+r+r+r+r+l}\n')
        pcaformattedFile.write('\\hline\n')
        pcaformattedFile.write(' & Dataset & df & adj.r.sq & Predictor & b & SE & VIF & t & p &  \\\\ \n')
        pcaformattedFile.write('\\hline\n')

        #read the table that was passed in; this used to read the global AccPCA[0],
        #which put the accuracy results into the reaction-time output as well
        with open(fileList[0]) as pcaFile:
            writeLine = False

            for line in pcaFile:
                if "\\hline" in line:
                    pcaformattedFile.write('\\hline\n')
                    writeLine = False

                splitLine = line.split("&")

                if len(splitLine) > 1:
                    if "AP\\_full\\_e" in splitLine[1]: #start writing lines from pcaFile
                        writeLine = True

                if writeLine:
                    if splitLine[0].strip().isalpha(): #i.e. there is a task heading
                        pcaformattedFile.write('\\multicolumn{10}{l}{\\textit{Task: ' + splitLine[0] +'}}\\\\\n')
                    if "&   \\" in line[-10:-1]:
                        pcaformattedFile.write('\\rowstyle{\\leavevmode\\color{gray}}\n') #makes lines with no significance grey
                    pcaformattedFile.write("& " + placeholder.sub("", line[line.find('&')+1:])) #write line from second column onwards

        pcaformattedFile.write('\\end{tabular}\n')
        pcaformattedFile.write('\\endgroup\n')
        pcaformattedFile.write('\\end{table}\n')
        
#run format function once per dependent variable
for pcaFiles in (RTPCA, AccPCA):
    formatPCA(pcaFiles)

### Summary Table

In [14]:
#grab regression b and p values from the collated table dfMain2_allCovariates.csv and write them to summaryTable_values.xlsx
#mapping of PCA to covariate loadings manually set according to PCA loadings output (winner takes all)
import pandas as pd
import numpy as np

allModels = pd.read_csv('../ExcelTables/dfMain2_allCovariates.csv', header = None)

#component assigned to each covariate; the list is positional, i.e. it follows the order of `covariates`
RC_cov_map = pd.DataFrame({
    "component": ["RC4", "RC2", "RC4", "RC4", "RC4", "RC2", "RC2", "RC2", "RC2",
                  "RC1", "RC1", "RC1", "RC1", "RC3", "RC3"],
    "covariate": covariates,
})

models = ['AP_full_e', 'AP_deg_h', 'BLP', 'ELP_LD', 'AELP', 'MALD', 'ELP_NMG', 'SDP']

interactions  = ["NOS:" + cov for cov in covariates]+["NOS:RC1", "NOS:RC2", "NOS:RC3", "NOS:RC4"]

#one column per dataset and regression type: <dataset>_simple, <dataset>_multi, <dataset>_pca
summary_cols = [model + "_" + rtype for model in models for rtype in ("simple", "multi", "pca")]


RTb = pd.DataFrame(columns=summary_cols, index=pd.Index(covariates))
Accb = pd.DataFrame(columns=summary_cols, index=pd.Index(covariates))

RTp = pd.DataFrame(columns=summary_cols, index=pd.Index(covariates))
Accp = pd.DataFrame(columns=summary_cols, index=pd.Index(covariates))


#.copy() so that the edits below act on independent frames and not on slices of allModels
AccModels = allModels.iloc[:, 0:10].copy()
RTModels = allModels.iloc[:, np.r_[0:3, 11:18]].copy()

#row 2 of the csv holds the column names
AccModels.columns = AccModels.iloc[2]
RTModels.columns = RTModels.iloc[2]

#assign the filled columns back: fillna(..., inplace=True) on a selected column does not
#reliably update the frame, and fillna(method=...) is deprecated in current pandas
AccModels['Dataset'] = AccModels['Dataset'].ffill()
RTModels['Dataset'] = RTModels['Dataset'].ffill()

AccModels['Task'] = AccModels['Task'].fillna("")
RTModels['Task'] = RTModels['Task'].fillna("")


#rows whose Task cell holds a model formula mark the start of a model
modelstarts = RTModels.loc[RTModels['Task'].str.contains("~ NOS"), 'Task']
modelstarts_df = pd.DataFrame({'rownum':modelstarts.index, 'formula':modelstarts, 'rtype':""})
#order matters: later assignments overwrite earlier ones
modelstarts_df.loc[modelstarts_df['formula'].str.contains("RC"), 'rtype'] = "pca"
modelstarts_df.loc[modelstarts_df['formula'].str.contains("nAdv"), 'rtype'] = "multi"
modelstarts_df.loc[~(modelstarts_df['formula'].str.contains("+", regex=False)), 'rtype'] = "simple"
modelstarts_df.loc[modelstarts_df['rtype'] == "", 'rtype'] = "multi_AP"
modelstarts_df = modelstarts_df.reset_index(drop=True) #the result used to be discarded


def _recordInteractions(modelint, rtype, bTable, pTable):
    """Copy the b and p values of the interaction rows in modelint into bTable and pTable."""
    for _, row in modelint.iterrows():
        targetcol = row['Dataset'] + "_" + rtype.replace("_AP", "")
        targetrow = row['Predictor'].replace("NOS:","")
        if rtype == 'pca':
            #the result of a component is written to every covariate mapped to it
            targetrows = RC_cov_map.loc[RC_cov_map.component == targetrow, "covariate"]
            bTable.loc[bTable.index.isin(targetrows), targetcol] = row['b']
            pTable.loc[pTable.index.isin(targetrows), targetcol] = row['p']
        else:
            bTable.loc[targetrow, targetcol] = row['b']
            pTable.loc[targetrow, targetcol] = row['p']


#slice out every model by modelstarts_df.rownum, get rows containing interaction,
#fill in b and p by corresponding dataset, covariate, and rtype

for i in range(len(modelstarts_df)):
    rtype = modelstarts_df.iloc[i]['rtype']
    startidx = modelstarts_df.iloc[i]['rownum'] + 3
    if i < len(modelstarts_df)-1:
        stopidx = modelstarts_df.iloc[i+1]['rownum']
    else:
        stopidx = len(AccModels)

    for modelTable, bTable, pTable in ((AccModels, Accb, Accp), (RTModels, RTb, RTp)):
        modelslice = modelTable.iloc[startidx:stopidx]
        modelint = modelslice.loc[modelslice['Predictor'].isin(interactions)]

        #the model without nAdv supplies the AP datasets, the model with nAdv all other datasets
        if rtype == 'multi_AP':
            modelint = modelint.loc[modelint['Dataset'].str.contains("AP")]
        if rtype == 'multi':
            modelint = modelint.loc[~modelint['Dataset'].str.contains("AP")]

        _recordInteractions(modelint, rtype, bTable, pTable)

#write to excel

newrow_pHeader = pd.DataFrame([[None for _ in range(len(summary_cols)+1)],])
newrow_pHeader[0] = "Interaction p-value"
newrow_bHeader = pd.DataFrame([[None for _ in range(len(summary_cols)+1)],])
newrow_bHeader[0] = "Interaction slope"

#one writer for both sheets; the context manager saves and closes the workbook
#(ExcelWriter.save() no longer exists in pandas 2.x)
with pd.ExcelWriter("../ExcelTables/summaryTable_values.xlsx", mode = 'w', engine='openpyxl') as writer:
    for sheet, pTable, bTable in (("Accuracy", Accp, Accb), ("ReactionTime", RTp, RTb)):
        newrow_pHeader.to_excel(writer,sheet_name = sheet, index=False, header=False)
        pTable.to_excel(writer,sheet_name = sheet, startrow = 2)
        newrow_bHeader.to_excel(writer,sheet_name = sheet, index=False, header=False, startrow = 20)
        bTable.to_excel(writer,sheet_name = sheet, startrow = 22)

### Descriptives

In [8]:
#Converts text descriptives of original dataset to sideways table -- swap out header and footer

with open('../FormattedLatexTables/Descriptives_original.tex',"w") as formatteddescFile:
    formatteddescFile.write('\\begin{sidewaystable}[ph!]\n')
    formatteddescFile.write('\\begin{tabular}{p{2.9cm}p{1.8cm}p{3.8cm}p{5.8cm}p{1.15cm}p{1.7cm}p{1.15cm}p{1.6cm}}\n')
    formatteddescFile.write('\\hline\n')

    with open('../Descriptives/Descriptives_original.tex') as descFile:
        writeLine = False

        for line in descFile:

            if "Dataset" in line: #start copying at the header row of the table
                writeLine = True

            #stop in front of the footer of the source table; the replacement footer is written below.
            #The previous stop condition was the comparison `writeLine == False`, which has no effect,
            #so everything up to the end of the file (including any old footer) used to be copied.
            if line.lstrip().startswith('\\end{tabular}'):
                break

            if writeLine:
                formatteddescFile.write(line)

    formatteddescFile.write('\n\\end{tabular}\n')


    formatteddescFile.write('\\caption{Descriptive statistics of original datasets} \\label{tab:DescriptivesOriginal}\n')
    formatteddescFile.write('\\end{sidewaystable}\n')


In [9]:
#Add hline between datasets, remove repeated information

with open('../FormattedLatexTables/Descriptives_current.tex',"w") as formatteddescFile:
    formatteddescFile.write('\\begin{table}[H]\n')
    formatteddescFile.write('\\begingroup\\normalsize\n')
    formatteddescFile.write('\\hspace*{-1.5cm}\n')
    formatteddescFile.write('\\begin{tabular}{lllllll}\n')
    with open('../Descriptives/Descriptives_currentdfMain2.tex') as descFile:
        writeLine = False
        #initialised here: it used to be read before its first assignment when the very first
        #line of the file already qualified, and could carry over from an earlier run of the cell
        lastLine = ""

        for line in descFile:

            if "Dataset" in line: #start copying at the header row of the table
                writeLine = True

            #only table rows (lines with "&") are copied
            if writeLine and '&' in line:
                firstCellEnd = line.index('&')

                #compare the first cell with the same-width prefix of the previous line
                if line[0:firstCellEnd].strip() != lastLine[0:firstCellEnd].strip(): #i.e. first occurrence
                    formatteddescFile.write('\\hline\n')
                    formatteddescFile.write(line)

                else:
                    #same first cell as the previous line: blank the first two columns
                    lineSplit = line.split("&")
                    lineSplit[0:2] = [" "]
                    formatteddescFile.write("&" + ("&".join(lineSplit)))

                #NOTE(review): the original had `if '\hline' in line: writeLine == False` here.
                #That comparison has no effect, so copying never stopped early; this is kept as is.

            lastLine = line

    formatteddescFile.write('\\hline\n')
    formatteddescFile.write('\\end{tabular}\n')
    formatteddescFile.write('\\endgroup\n')
    formatteddescFile.write('\\end{table}\n')