In [1]:
import pandas as pd
import os

In [2]:
def find_delim(in_file, verbose=False):
    '''
    Guesses the delimiter used to separate entries in an input file or string.

    If in_file names an existing regular file, the delimiter is chosen from
    the file name ('.tsv' or '.csv' anywhere in the path) and otherwise from
    the first line of the file. Any other input is treated as a literal
    string and searched for known delimiters.

    Arguments:
        in_file (file or string): Input file or string. If the input file does not exist, it is assumed to be a string.
        verbose (boolean): Enable verbose output (only used for file input)
    Returns:
        delim (string): The delimiter used in the input file or string. A single
            space is returned when no known delimiter is found (this includes
            an empty file).
    '''

    def _report(message):
        # Verbose output is opt-in
        if verbose:
            print(message)

    # isfile (rather than exists) so a string that happens to match a
    # directory name is handled as a string instead of failing in open()
    if os.path.isfile(in_file):
        # The file name takes precedence over the file contents
        if '.tsv' in in_file:
            _report("Input file is a tsv")
            return "\t"
        if '.csv' in in_file:
            _report("Input file is a csv")
            return ","

        # Only the first line is needed; readline() returns '' for an empty
        # file, which falls through to the default below instead of raising
        with open(in_file, 'r') as file:
            first_line = file.readline()

        if ',' in first_line:
            _report("Input file is a csv")
            return ","
        if '\t' in first_line:
            _report("Input file is a tsv")
            return "\t"
        if '\n' in first_line:
            _report("Input file uses newline separators exclusively")
            return "\n"
        return " "

    # String input: first match wins, in this order of precedence
    for candidate in (",", "\t", ":", ";"):
        if candidate in in_file:
            return candidate

    return " "

In [3]:
def mk_df(in_file, verbose=False):
    '''
    Loads a delimited text file into a (pandas) dataframe whose rows are
    sorted in ascending order of the first column.

    Arguments:
        in_file (file): Input file.
        verbose (boolean): Enable verbose output (forwarded to find_delim)
    Returns:
        df (dataframe): Output dataframe, sorted by its first column
    '''

    # The separator is detected from the file name / first line
    frame = pd.read_csv(in_file, sep=find_delim(in_file=in_file, verbose=verbose))

    # Order the rows by the first column
    first_col = list(frame.columns)[0]
    return frame.sort_values(by=first_col, axis=0, ascending=True, kind='quicksort')

In [4]:
def rm_sub(df, rm_list):
    '''
    Drops every row whose first-column value equals one of the given
    subjects. The input dataframe is modified in-place and the same object
    is returned.

    Arguments:
        df (dataframe): Input dataframe (mutated).
        rm_list (list): List of subjects to remove from dataframe
    Returns:
        df (dataframe): The input dataframe with the listed subjects removed
    '''

    # Nothing requested: hand the frame back untouched
    if len(rm_list) == 0:
        return df

    id_col = list(df.columns)[0]

    for subject in rm_list:
        # Index labels of the rows that currently match this subject
        matching_labels = df.index[df[id_col] == subject]
        df.drop(matching_labels, inplace=True)

    return df

In [5]:
def keep_columns(df, kp_list=[], rm_nan=True):
    '''
    Creates an output dataframe that contains the first (subject ID) column
    followed by the columns at the specified positions (numerical index
    starting at 0, not the column names). Optionally drops every row that
    has a NaN in any of the retained columns.

    Arguments:
        df (dataframe): Input dataframe (not modified).
        kp_list (list): List of df column positions to be kept. The indices should follow that of the input TSV or CSV file. An empty list keeps every column.
        rm_nan (boolean): Drops subjects that contain NaNs in any retained column
    Returns:
        df_2 (dataframe): Output dataframe with only the selected columns remaining.
    '''

    # Create column list
    col_names = list(df.columns)

    # Check for empty input list: keep everything after the ID column
    if len(kp_list) == 0:
        kp_list = range(1, len(col_names), 1)

    # The ID column always comes first
    kp_cols = [col_names[0]]

    # Add the requested columns, skipping any already selected. Without this,
    # index 0 (or a repeated index) would select the same column twice and
    # yield a frame with duplicate column labels.
    for idx in kp_list:
        name = col_names[idx]
        if name not in kp_cols:
            kp_cols.append(name)

    # Create new dataframe
    df_2 = df[kp_cols].copy()

    # Exclude subjects with NaNs in any retained column
    if rm_nan:
        df_2 = df_2.dropna(subset=kp_cols)

    return df_2

In [6]:
def subs_retain(df, subs_keep=[]):
    '''
    Builds a dataframe that only includes the rows of the listed subjects
    (matched against the first column). Rows appear in the order the
    subjects are listed in subs_keep.

    NOTE: when subs_keep is empty the input dataframe itself is returned
    (not a copy), so later in-place edits of the result also affect df.

    Arguments:
        df (dataframe): Input dataframe.
        subs_keep (list): List of subjects to create the dataframe from
    Returns:
        df_keep (dataframe): Output dataframe with subjects in list
    '''

    # No selection requested: pass the input through unchanged
    if len(subs_keep) == 0:
        return df

    col_names = list(df.columns)
    id_col = df[col_names[0]]

    # Collect the matching rows per subject and join them once.
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported replacement and avoids re-copying the frame per subject.
    selections = [df.loc[id_col == sub] for sub in subs_keep]
    df_keep = pd.concat(selections, sort=False)

    return df_keep

In [7]:
def mk_adj_sub_list(df_all,df_subs,rm_list=[]):
    '''
    Creates adjusted subject inclusion (keep_list) and exclusion (rm_list)
    lists from two dataframes. The exclusion list holds every subject of
    df_all that is absent from df_subs, plus the entries of the input
    rm_list. Each subject is listed once.

    Subject IDs are read from the column named like the first column of
    df_all, in both dataframes.

    Arguments:
        df_all (dataframe): Input dataframe of all subjects.
        df_subs (dataframe): Input dataframe of the retained subjects.
        rm_list (list): Input list of removed subjects to be updated (not modified).
    Returns:
        rm_list_adj (list): Sorted, duplicate-free list of all excluded subjects.
        keep_list_adj (list): Sorted list of all the included subjects.
    '''

    # Subject IDs live in the first column of df_all
    id_col = list(df_all.columns)[0]

    # Unique subject IDs of each dataframe
    all_subs_set = set(df_all[id_col].to_list())
    subs_keep_set = set(df_subs[id_col].to_list())

    # Subjects that dropped out, merged with the manually removed ones.
    # A set union is used because manually removed subjects are usually part
    # of the difference already; appending them again listed them twice.
    rm_list_set = all_subs_set.difference(subs_keep_set).union(rm_list)

    # Sorted output lists
    rm_list_adj = sorted(rm_list_set)
    keep_list_adj = sorted(subs_keep_set)

    return rm_list_adj, keep_list_adj

In [8]:
def list_to_file(in_list, out_file):
    '''
    Writes each entry of a list to a text file, one entry per line. An
    existing file is overwritten.

    Arguments:
        in_list (list): List of subjects.
        out_file (file): Output filename.
    Returns:
        out_file (file): Output file.
    '''

    # The context manager closes the file when the block exits
    with open(out_file, "w") as handle:
        handle.writelines("%s\n" % entry for entry in in_list)

    return out_file

In [9]:
def file_to_list(file):
    '''
    Reads a text file into a sorted list with one entry per line (line
    endings removed).

    Arguments:
        file (file): Input file to be read.
    Returns:
        lines (list): Sorted list of the lines of the input file.
    '''

    # The context manager closes the file when the block exits
    with open(file, "r") as handle:
        entries = handle.read().splitlines()

    return sorted(entries)

In [10]:
def parse_str_list(string):
    '''
    Parses a file or string into a sorted list.

    If the input names an existing file, the file is read line by line.
    Otherwise the input is split on the delimiter detected by find_delim.

    Arguments:
        string (file or string): Input file or string to be read.
    Returns:
        sub_list (list): Sorted list of subjects from file or string.
    '''

    # Check for a file first. Splitting first (as done previously) returned
    # fragments of the path whenever the path itself contained the detected
    # delimiter, and the file was never read.
    if os.path.isfile(string):
        return file_to_list(file=string)

    # Plain string: split on the detected delimiter
    delim = find_delim(in_file=string)
    sub_list = string.split(sep=delim)
    sub_list.sort()

    return sub_list

In [11]:
def write_design(df, out_file, sep=" "):
    '''
    Writes a design matrix file from an input dataframe. Every column
    except the first (subject ID) column is written, without header or row
    index. Floats are written with three decimal places and missing values
    as "NaN".

    Arguments:
        df (dataframe): Input dataframe
        out_file (file): Output filename
        sep (string): Separator
    Returns:
        out_file (file): Output design
    '''

    # Everything after the subject ID column belongs to the design
    design_cols = list(df.columns)[1:]

    df[design_cols].to_csv(
        out_file,
        sep=sep,
        header=False,
        index=False,
        na_rep="NaN",
        float_format='%.3f',
    )

    return out_file

In [43]:
def new_list_index(df_1,df_2,num_ind):
    '''
    Translates column positions of one dataframe into the positions of the
    same-named columns in a second dataframe (e.g. one with fewer columns).

    Arguments:
        df_1 (df): Input dataframe the positions in num_ind refer to
        df_2 (df): Input dataframe with fewer columns than df_1, but with columns of the same name
        num_ind (list): List of column numbers/indices of df_1 to translate.
    Returns:
        col_ind (list): List of column numbers/indices that correspond to df_2.
    Raises:
        ValueError: If a selected df_1 column name is not present in df_2.
    '''
    source_names = list(df_1.columns)
    target_names = list(df_2.columns)

    # Position -> name in df_1, then name -> position in df_2
    return [target_names.index(source_names[pos]) for pos in num_ind]

In [12]:
def demean_col(df, col_indices=[]):
    '''
    Returns a copy of a dataframe in which the columns at the given
    positions have had their mean subtracted. NOTE: The column or columns
    can only contain numeric values. Non-numeric values will cause
    errors, and exceptions to be thrown.

    Arguments
        df (dataframe): Input dataframe (not modified)
        col_indices (list): List of column numerical indices to demean
    Returns
        df_demean (dataframe): Output dataframe with demeaned columns from the input list
    '''

    labels = list(df.columns)

    # Work on a copy so the caller's dataframe stays as it was
    centered = df.copy()

    for label in (labels[pos] for pos in col_indices):
        column = centered[label]
        centered[label] = column - column.mean()

    return centered

In [44]:
def mk_design(in_file, prefix, rm_list="", ret_list="", kp_col_list="", demean_ind="", rm_nan=True, sep=" "):
    '''
    Builds a design matrix from a TSV or CSV file and writes it to disk
    together with subject inclusion and exclusion lists. The design matrix
    is written without headers, row indices, and subject IDs. The input
    file must contain the subject IDs in the first column.

    Four files are written: prefix + ".txt" (design matrix),
    prefix + ".exclude.txt", prefix + ".include.txt", and
    prefix + ".all_info.txt" (the design with header and subject IDs).

    Arguments:
        in_file (file): Input file with header titles, subject IDs, and covariates
        prefix (string): Output file prefix.
        rm_list (file or string): File or comma separated strings of subjects to remove.
        ret_list (file or string): File or comma separated strings of subjects to retain.
        kp_col_list (file or string): File or comma separated strings of column indices to retain in design matrix (e.g. "1,2,3", index count starts at 0).
        demean_ind (file or string): File or comma separated strings of column indices to demean, given as positions in the input file (NOTE: column cannot contain non-numeric values)
        rm_nan (boolean): Remove subjects with NaNs in the specified covariates (from kp_col_list) from the design matrix.
        sep (string): Separator string used when writing the output tables.
    Returns:
        out_mat (file): Output design matrix
        out_rm (file): Subject exclusion file
        out_keep (file): Subject inclusion file
    '''

    # Load the complete table
    df_all = mk_df(in_file=in_file)

    # Turn the raw string arguments into sorted lists; empty arguments are
    # passed through unchanged
    if len(rm_list) > 0:
        rm_list = sorted(parse_str_list(string=rm_list))
    if len(ret_list) > 0:
        ret_list = sorted(parse_str_list(string=ret_list))
    if len(kp_col_list) > 0:
        kp_col_list = sorted(int(i) for i in parse_str_list(string=kp_col_list))

    # Select subjects, then columns.
    # NOTE: with an empty ret_list subs_retain returns df_all itself, and
    # rm_sub drops rows in-place, so df_all loses the removed subjects too.
    df_retained = subs_retain(df=df_all, subs_keep=ret_list)
    df_pruned = rm_sub(df=df_retained, rm_list=rm_list)
    df_selected = keep_columns(df=df_pruned, kp_list=kp_col_list, rm_nan=rm_nan)

    # Demean data if required
    if len(demean_ind) > 0:
        file_positions = [int(i) for i in parse_str_list(string=demean_ind)]
        # Positions refer to the input file; map them onto the reduced table
        design_positions = new_list_index(df_1=df_all, df_2=df_selected, num_ind=file_positions)
        design = demean_col(df=df_selected, col_indices=design_positions)
    else:
        design = df_selected

    # Update inclusion and exclusion lists
    excluded, included = mk_adj_sub_list(df_all=df_all, df_subs=df_selected, rm_list=rm_list)

    # Write output files
    out_mat = write_design(df=design, out_file=prefix + ".txt", sep=sep)
    out_rm = list_to_file(in_list=excluded, out_file=prefix + ".exclude.txt")
    out_keep = list_to_file(in_list=included, out_file=prefix + ".include.txt")

    # Print out complete design with header information
    design.to_csv(
        prefix + ".all_info.txt",
        sep=sep,
        header=True,
        index=False,
        na_rep="NaN",
        float_format='%.3f',
    )

    return out_mat, out_rm, out_keep

In [48]:
# Example inputs used to exercise mk_design
in_file = "master.template.design.csv"
prefix = "test.1.cov"
rm_list = "sub-1013,sub-1515,sub-1569"
ret_list = ""
kp_col_list = "1,2,3,8,12"
demean_ind = "2"  # alternative tried during development: "2,12"
rm_nan = True
sep = ","

In [49]:
# Run the full pipeline on the example inputs; the returned tuple of output
# file names is displayed as the cell result
mk_design(
    in_file=in_file,
    prefix=prefix,
    rm_list=rm_list,
    ret_list=ret_list,
    kp_col_list=kp_col_list,
    demean_ind=demean_ind,
    rm_nan=rm_nan,
    sep=sep,
)

('test.1.cov.txt', 'test.1.cov.exclude.txt', 'test.1.cov.include.txt')

## UPDATE: Make column order arguments consistent (demean columns option)

In [17]:
# Create initial dataframe
df_init = mk_df(in_file=in_file)

# Create input lists from the input strings; column indices are converted
# from strings to integers.
# The isinstance guards keep this cell re-runnable: after the first run the
# names below hold lists, and passing a list to parse_str_list would fail.
if isinstance(rm_list, str) and len(rm_list) > 0:
    rm_list = parse_str_list(string=rm_list)
    rm_list.sort()
if isinstance(ret_list, str) and len(ret_list) > 0:
    ret_list = parse_str_list(string=ret_list)
    ret_list.sort()
if isinstance(kp_col_list, str) and len(kp_col_list) > 0:
    kp_col_list = parse_str_list(string=kp_col_list)
    kp_col_list = [int(i) for i in kp_col_list]
    kp_col_list.sort()

In [18]:
# Step through the subject/column selection that mk_design performs.
# NOTE: with an empty ret_list, subs_retain returns df_init itself, and
# rm_sub drops rows in-place, so df_init is modified by this cell as well.
df_keep = subs_retain(df=df_init, subs_keep=ret_list)
df_rm = rm_sub(df=df_keep, rm_list=rm_list)
df_cols = keep_columns(df=df_rm, kp_list=kp_col_list, rm_nan=rm_nan)

In [19]:
# Display the input table. NOTE: the selection cell above ran rm_sub in-place
# on this same object (ret_list is empty), so removed subjects are absent here.
df_init

Unnamed: 0,participant_id,intercept,age,sex,control,sct_low,sct_high,parent_sct,parent_adhd,teacher_sct,teacher_adhd,child_sct,child_sct_cci-2,parent_CABI,teacher_CABI
22,sub-1003,1,11,1,0,1,0,2.7500,2.8889,1.8750,4.222222,0.50,,,
23,sub-1017,1,10,1,0,1,0,1.1250,1.7778,2.1250,2.111111,1.40,,,
45,sub-1023,1,13,0,0,0,1,3.3750,3.2222,4.0000,2.444444,1.50,,,
46,sub-1035,1,11,0,0,0,1,3.1250,1.8889,0.6250,1.000000,0.20,,,
24,sub-1039,1,13,0,0,1,0,3.5550,3.7770,0.3330,0.555000,0.80,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,sub-1691,1,12,1,1,0,0,0.3750,1.1111,0.2500,2.000000,0.80,1.4000,0.0000,0.000000
64,sub-1692,1,11,0,0,0,1,3.8888,4.3333,3.3333,3.333300,1.20,1.2000,3.4666,4.333300
65,sub-1695,1,11,0,0,0,1,3.0000,3.5556,1.3750,0.333333,0.25,0.4667,2.4000,1.533333
44,sub-1696,1,8,1,0,1,0,2.0000,1.7778,0.1250,1.000000,0.30,1.0000,0.6667,0.000000


In [20]:
# Display the reduced table: subject ID column plus the columns selected by
# kp_col_list, with NaN rows dropped when rm_nan is True
df_cols

Unnamed: 0,participant_id,intercept,age,sex,parent_adhd,child_sct_cci-2
27,sub-1065,1,8,1,4.2222,0.7333
28,sub-1066,1,9,1,2.5556,1.0667
29,sub-1067,1,8,0,2.7778,1.8
30,sub-1069,1,10,0,3.5556,1.4667
31,sub-1075,1,10,0,3.8889,1.1333
32,sub-1084,1,11,1,2.8889,0.6667
47,sub-1507,1,13,1,3.4444,1.8667
48,sub-1512,1,8,1,3.4444,0.6
33,sub-1513,1,12,0,0.8889,1.6
50,sub-1519,1,12,1,2.5556,1.3333


In [21]:
# Short aliases matching the parameter names of new_list_index
# (full table, reduced table)
df_1, df_2 = df_init, df_cols

In [26]:
# Column labels of the full table; kp_col_list and demean_ind are given as
# positions in this ordering
df_1.columns

Index(['participant_id', 'intercept', 'age', 'sex', 'control', 'sct_low',
       'sct_high', 'parent_sct', 'parent_adhd', 'teacher_sct', 'teacher_adhd',
       'child_sct', 'child_sct_cci-2', 'parent_CABI', 'teacher_CABI'],
      dtype='object')

In [27]:
# Column labels of the reduced table; positions differ from the full table
# once columns have been dropped
df_2.columns

Index(['participant_id', 'intercept', 'age', 'sex', 'parent_adhd',
       'child_sct_cci-2'],
      dtype='object')

In [28]:
# Still the raw input string: mk_design only rebinds its own local parameter
demean_ind

'2'

In [30]:
# Column names of the full table as a plain list, then display them
col_names_1 = df_1.columns.tolist()
col_names_1

['participant_id',
 'intercept',
 'age',
 'sex',
 'control',
 'sct_low',
 'sct_high',
 'parent_sct',
 'parent_adhd',
 'teacher_sct',
 'teacher_adhd',
 'child_sct',
 'child_sct_cci-2',
 'parent_CABI',
 'teacher_CABI']

In [31]:
# Column names of the reduced table as a plain list, then display them
col_names_2 = df_2.columns.tolist()
col_names_2

['participant_id', 'intercept', 'age', 'sex', 'parent_adhd', 'child_sct_cci-2']

In [34]:
# list.index gives the position of a column name within the reduced table
col_names_2.index('participant_id')

0

In [41]:
def new_list_index(df_1,df_2,num_ind):
    '''
    Translates column positions of df_1 into the positions of the
    same-named columns in df_2.

    NOTE: a function of the same name is also defined earlier in this
    notebook; whichever definition cell is executed last is the one in effect.

    Arguments:
        df_1 (df): Input dataframe the positions in num_ind refer to
        df_2 (df): Input dataframe containing columns of the same name
        num_ind (list): List of df_1 column numbers/indices to translate.
    Returns:
        col_ind (list): List of column numbers/indices that correspond to df_2.
    '''
    names_from = list(df_1.columns)
    names_to = list(df_2.columns)

    def _translate(position):
        # Look the df_1 column name up in df_2; ValueError when it is missing
        return names_to.index(names_from[position])

    return list(map(_translate, num_ind))

In [42]:
# Map positions 2, 8 and 12 of the full table onto the reduced table
new_list_index(df_1=df_1, df_2=df_2, num_ind=[2, 8, 12])

['age', 'parent_adhd', 'child_sct_cci-2']