In [None]:
# Issue with column order of outputs
# Issues working and sorting lists of mixed types

In [1]:
# Import packages/modules
import pandas as pd
import os

In [2]:
def find_delim(in_file, verbose=False):
    '''
    Inspects input file or string for some known delimiter (e.g. ',','\t' etc.)
    and returns the returns the delimiter used to separate enteries in the
    input file or string.

    Arguments:
        in_file (file or string): Input file or string. If the input file does not exist, it is assumed to be a string.
        verbose (boolean): Enable verbose output
    Returns:
        delim (string): The delmiter used in the input file or string.
    '''

    if os.path.exists(in_file):
        with open(in_file, 'r') as file:
            lines = file.readlines()

            if '.tsv' in in_file:
                delim = "\t"
                if verbose:
                    print("Input file is a tsv")
            elif '.csv' in in_file:
                delim = ","
                if verbose:
                    print("Input file is a csv")
            elif ',' in lines[0]:
                delim = ","
                if verbose:
                    print("Input file is a csv")
            elif '\t' in lines[0]:
                delim = "\t"
                if verbose:
                    print("Input file is a tsv")
            elif '\n' in lines[0]:
                delim = "\n"
                if verbose:
                    print("Input file uses newline separators exclusively")
            else:
                delim = " "
                # print("Unrecognized delimiter from input file")

            file.close()
    else:
        if "," in in_file:
            delim = ","
        elif "\t" in in_file:
            delim = "\t"
        elif ":" in in_file:
            delim = ":"
        elif ";" in in_file:
            delim = ";"
        else:
            delim = " "
            # print("Unrecognized delimiter from input string")

    return delim

In [3]:
def mk_df(in_file, verbose=False):
    '''
    Creates a (pandas) dataframe from an input file.

    Arguments:
        in_file (file): Input file.
        verbose (boolean): Enable verbose output
    Returns:
        df (dataframe): Output dataframe
    '''

    delim = find_delim(in_file=in_file, verbose=verbose)
    df = pd.read_csv(in_file, sep=delim)

    col_names = list(df.columns)
    df.sort_values(by=col_names[0], axis=0, inplace=True, ascending=True, kind='quicksort')

    return df

In [4]:
def rm_sub(df, rm_list):
    '''
    Removes a list of subjects from some dataframe. Note that this
    change is done in-place.

    Arguments:
        df (dataframe): Input dataframe.
        rm_list (list): List of subjects to remove from dataframe
    Returns:
        df (dataframe): Output dataframe with subjects in list removed
    '''

    col_names = list(df.columns)

    if len(rm_list) != 0:
        for r in rm_list:
            df.drop(df[df[col_names[0]] == r].index, inplace=True)

    return df

In [53]:
def keep_columns(df, kp_list=[], rm_nan=True):
    '''
    Creates an output dataframe that contains only the specified column
    indices (e.g. numerical index starting at 0 and not the column names).
    The output dataframe will also drop subjects that contain NaNs for all
    covariates of interest.

    Arguments:
        df (dataframe): Input dataframe.
        kp_list (list): List of df column name indices to be kept. The indices should follow that of the input TSV or CSV file.
        rm_nan (boolean): Drops subjects that contain NaNs
    Returns:
        df_2 (dataframe): Output dataframe with only the selected columns remaining.
    '''

    # Create column list
    col_names = list(df.columns)

    # Init keep-cols list
    kp_cols = list()
    kp_cols.insert(0, col_names[0])

    # Check for empty input list
    if len(kp_list) == 0:
        kp_list = range(1, len(col_names), 1)

    # Create list of columns to keep
    for idx in kp_list:
        kp_cols.append(col_names[idx])

    # Create new dataframe
    df_2 = df[kp_cols].copy()

    # Exclude subjects with NaNs
    if rm_nan:
        df_2.dropna(subset=kp_cols, inplace=True)

    return df_2

In [45]:
def subs_retain(df, subs_keep=[]):
    '''
    Creates a copy of dataframe that only includes a list of subjects.

    Arguments:
        df (dataframe): Input dataframe.
        subs_keep (list): List of subjects to create the dataframe from
    Returns:
        df_keep (dataframe): Output dataframe with subjects in list
    '''

    col_names = list(df.columns)
    df_keep = pd.DataFrame({col_names[0]: []})

    if len(subs_keep) != 0:
        for sub in subs_keep:
            df_keep = df_keep.append(df.loc[df[col_names[0]] == sub], sort=False)
    else:
        df_keep = df

    return df_keep

In [7]:
def mk_adj_sub_list(df_all,df_subs,rm_list=[]):
    '''
    Creates an adjusted subject inclusion (keep_list) and exclusion (rm_list)
    lists using some input dataframe. The input rm_list is updated to
    reflect subjects removed from the design matrix as a result of
    manual exclusion or missing data.

    Arguments:
        df_all (dataframe): Input dataframe of all subjects.
        df_subs (dataframe): Input dataframe.
    Returns:
        rm_list_adj (list): Adjusted rm_list that lists all excluded subjects.
        keep_list_adj (list): Adjusted keep_list that list all the included subjects.
    '''
    
    # Create list of column names
    col_names = list(df_all.columns)
    
    # Create list from input dataframes subject IDs
    all_subs = df_all[col_names[0]].to_list()
    keep_subs = df_subs[col_names[0]].to_list()
    
    # Create sets from lists
    all_subs_set = set(all_subs)
    subs_keep_set = set(keep_subs)
    
    # Create removed subject list set
    rm_list_set = all_subs_set.difference(subs_keep_set)

    # Create updated/adjusted lists
    rm_list_adj = list(rm_list_set)
    rm_list_adj.extend(rm_list)
    keep_list_adj = list(subs_keep_set)
    
    # Sort lists
    rm_list_adj.sort()
    keep_list_adj.sort()

    return rm_list_adj, keep_list_adj

In [8]:
def list_to_file(in_list, out_file):
    '''
    Writes some input list to some file.

    Arguments:
        in_list (list): List of subjects.
        out_file (file): Output filename.
    Returns:
        out_file (file): Output file.
    '''

    # Write list to file
    with open(out_file, "w") as f:
        for sub in in_list:
            f.write("%s\n" % sub)
        f.close()

    return out_file

In [9]:
def file_to_list(file):
    '''
    Reads a file into a list, assuming the file is separated by newline
    characters.

    Arguments:
        file (file): Input file to be read.
    Returns:
        lines (list): List from input file.
    '''

    # Read file into list
    with open(file, "r") as f:
        lines = f.read().splitlines()
        f.close()

    lines.sort()

    return lines

In [28]:
def parse_str_list(string):
    '''
    Parses a file or string into a list.

    Arguments:
        string (file or string): Input file or string to be read.
    Returns:
        sub_list (list): List of subjects from file or string.
    '''

    delim = find_delim(in_file=string)
    in_list = string.split(sep=delim)
    in_list.sort()

    if len(in_list) == 1:
        if os.path.exists(string):
            sub_list = file_to_list(file=string)
        else:
            sub_list = in_list
    elif len(in_list) > 1:
        sub_list = in_list
    elif len(string) == 0:
        sub_list = list()

    return sub_list

In [11]:
def write_design(df, out_file, sep=" "):
    '''
    Writes an output design matrix from an input dataframe. Output
    values for floats will be written with three decimal places of
    floating point precision.

    Arguments:
        df (dataframe): Input dataframe
        out_file (file): Output filename
        sep (string): Separator
    Returns:
        out_file (file): Output design
    '''

    # Create column list
    col_names = list(df.columns)
    out_cols = list()

    # Create secondary dataframe without subject ID column
    for idx in range(1, len(col_names), 1):
        out_cols.append(col_names[idx])

    df_out = df[out_cols].copy()

    # df_out.to_csv(out_file,sep=sep,header=False,index=False,na_rep="NaN",float_format='%g')
    df_out.to_csv(out_file, sep=sep, header=False, index=False, na_rep="NaN", float_format='%.3f')

    # return out_file, df_out
    return out_file

In [12]:
def demean_col(df, col_indices=[]):
    '''
    Demeans column indices of a dataframe. NOTE: The column or columns
    can only contain numeric values. Non-numeric values will cause
    errors, and exceptions to be thrown.

    Arguments
        df (dataframe): Input dataframe
        col_indices (list): List of column numerical indices to demean
    Returns
        df_demean (dataframe): Output dataframe with demeaned columns from the input list
    '''

    # Create column list
    col_names = list(df.columns)

    # Copy dataframe
    df_demean = df.copy()

    for i in col_indices:
        df_demean[col_names[i]] = df_demean[col_names[i]].sub(df_demean[col_names[i]].mean())

    return df_demean

In [36]:
f1 = "X:/IRC317H/BIDS/scripts/designs/mk_design/jpy.notebooks/test.mats/design.master.NAS.csv"
ret = "X:/IRC317H/BIDS/scripts/designs/design_mat/rs_fmri/include.list.txt"
cols = "3,4"
pref = "test.1"

In [37]:
in_file=f1
prefix=pref
rm_list=""
ret_list=ret
kp_col_list=cols
demean_ind=""
rm_nan=True
sep=" "

In [38]:
df_init = mk_df(in_file=in_file)
df_init

Unnamed: 0,participant_id,"Sex (0=F,1=M)",PMA,NAS,CTL
28,C01,1.0,44.428571,0,1
29,C02,0.0,44.571429,0,1
30,C03,1.0,45.142857,0,1
31,C04,1.0,45.0,0,1
32,C07,0.0,42.142857,0,1
33,C08,0.0,42.142857,0,1
34,C09,0.0,44.142857,0,1
35,C10,0.0,45.571429,0,1
36,C11,1.0,45.714286,0,1
37,C13,1.0,44.0,0,1


In [39]:
if len(rm_list) > 0:
    rm_list = parse_str_list(string=rm_list)
    # rm_list = [int(i) for i in rm_list]
else:
    rm_list = list()
    
if len(ret_list) > 0:
    ret_list = parse_str_list(string=ret_list)
    # ret_list = [int(i) for i in ret_list]
else:
    ret_list = list()
    
if len(kp_col_list) > 0:
    kp_col_list = parse_str_list(string=kp_col_list)
    kp_col_list = [int(i) for i in kp_col_list]

In [41]:
rm_list

[]

In [42]:
ret_list

['C01',
 'C02',
 'C03',
 'C04',
 'C07',
 'C08',
 'C09',
 'C10',
 'C11',
 'C13',
 'C15',
 'C17',
 'C19',
 'C20',
 'C22',
 'C24',
 'C27',
 'C28',
 'C29',
 'C30',
 'C32',
 'C33',
 'C34',
 'C35',
 'C36',
 'C37',
 'C38',
 'P01',
 'P02',
 'P03',
 'P05',
 'P06',
 'P07',
 'P08',
 'P09',
 'P10',
 'P11',
 'P12',
 'P14',
 'P15',
 'P16',
 'P18',
 'P19',
 'P21',
 'P22',
 'P23',
 'P24',
 'P26',
 'P27',
 'P28',
 'P30']

In [43]:
kp_col_list

[3, 4]

In [54]:
df_keep = subs_retain(df=df_init, subs_keep=ret_list)
df_keep

Unnamed: 0,participant_id,"Sex (0=F,1=M)",PMA,NAS,CTL
28,C01,1.0,44.428571,0.0,1.0
29,C02,0.0,44.571429,0.0,1.0
30,C03,1.0,45.142857,0.0,1.0
31,C04,1.0,45.0,0.0,1.0
32,C07,0.0,42.142857,0.0,1.0
33,C08,0.0,42.142857,0.0,1.0
34,C09,0.0,44.142857,0.0,1.0
35,C10,0.0,45.571429,0.0,1.0
36,C11,1.0,45.714286,0.0,1.0
37,C13,1.0,44.0,0.0,1.0


In [55]:
df_rm = rm_sub(df=df_keep, rm_list=rm_list)
df_rm

Unnamed: 0,participant_id,"Sex (0=F,1=M)",PMA,NAS,CTL
28,C01,1.0,44.428571,0.0,1.0
29,C02,0.0,44.571429,0.0,1.0
30,C03,1.0,45.142857,0.0,1.0
31,C04,1.0,45.0,0.0,1.0
32,C07,0.0,42.142857,0.0,1.0
33,C08,0.0,42.142857,0.0,1.0
34,C09,0.0,44.142857,0.0,1.0
35,C10,0.0,45.571429,0.0,1.0
36,C11,1.0,45.714286,0.0,1.0
37,C13,1.0,44.0,0.0,1.0


In [56]:
df_cols = keep_columns(df=df_rm, kp_list=kp_col_list, rm_nan=rm_nan)
df_cols

Unnamed: 0,participant_id,NAS,CTL
28,C01,0.0,1.0
29,C02,0.0,1.0
30,C03,0.0,1.0
31,C04,0.0,1.0
32,C07,0.0,1.0
33,C08,0.0,1.0
34,C09,0.0,1.0
35,C10,0.0,1.0
36,C11,0.0,1.0
37,C13,0.0,1.0


In [57]:
if len(demean_ind) > 0:
    demean_ind = parse_str_list(string=demean_ind)
    demean_ind = [int(i) for i in demean_ind]
    df_demean = demean_col(df=df_cols, col_indices=demean_ind)
    df = df_demean
else:
    df = df_cols

In [58]:
df

Unnamed: 0,participant_id,NAS,CTL
28,C01,0.0,1.0
29,C02,0.0,1.0
30,C03,0.0,1.0
31,C04,0.0,1.0
32,C07,0.0,1.0
33,C08,0.0,1.0
34,C09,0.0,1.0
35,C10,0.0,1.0
36,C11,0.0,1.0
37,C13,0.0,1.0


In [59]:
[rm_list, ret_list] = mk_adj_sub_list(df_all=df_init,df_subs=df,rm_list=rm_list)