In [16]:
import pandas as pd
import numpy as np

def unionMatrix(level, meanFill = False):
    """
    Merge the per-disease training tables into one patient-by-bacteria matrix.

    One CSV per disease is read from
    '../HackathonMicrobiomeData/<disease>/<level><disease>_train.csv' for the
    diseases T2D, IBD, CAD and CKD (in that order). Despite the function name,
    only the bacteria columns present in *every* table (the intersection) are
    kept; the rows are the union of all sample IDs.

    Labels in the result will be:
    0 - patient is healthy
    1 - patient has T2D
    2 - patient has IBD
    3 - patient has CAD
    4 - patient has CKD

    A non-zero 'label' in a disease table is mapped to that table's code above.
    If a sample ID occurs in several tables (or several times in one table),
    the first occurrence in the order T2D, IBD, CAD, CKD supplies both the
    bacteria values and the label.
    NOTE(review): this means a sample labelled 0 in an earlier table stays 0
    even if a later table marks it as diseased - confirm this is intended.

    The result is also written to 'unionMatrix_<level>.csv' in the current
    working directory (without the index).

    :param level: Level, from which the matrix is created e.g. 'Class' or 'Species'
    :param meanFill: Kept for backward compatibility only. Because only columns
    present in every table are used, there is never a missing cell to fill, so
    the value has no effect on the result.
    :return: A pandas dataframe with the columns 'sample_ID', the sorted common
    bacteria columns and 'label', with rows sorted by sample ID
    :raises KeyError: if 'sample_ID' or 'label' is missing from any input table
    """
    diseases = ['T2D', 'IBD', 'CAD', 'CKD']
    dataframes = [
        pd.read_csv('../HackathonMicrobiomeData/' + d + '/' + level + d + '_train.csv')
        for d in diseases
    ]

    # Only columns reported in every disease table are kept.
    common = set(dataframes[0].columns)
    for df in dataframes[1:]:
        common &= set(df.columns)

    required = {'sample_ID', 'label'}
    missing = required - common
    if missing:
        raise KeyError('Required column(s) missing from at least one input table: '
                       + ', '.join(sorted(missing)))
    features = sorted(common - required)

    # Reduce every table to the shared columns and translate its own label
    # into the global disease code (position in `diseases`, starting at 1).
    parts = []
    for code, df in enumerate(dataframes, start=1):
        part = df.loc[:, ['sample_ID'] + features].copy()
        part['label'] = np.where(df['label'] != 0, code, 0)
        parts.append(part)

    # Stack in disease order so that keep='first' reproduces the
    # "first table containing the sample wins" rule.
    combined = pd.concat(parts, ignore_index=True)
    combined = combined.drop_duplicates(subset='sample_ID', keep='first')

    # Order the rows with Python's own sort on the sample IDs.
    ordered_ids = sorted(combined['sample_ID'])
    df_union = combined.set_index('sample_ID').loc[ordered_ids].reset_index()
    df_union = df_union[['sample_ID'] + features + ['label']]

    df_union.to_csv('unionMatrix_' + level + '.csv', index=False)

    return df_union


# Build (and, as the cell's last expression, display) the merged matrix at 'Class' level.
unionMatrix(level='Class', meanFill=False)

Unnamed: 0,sample_ID,Bacteria;Abditibacteriota;Abditibacteria,Bacteria;Acidobacteriota;Aminicenantia,Bacteria;Acidobacteriota;Blastocatellia,Bacteria;Actinobacteriota;Acidimicrobiia,Bacteria;Actinobacteriota;Actinobacteria,Bacteria;Actinobacteriota;Coriobacteriia,Bacteria;Actinobacteriota;Thermoleophilia,Bacteria;Bacteroidota;Bacteroidia,Bacteria;Bdellovibrionota;Bdellovibrionia,...,Bacteria;Nitrospirota;Nitrospiria,Bacteria;Patescibacteria;Saccharimonadia,Bacteria;Proteobacteria;Alphaproteobacteria,Bacteria;Proteobacteria;Gammaproteobacteria,Bacteria;Spirochaetota;Brachyspirae,Bacteria;Spirochaetota;Spirochaetia,Bacteria;Synergistota;Synergistia,Bacteria;Verrucomicrobiota;Lentisphaeria,Bacteria;Verrucomicrobiota;Verrucomicrobiae,label
0,H18090,0,0,0,0,0,133,0,7017,0,...,0,1,0,197,0,0,0,0,0,1
1,H18093,0,0,0,0,10,336,0,6872,0,...,0,0,0,1676,0,0,0,0,17,0
2,H18094,0,0,0,0,8,501,0,7906,0,...,0,0,0,629,0,0,0,0,0,0
3,H18095,0,0,0,0,1,108,0,6404,0,...,0,0,0,272,0,0,0,0,0,0
4,H18096,0,0,0,0,3,262,0,5717,0,...,0,5,15,143,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1284,H34151,0,0,0,0,58,888,0,1906,0,...,0,6,15,722,0,0,0,0,0,2
1285,H34153,0,0,0,0,30,185,0,11490,0,...,0,5,0,1749,0,0,0,0,5,1
1286,H34155,0,0,0,0,8,198,0,3507,0,...,0,0,0,825,0,0,0,0,0,1
1287,H34165,0,0,0,0,10,188,0,3481,0,...,0,0,0,6073,0,0,0,0,0,1
