In [1]:
import os
import warnings
from glob import glob

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def validate_files(GPL):
    """Check that every input CSV in ``GPL/`` has the same set of columns.

    Only the header and the first row of each file are read, so this is cheap
    even for large files. Column order is not compared, only the names.

    Args:
        GPL: name of the directory holding the CSV files to validate.

    Raises:
        AssertionError: if the directory holds no input CSVs, or if any file's
            columns differ from those of the first file.
    """
    # the merged output is written next to its inputs; skip it so that
    # re-running the notebook does not compare an old result to the raw files
    merged = os.path.normpath(f"{GPL}/{GPL}.csv")
    files = sorted(
        f for f in glob(f'{GPL}/*.csv') if os.path.normpath(f) != merged
    )
    assert files, f"no input CSV files found in {GPL}/"

    expected = None
    for f in files:
        # just read the columns
        found = set(pd.read_csv(f, nrows=1).columns)

        if expected is None:
            expected = found
            continue

        # equal sets imply both the same column count and the same names
        assert found == expected, (
            f"{f}: columns differ from {files[0]} "
            f"(missing: {sorted(expected - found)[:5]}, "
            f"unexpected: {sorted(found - expected)[:5]})"
        )

def read_and_cast(path):
    """Load one CSV, cast its measurement columns to float16 and label it.

    Steps:
      1. drop the ``samples`` column (the primary key),
      2. convert every column except ``type`` to ``np.float16``; values that
         cannot be parsed as numbers become NaN,
      3. add a ``cancer`` column taken from the file name: the text before
         the first underscore, lower-cased,
      4. drop every row that contains a missing value.

    Args:
        path: path to the CSV file.

    Returns:
        A new DataFrame with the original column order (minus ``samples``)
        followed by ``cancer``.

    Raises:
        KeyError: if the file has no ``samples`` column.
        ValueError: if the file has no ``type`` column.
    """
    df = pd.read_csv(path)
    # drop the primary key
    df = df.drop(["samples"], axis=1)

    cols = df.columns.tolist()
    cols.remove("type")

    # some cells hold data-entry errors (non-numeric text); coerce sends those
    # to NaN so the affected rows are removed by dropna below.
    # Converting all columns in one pass and rebuilding the frame avoids the
    # block fragmentation caused by assigning column by column, so no warning
    # has to be suppressed.
    # NOTE: float16 overflows to inf above 65504; inf is not dropped by dropna.
    numeric = df[cols].apply(pd.to_numeric, errors="coerce").astype(np.float16)
    df = pd.concat([df[["type"]], numeric], axis=1)[df.columns.tolist()]

    # tag the rows with the type of cancer, taken from the file name;
    # basename handles both "/" and the platform's own separator
    cancer = os.path.basename(path).split("_")[0].lower()
    df = df.assign(cancer=cancer)

    # drop the missing values
    return df.dropna(axis=0)

def merge_files(GPL):
    """Merge every input CSV in ``GPL/`` into the single file ``GPL/GPL.csv``.

    Each file is loaded with ``read_and_cast`` and written out immediately,
    so only one file is held in memory at a time. Files are processed in
    sorted order, and an existing ``GPL/GPL.csv`` is overwritten.

    Args:
        GPL: name of the directory holding the CSV files to merge.

    Raises:
        FileNotFoundError: if the directory holds no input CSVs.
        KeyError: if a later file lacks a column present in the first file.
    """
    fname = f"{GPL}/{GPL}.csv"

    # the output lives in the same directory as the inputs; exclude it so a
    # re-run does not try to merge a previous result into itself
    merged = os.path.normpath(fname)
    files = sorted(
        f for f in glob(f'{GPL}/*.csv') if os.path.normpath(f) != merged
    )
    if not files:
        raise FileNotFoundError(f"no input CSV files found in {GPL}/")

    # the first file is written with its header and fixes the column order
    df = read_and_cast(files[0])
    header = df.columns.tolist()
    df.to_csv(fname, index=False)

    # the rest are appended without a header; select columns in the first
    # file's order so values always land under the right column name
    for f in tqdm(files[1:]):
        df = read_and_cast(f)[header]
        df.to_csv(fname, mode="a", index=False, header=False)

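This notebook assumes the input files are laid out as `GPL/*.csv`, where `GPL` is the directory name set in the next cell (e.g. `GPL96`), and writes the merged file to `GPL/GPL.csv`. After merging the files, run `label.ipynb`.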

In [3]:
# directory (relative to the working directory) that holds the CSVs to merge
GPL = "GPL96"
# check that the files share the same columns before merging them
validate_files(GPL)
# writes the merged result to GPL/GPL.csv
merge_files(GPL)

100%|██████████| 7/7 [00:32<00:00,  4.60s/it]
