In [26]:
from damply import dirs
from collections import defaultdict
from rich import print
import pandas as pd
import tqdm
gdsqdir = dirs.RAWDATA / 'GDSC-Squared' / 'treatmentResponse'
files = list(gdsqdir.rglob('*.csv'))
print(files)



In [27]:
df_dict = defaultdict(dict)
for f in tqdm.tqdm(files, desc="Processing files"):
    df_dict[str(f.parent.name)].update({
        f.name: pd.read_csv(f),
    })
    tqdm.tqdm.write(f"Loaded: {f.name}")

Processing files:   8%|▊         | 1/12 [00:00<00:02,  5.30it/s]

Loaded: colon_anchor_combo.csv


Processing files:  25%|██▌       | 3/12 [00:00<00:02,  4.25it/s]

Loaded: breast_anchor_combo.csv
Loaded: pancreas_anchor_combo.csv


Processing files:  33%|███▎      | 4/12 [00:13<00:42,  5.34s/it]

Loaded: Original_screen_All_tissues_raw_data.csv


Processing files:  42%|████▏     | 5/12 [00:14<00:26,  3.81s/it]

Loaded: Original_screen_All_tissues_day1_data.csv


Processing files:  50%|█████     | 6/12 [00:15<00:16,  2.69s/it]

Loaded: Validation_screen_All_tissues_day1_data.csv


Processing files:  67%|██████▋   | 8/12 [00:17<00:06,  1.71s/it]

Loaded: Validation_screen_All_tissues_raw_data.csv
Loaded: gdsc-008_matrix_results.csv


Processing files:  92%|█████████▏| 11/12 [00:17<00:00,  1.37it/s]

Loaded: gdsc-010_matrix_results.csv
Loaded: sandpiper-01_matrix_results.csv
Loaded: gdsc-007_matrix_results.csv


Processing files: 100%|██████████| 12/12 [00:18<00:00,  1.50s/it]

Loaded: gdsc-009_matrix_results.csv





# Original Screen

In [None]:
original_screen = df_dict['original_screen']
print({k: v.shape for k, v in original_screen.items()})

# there are two columns in two of the 4 files 

In [54]:
from dataclasses import dataclass, field
from pandas.api.types import infer_dtype
import numpy as np
@dataclass
class ColInfo:
    name: str
    type: str
    nunique: int
    first: str
    last: str

@dataclass
class DataFrameSummary:
    file: str
    shape: tuple
    # inferred type, nunique
    colinfo: list[ColInfo]

summaries = []
for fname, df in original_screen.items():
    summary = DataFrameSummary(
        file=fname,
        shape=df.shape,
        colinfo=[
            ColInfo(
                name=col,
                type=infer_dtype(df[col]),
                nunique=df[col].nunique(),
                first=df[col].iloc[0],
                last=df[col].iloc[-1],
            )
            for col in sorted(df.columns)
        ]
    )
    summaries.append(summary)
print(summaries)

In [53]:

common_cols = set.intersection(*[set(df.columns) for df in original_screen.values()])
print(f'There are {len(common_cols)} common columns: {common_cols}')
# see which files have uncommon columns
for fname, df in original_screen.items():
    uncommon_cols = set(df.columns) - common_cols
    if uncommon_cols:
        print(f'{fname} has uncommon columns: {uncommon_cols}')
    else:
        pass

In [58]:
df = original_screen['Original_screen_All_tissues_raw_data.csv']

# check if any entire column is empty
if any(df[col].isnull().all() for col in df.columns):
    print(f'There are empty columns in {fname}')



In [59]:
df.head()


Unnamed: 0,BARCODE,RESEARCH_PROJECT,SCAN_ID,DATE_CREATED,SCAN_DATE,CELL_ID,MASTER_CELL_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,SEEDING_DENSITY,DRUGSET_ID,ASSAY,DURATION,POSITION,TAG,DRUG_ID,DRUG_NAME,CONC,INTENSITY
0,14482,GDSC_Breast,14077,2016-03-30T23:00:00Z,2016-04-03T23:00:00Z,6190,1317,907047,HCC1806,SIDM00875,325,264,Glo,4,1,UN-USED,,,,0
1,14482,GDSC_Breast,14077,2016-03-30T23:00:00Z,2016-04-03T23:00:00Z,6190,1317,907047,HCC1806,SIDM00875,325,264,Glo,4,2,UN-USED,,,,0
2,14482,GDSC_Breast,14077,2016-03-30T23:00:00Z,2016-04-03T23:00:00Z,6190,1317,907047,HCC1806,SIDM00875,325,264,Glo,4,3,UN-USED,,,,764
3,14482,GDSC_Breast,14077,2016-03-30T23:00:00Z,2016-04-03T23:00:00Z,6190,1317,907047,HCC1806,SIDM00875,325,264,Glo,4,4,UN-USED,,,,382
4,14482,GDSC_Breast,14077,2016-03-30T23:00:00Z,2016-04-03T23:00:00Z,6190,1317,907047,HCC1806,SIDM00875,325,264,Glo,4,5,UN-USED,,,,287


# Anchor

In [61]:
anchor = df_dict['anchor']

anchorSummaries = []
for fname, df in anchor.items():
    summary = DataFrameSummary(
        file=fname,
        shape=df.shape,
        colinfo=[
            ColInfo(
                name=col,
                type=infer_dtype(df[col]),
                nunique=df[col].nunique(),
                first=df[col].iloc[0],
                last=df[col].iloc[-1],
            )
            for col in sorted(df.columns)
        ]
    )
    anchorSummaries.append(summary)
print(anchorSummaries)

# Matrix

In [62]:
matrix = df_dict['matrix']
matrixSummaries = []
for fname, df in matrix.items():
    summary = DataFrameSummary(
        file=fname,
        shape=df.shape,
        colinfo=[
            ColInfo(
                name=col,
                type=infer_dtype(df[col]),
                nunique=df[col].nunique(),
                first=df[col].iloc[0],
                last=df[col].iloc[-1],
            )
            for col in sorted(df.columns)
        ]
    )
    matrixSummaries.append(summary)
print(matrixSummaries)