In [2]:
import os
import numpy as np
import pandas as pd
import slingpy as sp
from sklearn.impute import SimpleImputer
from genedisco.datasets.features.hgnc_names import HGNCNames
from slingpy.data_access.data_sources.hdf5_tools import HDF5Tools
from slingpy.data_access.data_sources.hdf5_data_source import HDF5DataSource
from slingpy.data_access.data_sources.abstract_data_source import AbstractDataSource

class Achilles(object):
    """
    """
    FILE_URL = "https://ndownloader.figshare.com/files/25494359"

    @staticmethod
    def load_data(save_directory) -> AbstractDataSource:
        h5_file = os.path.join(save_directory, "achilles.h5")
        if not os.path.exists(h5_file):
            csv_file_path = os.path.join(save_directory, "achilles.csv")
            if not os.path.exists(csv_file_path):
                sp.download_streamed(Achilles.FILE_URL, csv_file_path)
            df = pd.read_csv(csv_file_path)
            gene_names = list(map(lambda x: x.split(" ")[0], df.columns.values.tolist()[1:]))
            data = df.values[:, 1:].astype(float).transpose()

            si = SimpleImputer(missing_values=float("nan"), strategy='mean')
            data = si.fit_transform(data)

            name_converter = HGNCNames(save_directory)
            gene_names = name_converter.update_outdated_gene_names(gene_names)

            data_df = pd.DataFrame(data)
            data_df.index = gene_names
            data_df = data_df.groupby(data_df.index).mean()
            gene_names, data = data_df.index.values.tolist(), data_df.values.astype(np.float32)

            #gene_names, idx_start = np.unique(sorted(gene_names), return_index=True)
            #data = data[idx_start]
            HDF5Tools.save_h5_file(h5_file,
                                   data,
                                   "achilles",
                                   column_names=df["DepMap_ID"].values.tolist(),
                                   row_names=gene_names)
        data_source = HDF5DataSource(h5_file, fill_missing_value=0)
        return data_source

In [29]:
save_directory = './'

In [30]:
csv_file_path = os.path.join(save_directory, "achilles.csv")
if not os.path.exists(csv_file_path):
    sp.download_streamed(Achilles.FILE_URL, csv_file_path)
df = pd.read_csv(csv_file_path)
gene_names = list(map(lambda x: x.split(" ")[0], df.columns.values.tolist()[1:]))
data = df.values[:, 1:].astype(float).transpose()

si = SimpleImputer(missing_values=float("nan"), strategy='mean')
data = si.fit_transform(data)

name_converter = HGNCNames(save_directory)
gene_names = name_converter.update_outdated_gene_names(gene_names)

data_df = pd.DataFrame(data)
data_df.index = gene_names
data_df = data_df.groupby(data_df.index).mean()
gene_names, data = data_df.index.values.tolist(), data_df.values.astype(np.float32)

#gene_names, idx_start = np.unique(sorted(gene_names), return_index=True)
#data = data[idx_start]

  previous_mapping = self.get_hgnc_mapping(to_id="symbol", from_id=other_id_name)


In [59]:
horlbeck_ground_truth = pd.read_csv('./datasets/ground_truth_Horlbeck.csv')
horlbeck_genes = list(set([x.split('_')[1] for x in horlbeck_ground_truth['combo_name']]))
horlbeck_genes = name_converter.update_outdated_gene_names(horlbeck_genes)
remaining_genes = list(set(horlbeck_genes).intersection(set(gene_names)))

In [139]:
repeated_feats = np.repeat(data_df.values, len(remaining_genes), axis=0)
concatenated_feats = np.vstack([data_df.values] * len(remaining_genes))
concat_achilles = np.concatenate([repeated_feats, concatenated_feats], 1)

In [140]:
repeated_names = np.repeat(remaining_genes, len(remaining_genes), axis=0)
concatenated_names = np.vstack(remaining_genes * len(remaining_genes)).flatten()
concat_achilles_names = [x+'_'+y for x,y, in zip(repeated_names, concatenated_names)]

In [142]:
concat_achilles_df = pd.DataFrame(concat_achilles)
concat_achilles_df.index = concat_achilles_names

In [143]:
concat_achilles_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1606,1607,1608,1609,1610,1611,1612,1613,1614,1615
TIMELESS_TIMELESS,-1.676750,-1.755036,-1.086709,-1.498683,-1.702024,-1.586559,-1.644320,-1.682837,-1.482841,-1.502620,...,-1.652289,-1.591765,-1.685834,-1.577719,-1.560127,-1.605395,-1.336947,-1.421868,-1.481827,-1.445924
TIMELESS_PSMD1,-1.676750,-1.755036,-1.086709,-1.498683,-1.702024,-1.586559,-1.644320,-1.682837,-1.482841,-1.502620,...,-1.237494,-1.589435,-1.395879,-1.361787,-1.041439,-1.183271,-1.648483,-1.214548,-0.981663,-1.006214
TIMELESS_CIT,-1.676750,-1.755036,-1.086709,-1.498683,-1.702024,-1.586559,-1.644320,-1.682837,-1.482841,-1.502620,...,-0.493536,-0.330046,-0.636183,-0.617706,-0.216680,-0.060172,-0.534520,-0.213121,-0.419051,-0.287131
TIMELESS_GTPBP4,-1.676750,-1.755036,-1.086709,-1.498683,-1.702024,-1.586559,-1.644320,-1.682837,-1.482841,-1.502620,...,-0.498954,-0.553504,-0.661295,-0.509407,-0.504673,-0.472142,-0.612440,-0.502645,-0.468759,-0.853013
TIMELESS_PPWD1,-1.676750,-1.755036,-1.086709,-1.498683,-1.702024,-1.586559,-1.644320,-1.682837,-1.482841,-1.502620,...,-1.537695,-1.498974,-1.862613,-1.901116,-1.424200,-1.401349,-0.786863,-1.514930,-1.894839,-1.662051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TBCB_POLA1,-1.445611,-1.160493,-1.633513,-1.465785,-1.817099,-1.219197,-1.612268,-1.400621,-1.837799,-1.120125,...,-0.999466,-0.772550,-0.487581,-1.262614,-0.961971,-1.029421,-1.125234,-0.861272,-1.187816,-0.971920
TBCB_EIF3H,-1.445611,-1.160493,-1.633513,-1.465785,-1.817099,-1.219197,-1.612268,-1.400621,-1.837799,-1.120125,...,-1.020902,-1.045328,-0.646112,-0.900826,-1.109268,-1.190479,-1.533329,-1.232139,-0.776394,-1.359350
TBCB_PTTG1,-1.445611,-1.160493,-1.633513,-1.465785,-1.817099,-1.219197,-1.612268,-1.400621,-1.837799,-1.120125,...,-0.375804,-0.422970,-0.612225,-0.458244,-0.628930,-0.658478,-0.515096,-0.686376,-0.571228,-0.510597
TBCB_NAA50,-1.445611,-1.160493,-1.633513,-1.465785,-1.817099,-1.219197,-1.612268,-1.400621,-1.837799,-1.120125,...,-1.312454,-0.142466,-1.048362,-1.403249,-1.430489,-1.245565,-0.912174,-1.289818,-1.620808,-1.300129
