# Exploration on the annotation to set up the samples of interest for downstream analysis

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
df = pd.read_csv('/mnt/dzl_bioinf/binliu/EMBL_ExpressionArray/arrayExpress_annotation.csv', sep='\t')

In [4]:
df.columns

Index(['Source Name', 'Material Type', 'Characteristics[organism]',
       'Characteristics[cell type]', 'Characteristics[cell line]',
       'Characteristics[tissue supergroup]', 'Characteristics[organism part]',
       'Characteristics[metastatic tissue]', 'Characteristics[disease]',
       'Characteristics[clinical history]', 'Characteristics[compound]',
       'Characteristics[growth condition]', 'Characteristics[stimulus]',
       'Characteristics[genotype]', 'Characteristics[RNAi]',
       'Characteristics[phenotype]', 'Labeled Extract Name', 'Label',
       'Assay Name', 'Technology Type', 'Comment [ArrayExpress accession]',
       'Derived Array Data File', 'Comment [Derived ArrayExpress FTP file]',
       'Factor Value[disease]', 'Factor Value[cell type]',
       'Factor Value[cell line]', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28',
       'Unnamed: 29'],
      dtype='object')

In [5]:
df.head(3)

Unnamed: 0,Source Name,Material Type,Characteristics[organism],Characteristics[cell type],Characteristics[cell line],Characteristics[tissue supergroup],Characteristics[organism part],Characteristics[metastatic tissue],Characteristics[disease],Characteristics[clinical history],...,Comment [ArrayExpress accession],Derived Array Data File,Comment [Derived ArrayExpress FTP file],Factor Value[disease],Factor Value[cell type],Factor Value[cell line],Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29
0,Sample 1,cell,Homo sapiens,lung cancer cell line,RERF-LC-KJ,lung,lung,not applicable,lung cancer,,...,E-GEOD-10843,processedMatrix.Aurora.july2015.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,lung cancer,lung cancer cell line,RERF-LC-KJ,,,,
1,Sample 2,cell,Homo sapiens,ovarian cancer cell line,FU-OV-1,ovary,ovary,not applicable,ovary carcinoma,,...,E-GEOD-10843,processedMatrix.Aurora.july2015.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,ovary carcinoma,ovarian cancer cell line,FU-OV-1,,,,
2,Sample 3,cell,Homo sapiens,colonic cancer cell line,COLO-205,colorectal,colon,ascites,colon adenocarcinoma,,...,E-GEOD-10843,processedMatrix.Aurora.july2015.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,colon adenocarcinoma,colonic cancer cell line,COLO-205,,,,


## Sorting the organ and tissue information

In [11]:
cell_type_sorted = Counter(df['Characteristics[cell type]']).most_common()
cell_type_sorted

[('  ', 13104),
 ('mononuclear cell', 1507),
 ('breast cancer cell line', 673),
 ('epithelial cell', 435),
 ('breast tumor cell', 401),
 ('lymphoblastoid cell line', 395),
 ('hepatocellular carcinoma cell line', 355),
 ('EBV-transformed lymphoblastoid cell line', 348),
 ('airway epithelial cell', 300),
 ('blasts and mononuclear cells', 297),
 ('cervical carcinoma cell line', 247),
 ('leukocyte', 236),
 ('huvec', 223),
 ('B-cell derived cell line', 218),
 ('lung adenocarcinoma cell line', 208),
 ('CD138-selected cells', 201),
 ('neuroblastoma cell line', 194),
 ('colorectal cancer cell line', 190),
 ('fibroblast', 187),
 ('CD138+ plasma cell', 182),
 ('embryonic kidney derived cell line', 173),
 ('endometrial adenocarcinoma cell line', 168),
 ('monocyte', 155),
 ('mesenchymal stem cell', 147),
 ('monocyte-derived dendritic cell', 134),
 ('macrophage', 133),
 ('lymphocyte', 115),
 ('T-ALL cell', 114),
 ('CD138-purified plasma cells', 114),
 ('smooth muscle cell', 101),
 ('dendritic cell'

In [12]:
disease_status_sorted = Counter(df['Characteristics[disease]']).most_common()
disease_status_sorted

[('normal', 9423),
 ('acute myeloid leukaemia', 974),
 ('chronic lymphocytic leukaemia', 757),
 ('breast cancer', 717),
 ('breast adenocarcinoma', 567),
 ('multiple myeloma', 532),
 ('diffuse large B-cell lymphoma', 460),
 ('colorectal adenocarcinoma', 366),
 ('hepatocellular carcinoma', 362),
 ('ovarian carcinoma', 331),
 ('melanoma', 320),
 ('neuroblastoma', 282),
 ('lung adenocarcinoma', 273),
 ('gastric cancer', 260),
 ('Pre-B-ALL/c-ALL', 253),
 ('cervical adenocarcinoma', 238),
 ('multiple sclerosis or clinically isolated syndrome', 237),
 ('asthma', 234),
 ('T-ALL', 230),
 ('periodontitis', 226),
 ('NSCLC', 219),
 ('glioblastoma', 214),
 ('myeloma', 208),
 ('obesity', 205),
 ('no rejection', 203),
 ('myelodysplastic syndrome', 190),
 ('acute myeloid leukaemia ', 181),
 ('colon carcinoma', 179),
 ('endometrial adenocarcinoma', 168),
 ('ALL', 163),
 ('septic shock', 163),
 ('pancreatic cancer', 152),
 ('AML/other abnormalities', 140),
 ('lung cancer', 136),
 ('undifferentiated sarc

In [13]:
Counter(df['Characteristics[cell line]']).most_common()

[('  ', 21656),
 ('MCF-7', 347),
 ('HeLa', 231),
 ('HepG2', 210),
 ('HCT-116', 173),
 ('A549', 167),
 ('MDA-MB-231', 145),
 ('Ishikawa', 144),
 ('Huh7', 118),
 ('HEK-293', 99),
 ('THP-1', 82),
 ('LNCaP', 79),
 ('BEAS-2B', 75),
 ('MCF-10A', 74),
 ('A2780', 67),
 ('MSK-Leuk1', 60),
 ('DU145', 55),
 ('hBEC', 54),
 ('ARPE-19', 49),
 ('U937', 48),
 ('RPMI-8226', 45),
 ('H9', 42),
 ('SH-SY5Y-E', 42),
 ('HT-29', 41),
 ('EREB2-5', 41),
 ('Jurkat', 40),
 ('HEK-293T', 39),
 ('A375', 38),
 ('HT1080', 37),
 ('Caco-2', 37),
 ('U2-OS', 36),
 ('K562', 34),
 ('U251', 33),
 ('SK-N-MC', 33),
 ('HepaRG', 33),
 ('SW620', 32),
 ('NCI-H1299', 26),
 ('BE(2)-C', 26),
 ('H295R', 25),
 ('SH-SY5Y-A', 24),
 ('HK-2', 24),
 ('Sez-4', 24),
 ('SH-SY5Y', 23),
 ('RWPE1', 23),
 ('HL60', 23),
 ('Saos-2', 22),
 ('PC-3', 22),
 ('IMR-32', 22),
 ('IMR90', 22),
 ('MonoMac 6', 21),
 ('SK-N-SH', 21),
 ('P493-6', 21),
 ('BT-474', 19),
 ('Calu-3 subclone 2B4', 19),
 ('HEK-TLR2', 18),
 ('CEM', 17),
 ('SAOS', 16),
 ('T47D', 16),
 (

In [14]:
Counter(df['Characteristics[tissue supergroup]']).most_common()

[('  ', 19200),
 ('lymphoid', 1205),
 ('breast', 784),
 ('colorectal', 750),
 ('lung', 515),
 ('adipose tissue', 450),
 ('liver', 415),
 ('brain', 387),
 ('gastric tissue', 289),
 ('skin', 287),
 ('cervix', 248),
 ('vastus lateralis', 246),
 ('lymphoblastoid', 241),
 ('prostate', 241),
 ('peripheral blood', 202),
 ('endometrium', 181),
 ('embryonic kidney', 174),
 ('uterus', 156),
 ('ovary', 146),
 ('colonic mucosa', 140),
 ('pancreas', 94),
 ('subcutaneous adipose tissue', 82),
 ('lung/bronchus', 75),
 ('pleura', 70),
 ('stomach', 64),
 ('oral cavity', 60),
 ('bone', 56),
 ('bronchus', 54),
 ('mammary gland', 53),
 ('retina', 49),
 ('breast tumor', 42),
 ('ileal mucosa', 40),
 ('kidney', 36),
 ('sarcomatiod', 36),
 ('pulmonary MALT lymphoma tissue', 32),
 ('pancreatic islet', 32),
 ('gastrointestinal tissue', 32),
 ('bronchial epithelium', 28),
 ('epidermis', 25),
 ('adrenal gland/cortex', 25),
 ('neuroblastic tumor', 24),
 ('cortex/proximal tubule', 24),
 ('head and neck', 22),
 ('bl

In [20]:
df.index = df['Source Name']

In [21]:
df.index

Index(['Sample 1', 'Sample 2', 'Sample 3', 'Sample 4', 'Sample 5', 'Sample 6',
       'Sample 7', 'Sample 8', 'Sample 9', 'Sample 10',
       ...
       'Sample 27878', 'Sample 27879', 'Sample 27880', 'Sample 27881',
       'Sample 27882', 'Sample 27883', 'Sample 27884', 'Sample 27885',
       'Sample 27886', 'Sample 27887'],
      dtype='object', name='Source Name', length=27887)

## Check the sampling situation in the test data

A few good candidates are: acute myeloid leukaemia, breast cancer (partcularly breast adenocarcinoma) or other adenocarcinoma type. The next task is to figure out whether some experiments have more samples and how the sample look like in only test data sets. 

In [15]:
test_df = pd.read_pickle('/mnt/dzl_bioinf/binliu/deepRNA/data_all_samples/gene_level_train_test/all_samples_gene_level_test.pkl')

In [18]:
test_df.index

Index(['Sample 1633', 'Sample 11588', 'Sample 11771', 'Sample 8189',
       'Sample 25499', 'Sample 12023', 'Sample 13689', 'Sample 10572',
       'Sample 3420', 'Sample 2882',
       ...
       'Sample 6154', 'Sample 14011', 'Sample 27307', 'Sample 1905',
       'Sample 20540', 'Sample 19970', 'Sample 10688', 'Sample 25562',
       'Sample 23709', 'Sample 22715'],
      dtype='object', name='CompositeSequence Identifier', length=2787)

In [24]:
annotation_test_sub = df[df.index.isin(test_df.index)]

In [25]:
annotation_test_sub.shape

(2787, 30)

In [26]:
df.shape

(27887, 30)

In [27]:
test_df.shape

(2787, 23375)

In [28]:
Counter(annotation_test_sub['Characteristics[disease]']).most_common()

[('normal', 904),
 ('acute myeloid leukaemia', 105),
 ('chronic lymphocytic leukaemia', 78),
 ('breast cancer', 73),
 ('multiple myeloma', 52),
 ('diffuse large B-cell lymphoma', 51),
 ('breast adenocarcinoma', 49),
 ('ovarian carcinoma', 39),
 ('colorectal adenocarcinoma', 38),
 ('hepatocellular carcinoma', 37),
 ('neuroblastoma', 34),
 ('gastric cancer', 33),
 ('melanoma', 33),
 ('T-ALL', 29),
 ('lung adenocarcinoma', 29),
 ('multiple sclerosis or clinically isolated syndrome', 26),
 ('obesity', 26),
 ('Pre-B-ALL/c-ALL', 25),
 ('cervical adenocarcinoma', 23),
 ('asthma', 23),
 ('NSCLC', 22),
 ('myeloma', 22),
 ('glioblastoma', 21),
 ('periodontitis', 20),
 ('myelodysplastic syndrome', 19),
 ('primary breast tumor', 17),
 ('endometrial adenocarcinoma', 17),
 ('colon carcinoma', 16),
 ('lung cancer', 16),
 ('colon adenocarcinoma', 16),
 ('acute myeloid leukaemia ', 16),
 ('no rejection', 16),
 ('pancreatic cancer', 15),
 ('carcinoma', 14),
 ('allograft rejection', 14),
 ('acute monocyt

In [29]:
Counter(annotation_test_sub['Characteristics[tissue supergroup]']).most_common()

[('  ', 1900),
 ('lymphoid', 107),
 ('colorectal', 83),
 ('breast', 80),
 ('lung', 51),
 ('adipose tissue', 43),
 ('liver', 42),
 ('brain', 39),
 ('gastric tissue', 31),
 ('vastus lateralis', 28),
 ('skin', 26),
 ('cervix', 25),
 ('lymphoblastoid', 24),
 ('prostate', 24),
 ('colonic mucosa', 22),
 ('endometrium', 18),
 ('peripheral blood', 17),
 ('uterus', 16),
 ('embryonic kidney', 15),
 ('ovary', 14),
 ('pancreas', 11),
 ('subcutaneous adipose tissue', 10),
 ('stomach', 8),
 ('lung/bronchus', 8),
 ('pleura', 7),
 ('bronchus', 7),
 ('bone', 6),
 ('kidney', 6),
 ('oral cavity', 6),
 ('mammary gland', 5),
 ('cortex/proximal tubule', 5),
 ('breast tumor', 5),
 ('retina', 5),
 ('pancreatic islet', 4),
 ('adrenal gland/cortex', 4),
 ('cerebellum', 3),
 ('epidermis', 3),
 ('gastrointestinal tissue', 3),
 ('primary clear-cell renal cell carcinoma', 3),
 ('bronchial epithelium', 3),
 ('ileal mucosa', 3),
 ('blood', 3),
 ('esophagus', 2),
 ('esophageal epithelium', 2),
 ('central nervous syste

## Check the number of samples and select those with the highest number of samples

In [31]:
Counter(df['Comment [ArrayExpress accession]']).most_common()

[('E-GEOD-13159', 1252),
 ('array assay', 686),
 ('E-GEOD-7307', 461),
 ('E-GEOD-15061', 400),
 ('E-GEOD-8052', 317),
 ('E-GEOD-14468', 297),
 ('E-GEOD-9891', 284),
 ('E-GEOD-21374', 277),
 ('E-GEOD-13070', 269),
 ('E-GEOD-23120', 241),
 ('E-GEOD-21050', 239),
 ('E-GEOD-16214', 237),
 ('E-GEOD-3526', 234),
 ('E-TABM-325', 233),
 ('E-GEOD-21653', 228),
 ('E-GEOD-16134', 223),
 ('E-GEOD-19519', 222),
 ('E-GEOD-14333', 214),
 ('E-MTAB-54', 203),
 ('E-GEOD-26863', 201),
 ('E-GEOD-12276', 199),
 ('E-GEOD-11318', 195),
 ('E-GEOD-11135', 184),
 ('E-GEOD-15459', 182),
 ('E-GEOD-19784', 182),
 ('E-GEOD-17855', 181),
 ('E-GEOD-11882', 166),
 ('E-GEOD-11375', 150),
 ('E-GEOD-10843', 144),
 ('E-GEOD-10846', 135),
 ('E-GEOD-10780', 132),
 ('E-TABM-176', 131),
 ('E-GEOD-16879', 130),
 ('E-GEOD-23177', 116),
 ('E-GEOD-19743', 115),
 ('E-GEOD-19188', 114),
 ('E-MTAB-317', 114),
 ('E-TABM-1029', 114),
 ('E-GEOD-15396', 108),
 ('E-GEOD-24223', 106),
 ('E-GEOD-10890', 104),
 ('E-GEOD-8507', 104),
 ('E-GE

In [32]:
Counter(annotation_test_sub['Comment [ArrayExpress accession]']).most_common()

[('E-GEOD-13159', 119),
 ('array assay', 66),
 ('E-GEOD-7307', 43),
 ('E-GEOD-15061', 42),
 ('E-GEOD-8052', 37),
 ('E-GEOD-14468', 34),
 ('E-GEOD-9891', 33),
 ('E-GEOD-13070', 31),
 ('E-GEOD-21374', 30),
 ('E-GEOD-16214', 26),
 ('E-MTAB-54', 25),
 ('E-GEOD-23120', 24),
 ('E-GEOD-14333', 24),
 ('E-GEOD-21050', 24),
 ('E-GEOD-3526', 23),
 ('E-GEOD-15459', 23),
 ('E-GEOD-10843', 22),
 ('E-GEOD-11318', 22),
 ('E-GEOD-19784', 21),
 ('E-GEOD-11135', 21),
 ('E-GEOD-17855', 21),
 ('E-GEOD-16134', 20),
 ('E-GEOD-12276', 20),
 ('E-GEOD-16879', 18),
 ('E-GEOD-26863', 18),
 ('E-TABM-325', 18),
 ('E-GEOD-19519', 17),
 ('E-GEOD-23177', 17),
 ('E-GEOD-21653', 17),
 ('E-GEOD-11882', 15),
 ('E-GEOD-11375', 15),
 ('E-TABM-176', 13),
 ('E-GEOD-15935', 13),
 ('E-GEOD-10780', 13),
 ('E-GEOD-10846', 13),
 ('E-GEOD-16237', 13),
 ('E-GEOD-19188', 12),
 ('E-GEOD-19743', 12),
 ('E-GEOD-19475', 11),
 ('E-GEOD-24223', 11),
 ('E-GEOD-8507', 11),
 ('E-TABM-763', 11),
 ('E-GEOD-11869', 11),
 ('E-TABM-1029', 11),
 ('

Check whether some of the samples happen to overlap between the diseases/ organs of interest and have a reasonable number of samples in a good data set

In [33]:
test_anno = Counter(annotation_test_sub['Comment [ArrayExpress accession]']).most_common()

In [44]:
for sample_nr in test_anno:
    tem_sub1 = annotation_test_sub[annotation_test_sub['Comment [ArrayExpress accession]'] == sample_nr[0]]
    print(sample_nr[0] + ', ' + str(Counter(tem_sub1['Characteristics[disease]'])) \
        + '\n' + str(Counter(tem_sub1['Characteristics[organism part]'])))

E-GEOD-13159, Counter({'chronic lymphocytic leukaemia': 42, 'Pre-B-ALL/c-ALL': 25, 'T-ALL': 16, 'AML/other abnormalities': 10, 'acute myeloid leukaemia': 9, 'chronic myeloid leukaemia': 6, 'myelodysplastic syndrome': 3, 'ALL': 3, 'AML/MLL': 3, 'Pro-B-ALL/MLL': 2})
Counter({'bone marrow': 119})
array assay, Counter({'normal': 44, 'acute myeloid leukaemia': 6, 'lung adenocarcinoma': 2, 'hepatitis C virus-induced hepatocellular carcinoma': 2, 'myelodisplastic syndrome': 1, 'ALL with t(12': 1, 'AML with t 11q23': 1, "Crohn's disease": 1, 'breast ductal adenocarcinoma': 1, 'not applicable': 1, 'ejaculatory azoospermia/idiopathic infertility': 1, 'rheumatoid arthritis': 1, 'HCV': 1, 'Pro-B-ALL with t 11q23': 1, '-': 1, '  ': 1})
Counter({'lung': 14, 'gingival papillae': 9, '  ': 9, 'skin': 9, 'liver': 5, 'breast': 4, 'bone marrow': 3, 'prostate': 2, 'colon': 2, 'pancreas': 2, 'blood': 1, 'descending colon': 1, 'mammary epithelium': 1, 'testis': 1, 'stomach': 1, 'knee': 1, 'bone': 1})
E-GEOD-

The second selected disease will be acute myeloid leukaemia since there is a groupd of very good candidate in the test data set with reasonable numbers and good experimental design (bone marrow, normal versus disease). We can also add one more group to compare between breast adenocarcinoma and lung adenocarcinoma -- E-GEOD-15061