In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy.stats import linregress,ks_2samp

In [2]:
# Gene signature table (output of the signature-extraction step)
signature = pd.read_pickle('../3. signature-extraction/data/signature.pkl')

# Metadata of the samples
# NOTE(review): pickle executes arbitrary code on load; only open files from a trusted source
with open('./data/metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Only these batches come with a bmi value; load one table per batch, keeping this order
bmi_batch_ids = ['GSE62117', 'GSE64567', 'GSE33526']
batches = [
    pd.read_pickle('data/' + batch_id + '_table.pkl')
    for batch_id in bmi_batch_ids
]

In [3]:
# Preview the first five rows of the signature dataframe
signature.head(n=5)

Unnamed: 0_level_0,coef
Entrez_Gene_ID,Unnamed: 1_level_1
28959,0.040268
5552,0.040287
10170,0.040523
55790,0.040553
85379,0.040648


In [4]:
def compute_score(batch_gene_data, signture):
    """
    Compute the mean-centred BMI score of every sample in a batch.

    The raw score of a sample is the dot product between its expression
    values and the signature coefficients, restricted to the genes present
    in both tables. The batch mean of the raw scores is then subtracted.

    Parameters
    ----------
    batch_gene_data : pandas.DataFrame
        Table indexed by gene ID with one column per sample.
    signture : pandas.DataFrame
        Signature table indexed by gene ID with a ``coef`` column.
        (The misspelt parameter name is kept so that existing callers,
        including keyword callers, keep working.)

    Returns
    -------
    pandas.Series
        One centred score per column of ``batch_gene_data``.
    """
    # Use the argument that was passed in; the previous body ignored it and
    # read a notebook-global `signature` instead.
    signature = signture

    genes_batch = np.array(batch_gene_data.index)
    gene_signature = np.array(signature.index)
    # Only genes present in both the batch and the signature contribute
    common_genes = np.intersect1d(genes_batch, gene_signature)

    # (samples x genes) . (genes,) -> one raw score per sample
    raw_score = np.transpose(batch_gene_data.loc[common_genes]).dot(
        signature.loc[common_genes].coef
    )
    return raw_score - np.mean(raw_score)

# One bmi score vector per batch, in the same order as `batches`
scores = [compute_score(batch_table, signature) for batch_table in batches]

# Display the third batch table
batches[2]

Unnamed: 0_level_0,GSM775445_OU,GSM775455_LU,GSM775435_OU,GSM775475_OU,GSM775428_OU,GSM775426_OU,GSM775423_OU,GSM775454_OU,GSM775422_OU,GSM775485_OU,...,GSM775444_OU,GSM775450_OU,GSM775488_OU,GSM775477_LU,GSM775437_OU,GSM775467_OU,GSM775433_LU,GSM775419_OU,GSM775473_OU,GSM775483_OU
ENTREZ_GENE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-15.037663,-15.891283,-15.788109,-18.080490,-16.513182,-15.597980,-16.518187,-16.759702,-15.743172,-17.397168,...,-16.441248,-16.831866,-15.810035,-16.295250,-17.223260,-16.469726,-15.454425,-15.300830,-17.189104,-15.974414
10,-15.389902,-16.110594,-14.758826,-16.262336,-14.981221,-14.835586,-14.822532,-14.991786,-14.844764,-15.350858,...,-14.871294,-15.989888,-15.612403,-14.639043,-15.897120,-17.058316,-14.777037,-15.489032,-15.557788,-14.800907
100,-17.956902,-19.056594,-16.467826,-17.467336,-17.694221,-16.988586,-17.132532,-17.745786,-17.497764,-18.580858,...,-17.759294,-17.188888,-18.102403,-18.339043,-17.647120,-17.416316,-17.074037,-17.825032,-17.552788,-17.847907
1000,-19.132902,-20.290594,-17.744826,-19.994336,-20.818221,-17.720586,-18.483532,-19.977786,-15.325764,-20.870858,...,-20.161294,-19.942888,-20.164403,-19.210043,-19.392120,-18.603316,-18.202037,-18.995032,-20.189788,-20.535907
10000,-14.903765,-14.180285,-14.345730,-15.188489,-15.206197,-14.548398,-14.774772,-15.320183,-14.912813,-15.261826,...,-14.610311,-14.651577,-15.100333,-14.933026,-15.461638,-16.236971,-15.415903,-13.771033,-15.085740,-14.465935
100009676,-15.106218,-15.876608,-14.389929,-15.428304,-14.190003,-15.944186,-15.218537,-15.623811,-13.021958,-15.697971,...,-16.017700,-15.826841,-15.592605,-15.744234,-15.549956,-14.468157,-15.431802,-16.059815,-15.091452,-15.654010
10001,-16.214464,-15.432149,-15.298579,-16.250367,-16.078083,-15.903036,-16.129525,-15.896308,-13.606835,-16.124667,...,-15.537850,-15.982990,-15.756719,-15.781316,-15.995025,-16.456273,-15.666007,-15.530390,-15.960960,-15.273815
10002,-14.469272,-14.180377,-14.667572,-17.316423,-15.717379,-14.798461,-15.746101,-15.366322,-14.286898,-16.840947,...,-16.572051,-15.508293,-15.195107,-14.762056,-16.741266,-16.438168,-14.680434,-15.169500,-16.918140,-15.996005
10003,-14.461794,-16.974420,-12.778982,-12.057457,-14.290212,-14.835904,-14.415881,-13.737718,-17.412452,-14.004714,...,-14.863262,-14.509593,-14.803860,-14.223707,-14.054421,-16.230907,-15.318727,-14.925732,-14.346413,-14.503179
100037280,-16.045902,-15.528594,-14.823826,-15.418336,-15.347221,-15.728586,-15.421532,-15.596786,-13.737764,-15.575858,...,-16.065294,-16.267888,-15.921403,-14.608043,-16.122120,-15.995316,-15.493037,-16.314032,-15.496788,-15.439907


In [5]:
# For each batch, regress the signature score on the recorded bmi and print
# the p-value and correlation coefficient of the fit
for i, (batch, score) in enumerate(zip(batches, scores)):
    print("=== result for batch number === ", i)
    # The part of each column label before the first '_' is the key into metadata
    bmis = [metadata[sample.split('_')[0]]['bmi'] for sample in batch.columns]
    # Fit once and reuse the result for both statistics
    fit = linregress(bmis, score)
    print("p=", fit.pvalue)
    print("R=", fit.rvalue)

=== result for batch number ===  0
p= 1.19809188651e-08
R= 0.527772528922
=== result for batch number ===  1
p= 3.45397633342e-05
R= 0.493264174659
=== result for batch number ===  2
p= 0.0491217134655
R= 0.232754306838
