# Integrative analysis of pathway deregulation in obesity #

## Python implementation

### Extracting gene signature

1. We have to load the dataset produced and look for those genes that are more related to obese people;

In [1]:
### Imports

# Import std libraries
import os
from operator import itemgetter 
import re
import copy

# Import third party
import numpy as np
import scipy as sp
from numpy import *
import pandas as pd
import GEOparse
from scipy.stats import ks_2samp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

# import plotting tools
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches

# Set logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logging.getLogger("GEOparse").setLevel(logging.WARNING)

plot_dir_path = './plots/'
if not os.path.exists(plot_dir_path):
    os.makedirs(plot_dir_path)
    print('created ./path directory')
    
# enable matplotlib inline
%matplotlib inline

In [2]:
df = pd.read_pickle('../2. batch-effect-evaluation/data/merged_df_no_batch_effect.pkl').T
v = (PCA().fit(df)).components_[0]
#df.head
v_df = pd.DataFrame(columns=["coef","abs_coef"], index = df.columns)
v_df.index.names = ["Entrez_Gene_ID"]
v_df["coef"] = v
v_df["abs_coef"] = np.abs(v)
v_df.sort_values(by="abs_coef",ascending=False,inplace=True)
v_df.drop("abs_coef",axis=1,inplace=True)
v_df.head()

Unnamed: 0_level_0,coef
Entrez_Gene_ID,Unnamed: 1_level_1
25975,-0.115041
7305,-0.093346
1880,-0.092337
4069,-0.07745
7076,-0.077426


In [3]:
sigma = 1/np.sqrt(df.shape[1])

idx = np.abs(v_df.coef)>5*sigma
print("genes beyond threshold:", idx.sum())
print("this is the threshold", sigma)

signature = v_df.loc[idx]
signature['coef'] = signature['coef'].abs()
signature = signature.sort_values(['coef'], ascending=True)
print(signature)
print(signature.index)

genes beyond threshold: 63
this is the threshold 0.00804465919403
                    coef
Entrez_Gene_ID          
28959           0.040268
5552            0.040287
10170           0.040523
55790           0.040553
85379           0.040648
10894           0.041157
5396            0.041168
4854            0.041670
4239            0.041875
2246            0.042014
115908          0.042207
1890            0.042481
8839            0.042723
1116            0.042764
1292            0.043251
3290            0.043373
9770            0.043577
7941            0.043853
7262            0.044132
80114           0.044165
83483           0.044174
2124            0.045086
6351            0.045342
4982            0.045913
3576            0.046464
2920            0.046656
2207            0.046714
714             0.046735
7107            0.047105
11326           0.047169
...                  ...
4332            0.048505
929             0.049278
11001           0.050846
1959            0.051589
219972   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [4]:
from_paper = [1278, 80763, 761, 219348, 25975, 2014, 6696, 1397, 1490, 22822, 1880, 171024, 1520, 80114, 115207, 151887, 22918, 389136, 8540, 7045, 25878, 2982, 2335, 7076, 5396, 4069, 8076, 3512, 10402, 3429, 83442, 712, 474344, 9457, 8470, 7037, 1291, 57863]
from_paper = [str(x) for x in from_paper]
ours = list(signature.index)
matching = [x for x in from_paper if x in ours]
print(len(matching))
print(matching)

all_our_genes = list(v_df.index)
matching = [x for x in from_paper if x not in all_our_genes]
print(len(matching))
print(matching)
print('1278' in all_our_genes)

15
['80763', '761', '25975', '2014', '6696', '1490', '1880', '1520', '80114', '25878', '7076', '5396', '4069', '8076', '3512']
0
[]
True


In [6]:
# export signature
import json
signature_genes = list(signature.index)
total_genes = list(v_df.index)

with open('data/signature.json', 'w') as outfile:
    json.dump(signature_genes, outfile)
    print("wrote signature in" ,"data/signature.json")
    
with open('data/total_genes.json', 'w') as outfile:
    json.dump(total_genes, outfile)
    print("wrote signature in" ,"data/total_genes.json")
    
signature.to_pickle('data/signature.pkl')
print("wrote signature in data/signature.pkl")

wrote signature in data/signature.json
wrote signature in data/total_genes.json
wrote signature in signature.pkl
