# Integrative analysis of pathway deregulation in obesity #

## Python implementation

### Extracting gene signature

1. We have to load the dataset produced and look for those genes that are more related to obese people;

In [1]:
### Imports

# Import std libraries
import os
from operator import itemgetter 
import re
import copy

# Import third party
import numpy as np
import scipy as sp
from numpy import *
import pandas as pd
import GEOparse
from scipy.stats import ks_2samp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

# import plotting tools
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches

# Set logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logging.getLogger("GEOparse").setLevel(logging.WARNING)

plot_dir_path = './plots/'
if not os.path.exists(plot_dir_path):
    os.makedirs(plot_dir_path)
    print('created ./path directory')
    
# enable matplotlib inline
%matplotlib inline

In [2]:
df = pd.read_pickle('../8. project-batch-effect-evaluation/data/merged_df_no_batch_effect_project.pkl').T
v = (PCA().fit(df)).components_[0]
#df.head
v_df = pd.DataFrame(columns=["coef","abs_coef"], index = df.columns)
v_df.index.names = ["Entrez_Gene_ID"]
v_df["coef"] = v
v_df["abs_coef"] = np.abs(v)
v_df.sort_values(by="abs_coef",ascending=False,inplace=True)
v_df.drop("abs_coef",axis=1,inplace=True)
v_df.head()

Unnamed: 0_level_0,coef
Entrez_Gene_ID,Unnamed: 1_level_1
3117,0.062199
6192,0.057501
7503,-0.052744
8284,0.047435
5146,0.045797


In [3]:
sigma = 1/np.sqrt(df.shape[1])

idx = np.abs(v_df.coef)>5*sigma
print("genes beyond threshold:", idx.sum())
print("this is the threshold", sigma)

signature = v_df.loc[idx]
signature['coef'] = signature['coef'].abs()
signature = signature.sort_values(['coef'], ascending=True)
print(signature)
print(signature.index)

genes beyond threshold: 14
this is the threshold 0.00760615739485
                    coef
Entrez_Gene_ID          
254773          0.038898
79993           0.039066
735             0.039283
54620           0.040465
167681          0.040514
27289           0.040727
8853            0.041137
8287            0.045354
9086            0.045637
5146            0.045797
8284            0.047435
7503            0.052744
6192            0.057501
3117            0.062199
Index(['254773', '79993', '735', '54620', '167681', '27289', '8853', '8287',
       '9086', '5146', '8284', '7503', '6192', '3117'],
      dtype='object', name='Entrez_Gene_ID')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [4]:
from_paper = [1278, 80763, 761, 219348, 25975, 2014, 6696, 1397, 1490, 22822, 1880, 171024, 1520, 80114, 115207, 151887, 22918, 389136, 8540, 7045, 25878, 2982, 2335, 7076, 5396, 4069, 8076, 3512, 10402, 3429, 83442, 712, 474344, 9457, 8470, 7037, 1291, 57863]
from_paper = [str(x) for x in from_paper]
ours = list(signature.index)
matching = [x for x in from_paper if x in ours]
print(len(matching))
print(matching)

all_our_genes = list(v_df.index)
matching = [x for x in from_paper if x not in all_our_genes]
print(len(matching))
print(matching)
print('1278' in all_our_genes)

0
[]
0
[]
True


In [5]:
# export signature
import json
signature_genes = list(signature.index)
total_genes = list(v_df.index)

with open('data/signature.json', 'w') as outfile:
    json.dump(signature_genes, outfile)
    print("wrote signature in" ,"data/signature.json")
    
with open('data/total_genes.json', 'w') as outfile:
    json.dump(total_genes, outfile)
    print("wrote signature in" ,"data/total_genes.json")
    
signature.to_pickle('data/signature.pkl')
print("wrote signature in signature.pkl")

wrote signature in data/signature.json
wrote signature in data/total_genes.json
wrote signature in signature.pkl
