<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

# Tools&Functions

In [2]:
import pandas as pd
import numpy as np
import pickle
import os

from itertools import product
from sklearn.preprocessing import normalize
from scipy.stats import ttest_ind

In [3]:
os.chdir('./data/')

In [4]:
def make_t_table(data, rows_normalized = False, cols_standardized = False):
    
    #row normalization
    if rows_normalized == True:
        data_normalized = normalize(data, axis = 1, norm = 'l1')
        
    elif rows_normalized == False:
        data_normalized = data.values
    
    #column standardization
    if cols_standardized == True:
            sample_mean = data_normalized.mean(axis=0)                                                                                                                                                                 
            sample_std = data_normalized.std(axis=0) 
            data_normalized = (data_normalized - sample_mean) / sample_std
    
    # Make overall dataframe    
    df = pd.DataFrame(data_normalized, columns = kmer_names)
    df['Status'] = labels
    
    # Get top 20 most informative kmers    
    imp_feats_sub = imp_feats_df.head(n=20)
    feats = imp_feats_sub['Kmer'].tolist()
    df_impt = pd.DataFrame(df, columns = feats)
    df_impt['Status'] = labels
        
    # Split into healthy and diseased dataframes
    df_healthy = df_impt.loc[df_impt['Status'] == 0.0]
    df_disease = df_impt.loc[df_impt['Status'] == 1.0]

    # Get means
    healthy_means = df_healthy.iloc[:,:20].mean()
    disease_means = df_disease.iloc[:,:20].mean()

    # Get t statistic and p value
    healthy_data = df_healthy.iloc[:,:20]
    disease_data = df_disease.iloc[:,:20]
    t, p = ttest_ind(healthy_data, disease_data)
    
    # Make the table
    summaries = {'Healthy': healthy_means, 'Disease': disease_means, 'Tstat': t, 'Pvalue': p, 'Kmers': feats}
    df_summ = pd.DataFrame(data=summaries)
    df_summ = df_summ[['Kmers', 'Healthy', 'Disease', 'Tstat', 'Pvalue']]
    
    return df_summ

In [5]:
# List of 10mer names
kmer_names = []
with open("10mer_dictionary") as text:
    for line in text:
        line = line.rstrip("\n")
        kmer_names.append(line)

# MetaHIT

## Loading and prepping data

In [6]:
# Most important features
imp_feats = pd.read_csv('metahit_important_features.txt', sep = '\t')
imp_feats = imp_feats.iloc[:,0:2]
imp_feats_df = imp_feats.rename(index=str, columns={"Importances": "Kmer", "for": "Score"})
imp_feats_df.head()

Unnamed: 0,Kmer,Score
0,AATGGAAAGG,0.000895
1,CCTCTTTCAG,0.00077
2,CTACAAAAAG,0.000724
3,ATGGAAAGGA,0.000689
4,ACCAAAGCGT,0.00066


In [7]:
# Loaded kmer abundances
kmers = pd.read_pickle("metahit10mers.pickle")

# Disease labels
labels = pd.read_csv("metahit10mers.csv")

## T stats

In [8]:
make_t_table(kmers)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,8200.735849,7600.1875,0.536748,0.591904
CCTCTTTCAG,CCTCTTTCAG,8115.051887,8328.375,-0.376479,0.70687
CTACAAAAAG,CTACAAAAAG,6016.353774,6238.083333,-0.54013,0.589573
ATGGAAAGGA,ATGGAAAGGA,9396.957547,8966.104167,0.406446,0.684752
ACCAAAGCGT,ACCAAAGCGT,2463.377358,2409.5625,0.202448,0.839726
TCCTCCAAAA,TCCTCCAAAA,7248.429245,7553.145833,-0.668595,0.504351
CCTTTTGGAG,CCTTTTGGAG,3929.740566,3693.104167,0.442965,0.658162
ATTCCTTTCC,ATTCCTTTCC,7890.872642,7723.479167,0.193023,0.847093
TGGAAGGGAA,TGGAAGGGAA,4832.589623,4855.791667,-0.062371,0.950316
ACTCCATTCC,ACTCCATTCC,3772.783019,3115.208333,0.75843,0.448886


In [9]:
make_t_table(kmers, rows_normalized=True)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,6.410194e-06,5.553354e-06,0.877938,0.380794
CCTCTTTCAG,CCTCTTTCAG,6.140742e-06,6.045576e-06,0.349511,0.726991
CTACAAAAAG,CTACAAAAAG,4.570736e-06,4.517308e-06,0.196626,0.844275
ATGGAAAGGA,ATGGAAAGGA,7.285265e-06,6.514351e-06,0.824673,0.410319
ACCAAAGCGT,ACCAAAGCGT,1.86557e-06,1.779243e-06,0.536962,0.591756
TCCTCCAAAA,TCCTCCAAAA,5.474089e-06,5.482569e-06,-0.039923,0.968186
CCTTTTGGAG,CCTTTTGGAG,2.982488e-06,2.715019e-06,0.787017,0.431994
ATTCCTTTCC,ATTCCTTTCC,6.114499e-06,5.583687e-06,0.666348,0.505784
TGGAAGGGAA,TGGAAGGGAA,3.748292e-06,3.507865e-06,0.746777,0.455878
ACTCCATTCC,ACTCCATTCC,3.023029e-06,2.265584e-06,0.865524,0.387555


In [10]:
make_t_table(kmers, rows_normalized=True, cols_standardized=True)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,0.026047,-0.11489,0.877938,0.380794
CCTCTTTCAG,CCTCTTTCAG,0.012627,-0.043514,0.349511,0.726991
CTACAAAAAG,CTACAAAAAG,0.006068,-0.025541,0.196626,0.844275
ATGGAAAGGA,ATGGAAAGGA,0.024409,-0.108,0.824673,0.410319
ACCAAAGCGT,ACCAAAGCGT,0.015785,-0.070494,0.536962,0.591756
TCCTCCAAAA,TCCTCCAAAA,-0.004105,0.002306,-0.039923,0.968186
CCTTTTGGAG,CCTTTTGGAG,0.022609,-0.103761,0.787017,0.431994
ATTCCTTTCC,ATTCCTTTCC,0.020294,-0.086739,0.666348,0.505784
TGGAAGGGAA,TGGAAGGGAA,0.022391,-0.097539,0.746777,0.455878
ACTCCATTCC,ACTCCATTCC,0.026231,-0.112712,0.865524,0.387555


# Karlsson

## Loading and prepping data

In [11]:
# Most important features
imp_feats = pd.read_csv('importances_karlsson_linear_best_model.txt', sep = '\t', header = None)

# Loaded kmer abundances
kmers = pd.read_pickle("karlsson10mers.pickle")

# Disease labels
labels = pd.read_csv("karlsson10mers.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## T stats

In [12]:
make_t_table(kmers)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,12958.857143,11508.603774,1.450355,0.150324
CCTCTTTCAG,CCTCTTTCAG,14917.571429,14377.641509,0.418144,0.676806
CTACAAAAAG,CTACAAAAAG,10273.666667,10327.415094,-0.0522,0.958482
ATGGAAAGGA,ATGGAAAGGA,14264.785714,13636.924528,0.52633,0.599912
ACCAAAGCGT,ACCAAAGCGT,3851.452381,3689.377358,0.498865,0.619052
TCCTCCAAAA,TCCTCCAAAA,13305.404762,13430.301887,-0.101906,0.919051
CCTTTTGGAG,CCTTTTGGAG,7536.52381,6356.45283,1.766672,0.080564
ATTCCTTTCC,ATTCCTTTCC,11967.166667,11835.132075,0.127283,0.898991
TGGAAGGGAA,TGGAAGGGAA,8883.309524,8527.433962,0.471409,0.638453
ACTCCATTCC,ACTCCATTCC,3928.452381,4006.226415,-0.206026,0.837221


In [13]:
make_t_table(kmers, rows_normalized=True)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,4.473514e-06,4.41793e-06,0.418637,0.676447
CCTCTTTCAG,CCTCTTTCAG,5.215417e-06,5.546206e-06,-1.325328,0.188309
CTACAAAAAG,CTACAAAAAG,3.691305e-06,4.056336e-06,-1.360917,0.176829
ATGGAAAGGA,ATGGAAAGGA,4.991834e-06,5.290302e-06,-1.267848,0.208017
ACCAAAGCGT,ACCAAAGCGT,1.346446e-06,1.426148e-06,-1.23069,0.221543
TCCTCCAAAA,TCCTCCAAAA,4.703693e-06,5.176594e-06,-1.759354,0.081806
CCTTTTGGAG,CCTTTTGGAG,2.56356e-06,2.360178e-06,2.359101,0.020411
ATTCCTTTCC,ATTCCTTTCC,4.209896e-06,4.599938e-06,-1.795687,0.075792
TGGAAGGGAA,TGGAAGGGAA,3.093444e-06,3.32095e-06,-1.465137,0.146256
ACTCCATTCC,ACTCCATTCC,1.378963e-06,1.561626e-06,-2.08699,0.039625


In [14]:
make_t_table(kmers, rows_normalized=True, cols_standardized=True)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,0.043474,-0.044188,0.418637,0.676447
CCTCTTTCAG,CCTCTTTCAG,-0.161062,0.113734,-1.325328,0.188309
CTACAAAAAG,CTACAAAAAG,-0.162382,0.120155,-1.360917,0.176829
ATGGAAAGGA,ATGGAAAGGA,-0.154186,0.108972,-1.267848,0.208017
ACCAAAGCGT,ACCAAAGCGT,-0.139377,0.116707,-1.23069,0.221543
TCCTCCAAAA,TCCTCCAAAA,-0.209127,0.153322,-1.759354,0.081806
CCTTTTGGAG,CCTTTTGGAG,0.274172,-0.205876,2.359101,0.020411
ATTCCTTTCC,ATTCCTTTCC,-0.215005,0.154063,-1.795687,0.075792
TGGAAGGGAA,TGGAAGGGAA,-0.173742,0.130024,-1.465137,0.146256
ACTCCATTCC,ACTCCATTCC,-0.242965,0.184784,-2.08699,0.039625


# Qin

## Loading and prepping data

In [15]:
# Most important features
overall = pd.read_csv('other_top_100_feat_imps.txt', sep = ' ')

In [16]:
#just get Qin data
imp_feats_Qin = overall.iloc[:101,:]


imp_feats_Qin = imp_feats_Qin.iloc[:,0:1]
imp_feats_Qin = imp_feats_Qin.iloc[1:,:]
imp_feats_Qin['Kmer'], imp_feats_Qin['Score'] = imp_feats_Qin['SORTING'].str.split('\t', 1).str
imp_feats_Qin = imp_feats_Qin.drop(['SORTING'], axis = 1)

In [17]:
# Loaded kmer abundances
kmers = pd.read_pickle("Qin10mers.pickle")

# Disease labels
labels = pd.read_csv("Qin10mers.csv")

## T stats

In [18]:
make_t_table(kmers)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,16888.508108,17316.942857,-0.627378,0.530811
CCTCTTTCAG,CCTCTTTCAG,21467.913514,21953.868571,-0.542435,0.587856
CTACAAAAAG,CTACAAAAAG,12917.0,13069.48,-0.277918,0.781236
ATGGAAAGGA,ATGGAAAGGA,19932.610811,20489.005714,-0.683984,0.494428
ACCAAAGCGT,ACCAAAGCGT,6026.205405,6319.834286,-0.750897,0.453208
TCCTCCAAAA,TCCTCCAAAA,16195.448649,16408.097143,-0.307139,0.758916
CCTTTTGGAG,CCTTTTGGAG,8905.216216,9290.731429,-0.756903,0.449606
ATTCCTTTCC,ATTCCTTTCC,17525.135135,17789.005714,-0.366656,0.714092
TGGAAGGGAA,TGGAAGGGAA,10771.508108,10909.142857,-0.299743,0.764547
ACTCCATTCC,ACTCCATTCC,7722.275676,7814.028571,-0.257442,0.796986


In [19]:
make_t_table(kmers, rows_normalized=True)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,5.503177e-06,5.540207e-06,-0.366642,0.714103
CCTCTTTCAG,CCTCTTTCAG,6.919355e-06,7.005883e-06,-0.647394,0.517792
CTACAAAAAG,CTACAAAAAG,4.14585e-06,4.229464e-06,-0.979271,0.328107
ATGGAAAGGA,ATGGAAAGGA,6.441562e-06,6.553782e-06,-0.998555,0.318685
ACCAAAGCGT,ACCAAAGCGT,1.904e-06,2.027614e-06,-1.499242,0.134692
TCCTCCAAAA,TCCTCCAAAA,5.233415e-06,5.282004e-06,-0.445864,0.655965
CCTTTTGGAG,CCTTTTGGAG,2.894762e-06,2.970497e-06,-0.691705,0.489571
ATTCCTTTCC,ATTCCTTTCC,5.695917e-06,5.6966e-06,-0.006204,0.995054
TGGAAGGGAA,TGGAAGGGAA,3.47786e-06,3.479729e-06,-0.02921,0.976714
ACTCCATTCC,ACTCCATTCC,2.501584e-06,2.494003e-06,0.112589,0.91042


In [20]:
make_t_table(kmers, rows_normalized=True, cols_standardized=True)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,-0.015323,0.023407,-0.366642,0.714103
CCTCTTTCAG,CCTCTTTCAG,-0.032954,0.035558,-0.647394,0.517792
CTACAAAAAG,CTACAAAAAG,-0.051141,0.052405,-0.979271,0.328107
ATGGAAAGGA,ATGGAAAGGA,-0.050131,0.055433,-0.998555,0.318685
ACCAAAGCGT,ACCAAAGCGT,-0.076292,0.081957,-1.499242,0.134692
TCCTCCAAAA,TCCTCCAAAA,-0.024052,0.023138,-0.445864,0.655965
CCTTTTGGAG,CCTTTTGGAG,-0.034172,0.038999,-0.691705,0.489571
ATTCCTTTCC,ATTCCTTTCC,0.000414,0.001071,-0.006204,0.995054
TGGAAGGGAA,TGGAAGGGAA,0.001077,0.004166,-0.02921,0.976714
ACTCCATTCC,ACTCCATTCC,0.008073,-0.003838,0.112589,0.91042


# RA

## Loading and prepping data

In [21]:
# Most important features
#just get RA data
imp_feats_RA = overall.iloc[204:,:]
imp_feats_RA = imp_feats_RA.iloc[:,0:1]
imp_feats_RA = imp_feats_RA.iloc[1:,:]
imp_feats_RA['Kmer'], imp_feats_RA['Score'] = imp_feats_RA['SORTING'].str.split('\t', 1).str
imp_feats_RA = imp_feats_RA.drop(['SORTING'], axis = 1)

In [22]:
# Loaded kmer abundances
kmers = pd.read_pickle("RA10mers.pickle")

# Disease labels
labels = pd.read_csv("RA10mers.csv")

## T stats

In [23]:
make_t_table(kmers)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,28537.176471,30657.286822,-2.186089,0.029822
CCTCTTTCAG,CCTCTTTCAG,36136.176471,37290.775194,-0.984279,0.326017
CTACAAAAAG,CTACAAAAAG,24684.509804,25187.263566,-0.709827,0.478533
ATGGAAAGGA,ATGGAAAGGA,33749.45098,36123.379845,-2.234059,0.026445
ACCAAAGCGT,ACCAAAGCGT,9214.0,9444.085271,-0.622058,0.534523
TCCTCCAAAA,TCCTCCAAAA,30722.068627,31284.023256,-0.572035,0.567859
CCTTTTGGAG,CCTTTTGGAG,14417.539216,14868.069767,-0.542448,0.588037
ATTCCTTTCC,ATTCCTTTCC,29246.22549,31228.317829,-1.934924,0.054231
TGGAAGGGAA,TGGAAGGGAA,17275.980392,18303.217054,-1.954274,0.051886
ACTCCATTCC,ACTCCATTCC,13115.352941,13529.100775,-0.612801,0.540616


In [24]:
make_t_table(kmers, rows_normalized=True)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,5.072374e-06,5.343522e-06,-2.163595,0.03153
CCTCTTTCAG,CCTCTTTCAG,6.417865e-06,6.493163e-06,-0.532167,0.595126
CTACAAAAAG,CTACAAAAAG,4.392856e-06,4.396599e-06,-0.044399,0.964625
ATGGAAAGGA,ATGGAAAGGA,6.002097e-06,6.300338e-06,-2.231387,0.026624
ACCAAAGCGT,ACCAAAGCGT,1.643395e-06,1.646276e-06,-0.051235,0.959183
TCCTCCAAAA,TCCTCCAAAA,5.462307e-06,5.456342e-06,0.045523,0.96373
CCTTTTGGAG,CCTTTTGGAG,2.566681e-06,2.588208e-06,-0.159469,0.87344
ATTCCTTTCC,ATTCCTTTCC,5.180647e-06,5.44544e-06,-1.966358,0.050465
TGGAAGGGAA,TGGAAGGGAA,3.072031e-06,3.203067e-06,-1.879647,0.061426
ACTCCATTCC,ACTCCATTCC,2.338886e-06,2.351186e-06,-0.119158,0.905254


In [25]:
make_t_table(kmers, rows_normalized=True, cols_standardized=True)

Unnamed: 0,Kmers,Healthy,Disease,Tstat,Pvalue
AATGGAAAGG,AATGGAAAGG,-0.15473,0.130264,-2.163595,0.03153
CCTCTTTCAG,CCTCTTTCAG,-0.038475,0.032442,-0.532167,0.595126
CTACAAAAAG,CTACAAAAAG,-0.00411,0.00181,-0.044399,0.964625
ATGGAAAGGA,ATGGAAAGGA,-0.162019,0.132205,-2.231387,0.026624
ACCAAAGCGT,ACCAAAGCGT,-0.0014,0.005428,-0.051235,0.959183
TCCTCCAAAA,TCCTCCAAAA,0.003265,-0.002806,0.045523,0.96373
CCTTTTGGAG,CCTTTTGGAG,-0.010424,0.010837,-0.159469,0.87344
ATTCCTTTCC,ATTCCTTTCC,-0.14281,0.117082,-1.966358,0.050465
TGGAAGGGAA,TGGAAGGGAA,-0.134735,0.113584,-1.879647,0.061426
ACTCCATTCC,ACTCCATTCC,-0.005718,0.010155,-0.119158,0.905254
