# Import requirements

In [5]:
from __future__ import division
import pandas as pd
import numpy as np
from collections import Counter
import time
import statsmodels.api as sm
import random
import matplotlib.pyplot as plt
%matplotlib inline

# Columns

In [44]:
sample_col = ['Patient','Visit']
demographics_col = ['Age', 'Gender', 'ART']
chemistry_col = ['VL','iVL','pVL','CD4','iCD4','nCD4','CD8','iCD8',u'nCD8']
neuro_col =['TMHDS','GDS']
sequence_col = ['Prot','AAPos','Coverage']
aa_col = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']

In [2]:
def roll(sides, bias_list):
    assert len(bias_list) == sides
    number = random.uniform(0, sum(bias_list))
    current = 0
    for i, bias in enumerate(bias_list):
        current += bias
        if number <= current:
            return i + 1
        
def distributionCounts(coverage, aa_probs):
    dist = []
    for i in range(coverage):
        dist.append(map_dict[roll(20,aa_probs)])
    return Counter(dist)

In [177]:
aa_prob1 = [80,0,1,10,1,1,1,1,0,0,0,0,1,0,1,1,0,1,0,1]
aa_prob2 = [10,0,80,1,1,1,1,1,0,0,0,0,1,0,1,1,0,1,0,1]
print sum(aa_prob1), len(aa_prob1)
print sum(aa_prob2), len(aa_prob2)
aa_num = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
map_dict = dict(zip(aa_num,aa_col))

100 20
100 20


# Fill dictionary

In [182]:
data_dict = {'GDS':[],
             'A':[],'R':[],'N':[],'D':[],'C':[],
             'Q':[],'E':[],'G':[],'H':[],'I':[],
             'L':[],'K':[],'M':[],'F':[],'P':[],
             'S':[],'T':[],'W':[],'Y':[],'V':[]}

healthy_samples = 1000
disease_samples = 1500
others = 5
for i in range(healthy_samples):
    data_dict['GDS'].append(1)
    coverage = random.randint(100,1000)
    AACounts = distributionCounts(coverage, aa_prob1)
    c = sum(AACounts.values())
    for item in aa_col:
        #print (item, AACounts[item]/c),
        data_dict[item].append((AACounts[item]/c)*100)
    
for i in range(disease_samples):
    data_dict['GDS'].append(0)
    coverage = random.randint(100,1000)
    AACounts = distributionCounts(coverage, aa_prob2)
    c = sum(AACounts.values())
    for item in aa_col:
        #print (item, AACounts[item]/c),
        data_dict[item].append((AACounts[item]/c)*100)
        
for i in range(others):
    data_dict['GDS'].append(0)
    coverage = random.randint(100,1000)
    AACounts = distributionCounts(coverage, aa_prob1)
    c = sum(AACounts.values())
    for item in aa_col:
        #print (item, AACounts[item]/c),
        data_dict[item].append((AACounts[item]/c)*100)

In [183]:
sim_data = pd.DataFrame(data_dict)
print sim_data.shape
sim_data.head()

(2505, 21)


Unnamed: 0,A,C,D,E,F,G,GDS,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,79.907621,0.692841,8.775982,0.692841,0,1.385681,1,0,0,0,...,1.154734,0.692841,1.616628,1.616628,0,0.923788,0,1.385681,1.154734,0
1,80.569514,0.670017,8.877722,1.005025,0,1.172529,1,0,0,0,...,1.005025,1.172529,0.502513,1.005025,0,1.005025,0,1.842546,1.172529,0
2,80.813953,0.872093,9.883721,0.581395,0,0.581395,1,0,0,0,...,1.162791,0.872093,0.581395,1.453488,0,1.162791,0,0.581395,1.453488,0
3,75.121951,0.487805,11.219512,2.439024,0,2.439024,1,0,0,0,...,1.463415,0.487805,0.97561,0.97561,0,0.97561,0,1.463415,1.95122,0
4,77.302632,0.986842,9.210526,1.809211,0,0.986842,1,0,0,0,...,1.480263,2.138158,1.315789,1.151316,0,2.138158,0,1.151316,0.328947,0


In [184]:
def slice_position(df, pos):
    m = df['AAPos'] == pos
    sliced_df = df[m]
    return sliced_df

def chooseDataCols(pos_df):
    cols = []
    cols.extend(['GDS'])
    cols.extend(aa_col)
    data = pos_df[cols].copy()
    data['intercept'] = 1.0
    return data

def filterX(X, threshold):
    t = X.max()>threshold
    #t = X.mean()>threshold
    filtered_cols = t[t==True].index.tolist()
    X = X[filtered_cols]
    return X

def getXandY(data, threshold):   
    train_cols = []
    train_cols.extend(aa_col)
    train_cols.extend(['intercept'])
    y = (data['GDS'] >= 0.5).astype(int)
    X = data[train_cols]
    X = filterX(X, threshold)
    return X, y

In [185]:
data = chooseDataCols(sim_data)
X, y = getXandY(data, 5)
# initialize and fit the model
logit = sm.Logit(y, X)
result = logit.fit()
print result.summary()

Optimization terminated successfully.
         Current function value: 0.012114
         Iterations 17
                           Logit Regression Results                           
Dep. Variable:                    GDS   No. Observations:                 2505
Model:                          Logit   Df Residuals:                     2500
Method:                           MLE   Df Model:                            4
Date:                Tue, 29 Mar 2016   Pseudo R-squ.:                  0.9820
Time:                        21:49:08   Log-Likelihood:                -30.346
converged:                       True   LL-Null:                       -1685.1
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
A              0.1000      0.034      2.941      0.003         0.033     0.167
N             -0.4256      0