# Import requirements

In [5]:
from __future__ import division
import pandas as pd
import numpy as np
from collections import Counter
import time
import statsmodels.api as sm
import random
import matplotlib.pyplot as plt
%matplotlib inline

# Columns

In [44]:
sample_col = ['Patient','Visit']
demographics_col = ['Age', 'Gender', 'ART']
chemistry_col = ['VL','iVL','pVL','CD4','iCD4','nCD4','CD8','iCD8',u'nCD8']
neuro_col =['TMHDS','GDS']
sequence_col = ['Prot','AAPos','Coverage']
aa_col = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']

In [2]:
def roll(sides, bias_list):
    assert len(bias_list) == sides
    number = random.uniform(0, sum(bias_list))
    current = 0
    for i, bias in enumerate(bias_list):
        current += bias
        if number <= current:
            return i + 1
        
def distributionCounts(coverage, aa_probs):
    dist = []
    for i in range(coverage):
        dist.append(map_dict[roll(20,aa_probs)])
    return Counter(dist)

In [150]:
aa_prob1 = [80,0,10,1,1,1,1,1,0,0,0,0,1,0,1,1,0,1,0,1]
aa_prob2 = [10,0,80,1,1,1,1,1,0,0,0,0,1,0,1,1,0,1,0,1]
print sum(aa_prob1), len(aa_prob1)
print sum(aa_prob2), len(aa_prob2)
aa_num = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
map_dict = dict(zip(aa_num,aa_col))

100 20
100 20


# Fill dictionary

In [173]:
data_dict = {'GDS':[],
             'A':[],'R':[],'N':[],'D':[],'C':[],
             'Q':[],'E':[],'G':[],'H':[],'I':[],
             'L':[],'K':[],'M':[],'F':[],'P':[],
             'S':[],'T':[],'W':[],'Y':[],'V':[]}

healthy_samples = 50
disease_samples = 50
others = 5
for i in range(healthy_samples):
    data_dict['GDS'].append(1)
    coverage = random.randint(100,1000)
    AACounts = distributionCounts(coverage, aa_prob1)
    c = sum(AACounts.values())
    for item in aa_col:
        #print (item, AACounts[item]/c),
        data_dict[item].append((AACounts[item]/c)*100)
    
for i in range(disease_samples):
    data_dict['GDS'].append(0)
    coverage = random.randint(100,1000)
    AACounts = distributionCounts(coverage, aa_prob2)
    c = sum(AACounts.values())
    for item in aa_col:
        #print (item, AACounts[item]/c),
        data_dict[item].append((AACounts[item]/c)*100)
        
for i in range(others):
    data_dict['GDS'].append(0)
    coverage = random.randint(100,1000)
    AACounts = distributionCounts(coverage, aa_prob1)
    c = sum(AACounts.values())
    for item in aa_col:
        #print (item, AACounts[item]/c),
        data_dict[item].append((AACounts[item]/c)*100)

In [174]:
sim_data = pd.DataFrame(data_dict)
print sim_data.shape
sim_data.head()

(105, 21)


Unnamed: 0,A,C,D,E,F,G,GDS,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,80.913978,1.075269,1.88172,2.419355,0,0.268817,1,0,0,0,...,0.537634,7.795699,0.268817,0.537634,0,1.075269,0,1.612903,1.612903,0
1,81.75,1.75,1.25,0.5,0,1.75,1,0,0,0,...,0.5,8.0,1.25,0.75,0,1.25,0,0.25,1.0,0
2,78.917379,0.854701,1.994302,1.139601,0,1.139601,1,0,0,0,...,1.139601,9.116809,0.854701,2.279202,0,1.709402,0,0.569801,0.2849,0
3,83.823529,0.735294,0.735294,0.735294,0,0.735294,1,0,0,0,...,0.735294,5.882353,1.470588,0.735294,0,1.470588,0,2.205882,0.735294,0
4,77.470356,1.185771,1.317523,1.185771,0,1.185771,1,0,0,0,...,0.658762,12.516469,0.658762,0.790514,0,0.527009,0,1.71278,0.790514,0


In [175]:
def slice_position(df, pos):
    m = df['AAPos'] == pos
    sliced_df = df[m]
    return sliced_df

def chooseDataCols(pos_df):
    cols = []
    cols.extend(['GDS'])
    cols.extend(aa_col)
    data = pos_df[cols].copy()
    data['intercept'] = 1.0
    return data

def filterX(X, threshold):
    t = X.max()>threshold
    #t = X.mean()>threshold
    filtered_cols = t[t==True].index.tolist()
    X = X[filtered_cols]
    return X

def getXandY(data, threshold):   
    train_cols = []
    train_cols.extend(aa_col)
    train_cols.extend(['intercept'])
    y = (data['GDS'] >= 0.5).astype(int)
    X = data[train_cols]
    X = filterX(X, threshold)
    return X, y

In [176]:
data = chooseDataCols(sim_data)
X, y = getXandY(data, 5)
# initialize and fit the model
logit = sm.Logit(y, X)
result = logit.fit()
print result.summary()

Optimization terminated successfully.
         Current function value: 0.159852
         Iterations 12
                           Logit Regression Results                           
Dep. Variable:                    GDS   No. Observations:                  105
Model:                          Logit   Df Residuals:                      103
Method:                           MLE   Df Model:                            1
Date:                Tue, 29 Mar 2016   Pseudo R-squ.:                  0.7690
Time:                        21:13:23   Log-Likelihood:                -16.784
converged:                       True   LL-Null:                       -72.661
                                        LLR p-value:                 4.045e-26
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
A              0.0425      0.016      2.690      0.007         0.012     0.073
N             -0.1123      0