# Import Requirements

In [99]:
from __future__ import division
import pandas as pd
import numpy as np
from collections import Counter
import time
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

# Import data

In [23]:
sample_col = ['Patient','Visit']
demographics_col = ['Age', 'Gender', 'ART']
chemistry_col = ['VL','iVL','pVL','CD4','iCD4','nCD4','CD8','iCD8',u'nCD8']
neuro_col =['TMHDS','GDS']
sequence_col = ['Prot','AAPos','Coverage']
aa_col = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']

DF = pd.read_csv('/Users/greg/Desktop/FullNeuroIllumina/merged_data.csv')
DF.head()

Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,L,K,M,F,P,S,T,W,Y,V
0,A0001,R09,2014-11-10,59,Male,on,20,987,987,797,...,0.006934,0.001695,0.565052,2.6e-05,0,0.000848,0.000128,0.416637,0,0.000205
1,A0010,R08,2014-11-12,59,Male,on,20,50,470,1167,...,0.005848,0.005848,0.584795,0.0,0,0.0,0.0,0.403509,0,0.0
2,A0013,R09,2014-11-10,68,Male,on,20,144,39373,771,...,0.001661,0.004983,0.282392,0.0,0,0.001661,0.0,0.209302,0,0.0
3,A0019,R12,2015-02-02,46,Female,on,20,99,164020,1130,...,0.008081,0.003367,0.526599,0.0,0,0.001347,0.000673,0.452525,0,0.0
4,A0026,R09,2015-04-30,55,Male,on,67859,100000,100000,137,...,0.008982,0.0,0.58982,0.0,0,0.002994,0.0,0.389222,0,0.0


# Define functions

In [157]:
def slice_position(df, pos):
    m = df['AAPos'] == pos
    sliced_df = df[m]
    return sliced_df

def chooseDataCols(pos_df):
    cols = []
    cols.extend(sample_col)
    cols.extend(neuro_col)
    cols.extend(aa_col)
    data = pos_df[cols].copy()
    data['intercept'] = 1.0
    return data

def filterX(X, threshold):
    t = X.max()>threshold
    #t = X.mean()>threshold
    filtered_cols = t[t==True].index.tolist()
    X = X[filtered_cols]
    return X

def getXandY(data, threshold):   
    train_cols = []
    train_cols.extend(aa_col)
    train_cols.extend(['intercept'])
    y = (data['GDS'] >= 0.5).astype(int)
    X = data[train_cols]
    X = filterX(X, threshold)
    return X, y

# One position example

In [225]:
pos_df = slice_position(DF,61)
data = chooseDataCols(pos_df)
X, y = getXandY(data, 0.05)
X = X*100
X['intercept'] = 1
# initialize and fit the model
logit = sm.Logit(y, X)
result = logit.fit()
print result.summary()
#print result.params
#print result.conf_int()
#for i,row in result.conf_int().iterrows():
#    print i,row[0],row[1]

Optimization terminated successfully.
         Current function value: 0.508836
         Iterations 12
                           Logit Regression Results                           
Dep. Variable:                    GDS   No. Observations:                   53
Model:                          Logit   Df Residuals:                       45
Method:                           MLE   Df Model:                            7
Date:                Tue, 29 Mar 2016   Pseudo R-squ.:                  0.2657
Time:                        21:24:01   Log-Likelihood:                -26.968
converged:                       True   LL-Null:                       -36.727
                                        LLR p-value:                  0.006710
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
N              3.8777      1.921      2.018      0.044         0.112     7.643
D              3.8833      1

# All positions

In [197]:
tat1_length=72
for i in range(1,tat1_length+1):
    #slice the data
    pos_df = slice_position(DF, i)
    
    # prep the data
    data = chooseDataCols(pos_df)
    X, y = getXandY(data,0.05)
    print (i, X.shape, y.shape)

    # initialize and fit the model
    try:
        logit = sm.Logit(y, X)
        result = logit.fit()
        print result.conf_int()
    except:
        continue
    print ''

(1, (54, 9), (54,))
Optimization terminated successfully.
         Current function value: 0.602671
         Iterations 16
                     0            1
N         -2533.925134  4870.476837
Q          -105.994948  1192.060960
E          -155.741008   547.950677
I          -149.924192   523.250377
L          -713.813490   542.843303
M          -141.388858   531.713963
S          -149.159939   524.285749
W          -150.132849   523.708645
intercept  -523.069479   146.639008

(2, (54, 11), (54,))
         Current function value: inf
         Iterations: 35
(3, (55, 5), (55,))
Optimization terminated successfully.
         Current function value: 0.672441
         Iterations 7
                    0           1
Q         -214.729646  201.259176
K         -242.417032  295.996764
P         -212.734075  198.587698
V         -217.173826  210.899370
intercept -198.286303  211.818044

(4, (55, 6), (55,))
Optimization terminated successfully.
         Current function value: 0.665133
       




         Current function value: 0.665017
         Iterations: 35
                         0                1
N            -32700.733517     32733.235411
C              -107.515694        70.568622
K         -18880951.839893  18881732.574282
T              -113.252847        83.014642
intercept       -69.898128       106.333397

(23, (55, 5), (55,))
Optimization terminated successfully.
         Current function value: 0.620556
         Iterations 10
                    0            1
N          -43.766116   421.673905
K         -174.533916  1424.462548
P         -225.378600   411.630667
T          -42.242110   422.378806
intercept -419.912161    42.435857

(24, (55, 10), (55,))
Optimization terminated successfully.
         Current function value: 0.587378
         Iterations 11
                    0           1
A         -270.846176  134.829645
N         -272.041553  135.984815
C         -261.079092  143.823734
Q         -274.003689  153.044674
H         -366.967809  147.798509
K   

