# Import Requirements

In [3]:
from __future__ import division
import glob
import pandas as pd
import numpy as np
import itertools
from collections import Counter
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Import data

In [7]:
# Load the amino-acid frequency table.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists. Consider pointing DATA_CSV at a
# location relative to the notebook or reading it from configuration.
DATA_CSV = '/Users/greg/Desktop/FullNeuroIllumina/complete_data.csv'
DF = pd.read_csv(DATA_CSV)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(DF.shape)
DF.head()

(3900, 26)


Unnamed: 0,Patient,Visit,GDS,Prot,AAPos,Coverage,A,R,N,D,...,L,K,M,F,P,S,T,W,Y,V
0,A0001,R09,0.583333,Tat1,1,38938,5.1e-05,0.002157,5.1e-05,7.7e-05,...,0.006934,0.001695,0.565052,2.6e-05,0,0.000848,0.000128,0.416637,0,0.000205
1,A0010,R08,1.416667,Tat1,1,171,0.0,0.0,0.0,0.0,...,0.005848,0.005848,0.584795,0.0,0,0.0,0.0,0.403509,0,0.0
2,A0013,R09,0.0,Tat1,1,602,0.0,0.001661,0.0,0.0,...,0.001661,0.004983,0.282392,0.0,0,0.001661,0.0,0.209302,0,0.0
3,A0019,R12,0.25,Tat1,1,1485,0.0,0.001347,0.000673,0.0,...,0.008081,0.003367,0.526599,0.0,0,0.001347,0.000673,0.452525,0,0.0
4,A0026,R09,0.583333,Tat1,1,334,0.0,0.002994,0.0,0.0,...,0.008982,0.0,0.58982,0.0,0,0.002994,0.0,0.389222,0,0.0


# Functions

In [5]:
def slice_position(df, pos):
    '''Return the rows of df whose 'AAPos' column equals pos.

    The original index labels and all columns are kept; a position that
    does not occur in df yields an empty frame.
    '''
    return df[df['AAPos'] == pos]

def reformat_section(df):
    '''Prefix the amino-acid columns of a single-position frame with its position.

    df must contain rows for exactly one 'AAPos' value. Each amino-acid
    column that is present (e.g. 'A') is renamed to '<pos><letter>'
    (e.g. '12A'), and the 'AAPos' and 'Coverage' columns are dropped.
    The input frame is not modified; a new frame is returned.

    Raises ValueError if df is empty or holds more than one distinct
    position, because the first row's position would otherwise be
    silently applied to every row.
    '''
    positions = df['AAPos'].unique()
    if len(positions) != 1:
        raise ValueError(
            'reformat_section expects exactly one AAPos value, got %d'
            % len(positions))
    # int() strips a trailing '.0' when the position was parsed as a float.
    pos = str(int(positions[0]))
    AAs = ['A','R','N','D','C','Q','E','G','H','I',
           'L','K','M','F','P','S','T','W','Y','V']
    newnames_dict = {aa: pos + aa for aa in AAs}
    return df.rename(columns=newnames_dict).drop(['AAPos', 'Coverage'], axis=1)

# Reformat

In [9]:
# Highest amino-acid position to pivot; positions 1..tat1_length are used.
tat1_length = 72

# Join keys shared by every per-position frame. The key column names are
# identical on both sides, so a single `on=` list replaces the duplicated
# left_on/right_on lists.
merge_keys = ['Patient', 'Visit', 'GDS', 'Prot']

# Start from position 1, then outer-merge each remaining position so that
# rows missing a position are kept (their columns are filled with NaN).
pos1_df = slice_position(DF, 1)
DF2 = reformat_section(pos1_df)
for i in range(2, tat1_length + 1):
    pos_df = reformat_section(slice_position(DF, i))
    DF2 = pd.merge(DF2, pos_df, on=merge_keys, how='outer')

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(DF2.shape)
DF2.head()

(55, 1444)


Unnamed: 0,Patient,Visit,GDS,Prot,1A,1R,1N,1D,1C,1Q,...,72L,72K,72M,72F,72P,72S,72T,72W,72Y,72V
0,A0001,R09,0.583333,Tat1,5.1e-05,0.002157,5.1e-05,7.7e-05,0.00208,5.1e-05,...,,,,,,,,,,
1,A0010,R08,1.416667,Tat1,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,A0013,R09,0.0,Tat1,0.0,0.001661,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,A0019,R12,0.25,Tat1,0.0,0.001347,0.000673,0.0,0.000673,0.0,...,,,,,,,,,,
4,A0026,R09,0.583333,Tat1,0.0,0.002994,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [14]:
# Impute missing values with the per-column mean. numeric_only=True keeps
# the string columns (Patient, Visit, Prot) out of the mean; without it,
# pandas >= 2.0 raises TypeError, while older versions dropped them silently.
# NOTE(review): GDS is numeric, so any missing GDS is mean-imputed here
# before the threshold below is applied -- confirm that is intended.
DF2.fillna(DF2.mean(numeric_only=True), inplace=True)

# Binary target: 1 where GDS is strictly above 0.5, else 0.
DF2['GDS_Threshold'] = (DF2['GDS'] > 0.5).astype(int)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(DF2.shape)
DF2.head()

(55, 1445)


Unnamed: 0,Patient,Visit,GDS,Prot,1A,1R,1N,1D,1C,1Q,...,72K,72M,72F,72P,72S,72T,72W,72Y,72V,GDS_Threshold
0,A0001,R09,0.583333,Tat1,5.1e-05,0.002157,5.1e-05,7.7e-05,0.00208,5.1e-05,...,0.049947,0.000117,0.000115,0.115569,0.00037,0.000166,0,5.7e-05,6.2e-05,1
1,A0010,R08,1.416667,Tat1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.049947,0.000117,0.000115,0.115569,0.00037,0.000166,0,5.7e-05,6.2e-05,1
2,A0013,R09,0.0,Tat1,0.0,0.001661,0.0,0.0,0.0,0.0,...,0.049947,0.000117,0.000115,0.115569,0.00037,0.000166,0,5.7e-05,6.2e-05,0
3,A0019,R12,0.25,Tat1,0.0,0.001347,0.000673,0.0,0.000673,0.0,...,0.049947,0.000117,0.000115,0.115569,0.00037,0.000166,0,5.7e-05,6.2e-05,0
4,A0026,R09,0.583333,Tat1,0.0,0.002994,0.0,0.0,0.0,0.0,...,0.049947,0.000117,0.000115,0.115569,0.00037,0.000166,0,5.7e-05,6.2e-05,1


# Magnum Opus

In [15]:
# Target vector: the binary GDS_Threshold labels as a flat 1-D array.
y = np.ravel(DF2['GDS_Threshold'])

# Feature matrix: every column except the identifiers and the target.
X = DF2.drop(['Patient','Visit','GDS','Prot','GDS_Threshold'], axis=1)
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# both return the same NumPy array.
X = X.values

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(X.shape)
print(y.shape)

(55, 1440)
(55,)


In [None]:
# TODO: planned analysis (not yet implemented). This will be awesome.

# It will use 5+ different classification algorithms
# It will use various test/train splits
# It will rank feature importance
# It will compare with clinical data