# Import requirements

In [1]:
from __future__ import division
import glob
import pandas as pd
import numpy as np
import itertools
from collections import Counter
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Import dataframe

In [2]:
# Read the complete data table from disk and preview its first rows.
complete_data_path = '/Users/greg/Desktop/FullNeuroIllumina/complete_data.csv'
DF = pd.read_csv(complete_data_path)
DF.head()

Unnamed: 0,Patient,Visit,GDS,Prot,AAPos,Coverage,A,R,N,D,...,L,K,M,F,P,S,T,W,Y,V
0,A0001,R09,0.583333,Tat1,1,38938,5.1e-05,0.002157,5.1e-05,7.7e-05,...,0.006934,0.001695,0.565052,2.6e-05,0,0.000848,0.000128,0.416637,0,0.000205
1,A0010,R08,1.416667,Tat1,1,171,0.0,0.0,0.0,0.0,...,0.005848,0.005848,0.584795,0.0,0,0.0,0.0,0.403509,0,0.0
2,A0013,R09,0.0,Tat1,1,602,0.0,0.001661,0.0,0.0,...,0.001661,0.004983,0.282392,0.0,0,0.001661,0.0,0.209302,0,0.0
3,A0019,R12,0.25,Tat1,1,1485,0.0,0.001347,0.000673,0.0,...,0.008081,0.003367,0.526599,0.0,0,0.001347,0.000673,0.452525,0,0.0
4,A0026,R09,0.583333,Tat1,1,334,0.0,0.002994,0.0,0.0,...,0.008982,0.0,0.58982,0.0,0,0.002994,0.0,0.389222,0,0.0


# Define functions

In [3]:
def slice_position(df, pos):
    """Return the rows of `df` whose 'AAPos' value equals `pos`.

    The original index labels of the selected rows are kept.
    """
    return df.loc[df['AAPos'] == pos]

def GDS_split(df, parameter, threshold):
    """Split `df` in two on the column named `parameter`.

    Returns a tuple ``(below, at_or_above)``: the rows whose value is
    strictly less than `threshold`, and the rows whose value is greater
    than or equal to it. Rows for which neither comparison is true
    (e.g. NaN values) end up in neither frame.
    """
    values = df[parameter]
    below = df[values < threshold]
    at_or_above = df[values >= threshold]
    return below, at_or_above

def identify_diffs(df, parameter, threshold):
    """Return the rows of `df` whose `parameter` column is strictly greater than `threshold`."""
    return df.loc[df[parameter] > threshold]

# Data Exploration

In [7]:
tat1_length = 72
AAs = ['A','R','N','D','C','Q','E','G','H','I',
       'L','K','M','F','P','S','T','W','Y','V']

candidates_dict = {'Position':[], 'Variant':[], 'Non-impaired':[],
                   'Impaired':[], 'Difference':[], 'Log2FC':[]}

for i in range(1,tat1_length+1):
    pos_df = slice_position(DF, i)
    print i, pos_df.shape,
    df1, df2 = GDS_split(pos_df, 'GDS', 0.5)
    df1 = df1[AAs].copy()
    df2 = df2[AAs].copy()

    mean1 = df1.mean()
    mean2 = df2.mean()
    objs = [mean1, mean2]
    mean_df = pd.concat(objs, axis=1, join='outer')
    mean_df.rename(columns={0: 'Non-impaired', 1: 'Impaired'},inplace=True)
    mean_df['Difference'] = abs((mean_df['Non-impaired'] - mean_df['Impaired']))
    mean_df['Log2FC'] = np.log2(mean_df['Impaired'] / mean_df['Non-impaired'])
    
    filtered_df = identify_diffs(mean_df, 'Difference', 0.1)
    if filtered_df.shape[0] > 0:
        for idx, row in filtered_df.iterrows():
            candidates_dict['Position'].append(i)
            candidates_dict['Variant'].append(idx)
            candidates_dict['Non-impaired'].append(row['Non-impaired'])
            candidates_dict['Impaired'].append(row['Impaired'])
            candidates_dict['Difference'].append(row['Difference'])
            candidates_dict['Log2FC'].append(row['Log2FC'])

order = ['Position','Variant','Non-impaired','Impaired','Difference','Log2FC']
candidates_df = pd.DataFrame(candidates_dict)[order]
candidates_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/Tat1_candidates.csv')

candidates_df.head()

1 (54, 26) 2 (54, 26) 3 (55, 26) 4 (55, 26) 5 (55, 26) 6 (55, 26) 7 (55, 26) 8 (55, 26) 9 (55, 26) 10 (55, 26) 11 (54, 26) 12 (54, 26) 13 (54, 26) 14 (54, 26) 15 (54, 26) 16 (54, 26) 17 (54, 26) 18 (55, 26) 19 (55, 26) 20 (55, 26) 21 (55, 26) 22 (55, 26) 23 (55, 26) 24 (55, 26) 25 (55, 26) 26 (55, 26) 27 (55, 26) 28 (55, 26) 29 (55, 26) 30 (55, 26) 31 (55, 26) 32 (55, 26) 33 (55, 26) 34 (55, 26) 35 (55, 26) 36 (55, 26) 37 (55, 26) 38 (55, 26) 39 (55, 26) 40 (55, 26) 41 (55, 26) 42 (55, 26) 43 (55, 26) 44 (55, 26) 45 (55, 26) 46 (55, 26) 47 (55, 26) 48 (55, 26) 49 (55, 26) 50 (55, 26) 51 (55, 26) 52 (55, 26) 53 (55, 26) 54 (55, 26) 55 (55, 26) 56 (55, 26) 57 (55, 26) 58 (55, 26) 59 (53, 26) 60 (53, 26) 61 (53, 26) 62 (53, 26) 63 (53, 26) 64 (53, 26) 65 (54, 26) 66 (54, 26) 67 (54, 26) 68 (54, 26) 69 (55, 26) 70 (55, 26) 71 (55, 26) 72 (20, 26)

Unnamed: 0,Position,Variant,Non-impaired,Impaired,Difference,Log2FC
0,7,R,0.549994,0.7127,0.162706,0.373879
1,7,S,0.262935,0.068443,0.194492,-1.941727
2,24,N,0.291948,0.443861,0.151914,0.604398
3,24,P,0.243821,0.115902,0.127919,-1.072913
4,24,S,0.126076,0.004252,0.121823,-4.889861





# Reformat

In [56]:
def reformat_section(df):
    '''Prefix the amino-acid columns of a single-position frame with that position.

    `df` must be a dataframe containing only 1 position (all 'AAPos' values
    equal). Each of the 20 amino-acid columns is renamed from e.g. 'A' to
    '<pos>A' (position 7 gives '7A'), and the 'AAPos' and 'Coverage' columns
    are dropped. A new dataframe is returned; `df` itself is not modified.

    Raises:
        IndexError: if `df` has no rows.
        ValueError: if `df` contains more than one distinct 'AAPos' value,
            since every column would otherwise be labelled with the first
            row's position only.
    '''
    first_pos = df['AAPos'].iloc[0]
    # Enforce the single-position precondition instead of silently mislabelling.
    if (df['AAPos'] != first_pos).any():
        raise ValueError('reformat_section expects a dataframe containing '
                         'exactly one AAPos value')
    pos = str(int(first_pos))
    AAs = ['A','R','N','D','C','Q','E','G','H','I',
           'L','K','M','F','P','S','T','W','Y','V']
    newnames_dict = {aa: pos + aa for aa in AAs}
    return df.rename(columns=newnames_dict).drop(['AAPos','Coverage'], axis=1)

In [55]:
# Slice out positions 7 and 72 and report the shape of each slice.
# NOTE(review): an unfinished `for i in range()` stub was removed from the top
# of this cell; it was a syntax error that stopped the cell from running.
# TODO: loop over all positions if that was the intent.
df7 = slice_position(DF,7)
df72 = slice_position(DF,72)
print(df7.shape)
print(df72.shape)

(55, 26)
(20, 26)


In [57]:
# Build the reformatted frames straight from DF instead of reassigning
# df7/df72 onto themselves: re-running the self-referential form fails because
# reformat_section has already dropped the 'AAPos' column it needs to read.
df7 = reformat_section(slice_position(DF, 7))
df72 = reformat_section(slice_position(DF, 72))

In [59]:
# Preview the first five rows of the reformatted position-7 frame.
df7.head(n=5)

Unnamed: 0,Patient,Visit,GDS,Prot,7A,7R,7N,7D,7C,7Q,...,7L,7K,7M,7F,7P,7S,7T,7W,7Y,7V
328,A0001,R09,0.583333,Tat1,0.003026,0.45186,3.4e-05,0.002304,0.000825,0,...,0.000103,0.0,0,0,0,0.53803,0.00141,0,0,0
329,A0010,R08,1.416667,Tat1,0.0,0.983607,0.0,0.016393,0.0,0,...,0.0,0.0,0,0,0,0.0,0.0,0,0,0
330,A0013,R09,0.0,Tat1,0.0,0.0,0.012069,0.0,0.0,0,...,0.0,0.982759,0,0,0,0.001724,0.0,0,0,0
331,A0019,R12,0.25,Tat1,0.003941,0.007882,0.0,0.0,0.0,0,...,0.0,0.0,0,0,0,0.986207,0.0,0,0,0
332,A0026,R09,0.583333,Tat1,0.0,0.991304,0.0,0.004348,0.0,0,...,0.0,0.0,0,0,0,0.004348,0.0,0,0,0


In [60]:
# Outer-join the two reformatted frames on their shared identifying columns,
# so rows present in only one of the frames are kept (the other side's
# columns are NaN for those rows).
merge_keys = ['Patient','Visit','GDS','Prot']
mdf = pd.merge(df7, df72, on=merge_keys, how='outer')

print(mdf.columns)

mdf.head(10)

Index([u'Patient', u'Visit', u'GDS', u'Prot', u'7A', u'7R', u'7N', u'7D',
       u'7C', u'7Q', u'7E', u'7G', u'7H', u'7I', u'7L', u'7K', u'7M', u'7F',
       u'7P', u'7S', u'7T', u'7W', u'7Y', u'7V', u'72A', u'72R', u'72N',
       u'72D', u'72C', u'72Q', u'72E', u'72G', u'72H', u'72I', u'72L', u'72K',
       u'72M', u'72F', u'72P', u'72S', u'72T', u'72W', u'72Y', u'72V'],
      dtype='object')


Unnamed: 0,Patient,Visit,GDS,Prot,7A,7R,7N,7D,7C,7Q,...,72L,72K,72M,72F,72P,72S,72T,72W,72Y,72V
0,A0001,R09,0.583333,Tat1,0.003026,0.45186,3.4e-05,0.002304,0.000825,0,...,,,,,,,,,,
1,A0010,R08,1.416667,Tat1,0.0,0.983607,0.0,0.016393,0.0,0,...,,,,,,,,,,
2,A0013,R09,0.0,Tat1,0.0,0.0,0.012069,0.0,0.0,0,...,,,,,,,,,,
3,A0019,R12,0.25,Tat1,0.003941,0.007882,0.0,0.0,0.0,0,...,,,,,,,,,,
4,A0026,R09,0.583333,Tat1,0.0,0.991304,0.0,0.004348,0.0,0,...,,,,,,,,,,
5,A0034,R04,0.666667,Tat1,0.00018,0.986508,0.0,0.0,0.0,0,...,0.995909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,A0044,R10,1.777778,Tat1,0.0,0.002257,0.002257,0.0,0.0,0,...,,,,,,,,,,
7,A0045,R03,0.166667,Tat1,0.0,0.988157,0.0,0.002221,0.0,0,...,,,,,,,,,,
8,A0059,R08,0.25,Tat1,0.006672,0.003588,0.0,0.000112,0.001626,0,...,,,,,,,,,,
9,A0082,R06,0.583333,Tat1,0.000109,0.963533,0.000218,0.006878,0.000437,0,...,0.158788,0.001616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Linear Discriminant Analysis