# Import requirements

In [77]:
from __future__ import division
import glob
import pandas as pd
import numpy as np
from itertools import combinations
from collections import Counter
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Import dataframe

In [3]:
# TODO: this input currently does not cover all 101 positions; switch to a
# file that does once it is available.
complete_data_path = '/Users/greg/Desktop/FullNeuroIllumina/complete_data.csv'
DF = pd.read_csv(complete_data_path)
# Show the last rows as a quick sanity check of what was loaded.
DF.tail()

Unnamed: 0,Patient,Visit,GDS,Prot,AAPos,Coverage,A,R,N,D,...,L,K,M,F,P,S,T,W,Y,V
3895,A0389,R07,0.0,Tat1,72,624,0.001603,0.00641,0.0,0,...,0.990385,0.0,0,0,0.0,0.0,0,0,0,0
3896,A0415,R04,1.333333,Tat1,72,124,0.0,0.008065,0.008065,0,...,0.024194,0.008065,0,0,0.016129,0.0,0,0,0,0
3897,A0421,R04,0.0,Tat1,72,608,0.0,0.034539,0.0,0,...,0.629934,0.0,0,0,0.006579,0.001645,0,0,0,0
3898,A0465,R03,0.25,Tat1,72,231,0.0,0.0,0.0,0,...,0.995671,0.0,0,0,0.0,0.0,0,0,0,0
3899,A0500,R03,0.5,Tat1,72,676,0.0,0.001479,0.0,0,...,0.97929,0.0,0,0,0.002959,0.0,0,0,0,0


# Define functions

In [4]:
# these functions are purely for data exploration

def slice_position(df, pos):
    """Return the rows of ``df`` whose 'AAPos' value equals ``pos``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'AAPos' column.
    pos : scalar
        Position value to select.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels and columns.
    """
    return df.loc[df['AAPos'] == pos]

def GDS_split(df, parameter, threshold):
    """Split ``df`` into two frames around ``threshold`` on column ``parameter``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    parameter : str
        Name of the column whose values are compared with ``threshold``.
    threshold : scalar
        Cut-off value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First the rows where ``df[parameter] < threshold``, then the rows
        where ``df[parameter] >= threshold``. A row for which neither
        comparison is true (such as a NaN value) ends up in neither frame.
    """
    values = df[parameter]
    # Two independent comparisons (not one mask and its negation) so that
    # rows matching neither condition are left out of both results.
    below = df.loc[values < threshold]
    at_or_above = df.loc[values >= threshold]
    return below, at_or_above

def identify_diffs(df, parameter, threshold):
    """Return the rows of ``df`` where column ``parameter`` exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    parameter : str
        Name of the column to compare.
    threshold : scalar
        Rows are kept only when the column value is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        The rows that pass the comparison, with their original index labels.
    """
    exceeds = df[parameter] > threshold
    return df.loc[exceeds]

# Data Exploration

In [5]:
# For every Tat1 position, compare the mean frequency of each amino acid
# between the two GDS groups and collect the variants whose means differ by
# more than DIFF_THRESHOLD. The hits are written to CSV and displayed.
tat1_length = 72
AAs = ['A','R','N','D','C','Q','E','G','H','I',
       'L','K','M','F','P','S','T','W','Y','V']

# Cut-offs used below, named so they can be tuned in one place.
GDS_THRESHOLD = 0.5    # GDS < threshold -> 'Non-impaired', otherwise 'Impaired'
DIFF_THRESHOLD = 0.1   # minimum absolute difference in mean frequency to report

candidates_dict = {'Position':[], 'Variant':[], 'Non-impaired':[],
                   'Impaired':[], 'Difference':[], 'Log2FC':[]}

# "<position> <shape>" for each position; printed once after the loop.
shape_log = []

for i in range(1, tat1_length+1):
    pos_df = slice_position(DF, i)
    shape_log.append('{0} {1}'.format(i, pos_df.shape))

    # df1: rows with GDS below the threshold, df2: rows at or above it.
    # Only the amino-acid frequency columns are kept.
    df1, df2 = GDS_split(pos_df, 'GDS', GDS_THRESHOLD)
    df1 = df1[AAs].copy()
    df2 = df2[AAs].copy()

    # Per-amino-acid mean frequency in each group, side by side.
    mean1 = df1.mean()
    mean2 = df2.mean()
    objs = [mean1, mean2]
    mean_df = pd.concat(objs, axis=1, join='outer',
                        keys=['Non-impaired', 'Impaired'])
    mean_df['Difference'] = abs((mean_df['Non-impaired'] - mean_df['Impaired']))
    # NOTE: a mean of exactly 0 in either group makes this +/-inf (or NaN
    # when both are 0); such values are stored as-is.
    mean_df['Log2FC'] = np.log2(mean_df['Impaired'] / mean_df['Non-impaired'])

    filtered_df = identify_diffs(mean_df, 'Difference', DIFF_THRESHOLD)
    # Iterating an empty frame is a no-op, so no emptiness check is needed.
    for idx, row in filtered_df.iterrows():
        candidates_dict['Position'].append(i)
        candidates_dict['Variant'].append(idx)   # index label = amino acid
        for column in ('Non-impaired', 'Impaired', 'Difference', 'Log2FC'):
            candidates_dict[column].append(row[column])

# One parenthesised argument so this line means the same thing whether
# `print` is the Python 2 statement or the Python 3 function.
print(' '.join(shape_log))

order = ['Position','Variant','Non-impaired','Impaired','Difference','Log2FC']
candidates_df = pd.DataFrame(candidates_dict)[order]
# NOTE(review): absolute, machine-specific output path -- consider moving it
# to a shared configuration constant.
candidates_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/Tat1_candidates.csv')

candidates_df

1 (54, 26) 2 (54, 26) 3 (55, 26) 4 (55, 26) 5 (55, 26) 6 (55, 26) 7 (55, 26) 8 (55, 26) 9 (55, 26) 10 (55, 26) 11 (54, 26) 12 (54, 26) 13 (54, 26) 14 (54, 26) 15 (54, 26) 16 (54, 26) 17 (54, 26) 18 (55, 26) 19 (55, 26) 20 (55, 26) 21 (55, 26) 22 (55, 26) 23 (55, 26) 24 (55, 26) 25 (55, 26) 26 (55, 26) 27 (55, 26) 28 (55, 26) 29 (55, 26) 30 (55, 26) 31 (55, 26) 32 (55, 26) 33 (55, 26) 34 (55, 26) 35 (55, 26) 36 (55, 26) 37 (55, 26) 38 (55, 26) 39 (55, 26) 40 (55, 26) 41 (55, 26) 42 (55, 26) 43 (55, 26) 44 (55, 26) 45 (55, 26) 46 (55, 26) 47 (55, 26) 48 (55, 26) 49 (55, 26) 50 (55, 26) 51 (55, 26) 52 (55, 26) 53 (55, 26) 54 (55, 26) 55 (55, 26) 56 (55, 26) 57 (55, 26) 58 (55, 26) 59 (53, 26) 60 (53, 26) 61 (53, 26) 62 (53, 26) 63 (53, 26) 64 (53, 26) 65 (54, 26) 66 (54, 26) 67 (54, 26) 68 (54, 26) 69 (55, 26) 70 (55, 26) 71 (55, 26) 72 (20, 26)

Unnamed: 0,Position,Variant,Non-impaired,Impaired,Difference,Log2FC
0,7,R,0.549994,0.7127,0.162706,0.373879
1,7,S,0.262935,0.068443,0.194492,-1.941727
2,24,N,0.291948,0.443861,0.151914,0.604398
3,24,P,0.243821,0.115902,0.127919,-1.072913
4,24,S,0.126076,0.004252,0.121823,-4.889861
5,53,R,0.728426,0.880454,0.152028,0.273466
6,58,A,0.6535,0.478622,0.174879,-0.449303
7,58,S,0.077773,0.217259,0.139486,1.482079
8,59,H,0.188689,0.002631,0.186059,-6.164529
9,59,P,0.6214,0.755294,0.133894,0.281515



