# Import Requirements

In [1]:
from __future__ import division
import glob
import pandas as pd
import numpy as np
import itertools
from collections import Counter
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Define functions

In [2]:
def tsvfile_to_df(tsv_file):
    patient = tsv_file.split('/')[-1].split('.')[0].split('-')[0]
    visit = tsv_file.split('/')[-1].split('.')[0].split('-')[1]
    df = pd.read_csv(tsv_file, delimiter='\t')
    df['AAPos'] = df['AAPos']+1
    df['Patient'] = patient
    df['Visit'] = visit
    order = ['Patient','Visit','Prot','AAPos','AA','Count']
    df = df[order]
    return df

def remove_gaps(df):
    nogaps_df = df[df['AA']!='*']
    return nogaps_df

def makeFreqDict(df):
    #intialize dictionary
    aminoacids = 'ARNDCQEGHILKMFPSTWYV'
    freq_dict = {'Patient':[], 'Visit':[], 'Prot':[], 'AAPos':[], 'Coverage':[],
                 'A':[],'R':[],'N':[],'D':[],'C':[],'Q':[],'E':[],'G':[],'H':[],'I':[],
                 'L':[],'K':[],'M':[],'F':[],'P':[],'S':[],'T':[],'W':[],'Y':[],'V':[],}

    for i, group in df.groupby('AAPos'):  
        #convert counts to percentages
        total = sum(group['Count'])
        group['Percent'] = group['Count']/total
        #print group
        #get values for each position
        patient = list(group['Patient'])[0]
        visit = list(group['Visit'])[0]
        orf = list(group['Prot'])[0]
        pos = list(group['AAPos'])[0]
        cov = total
        #fill up dictionary with values
        freq_dict['Patient'].append(patient)
        freq_dict['Visit'].append(visit)
        freq_dict['Prot'].append(orf)
        freq_dict['AAPos'].append(pos)
        freq_dict['Coverage'].append(cov)  
        #fill up dictionary with frequencies
        for aa in aminoacids:
            x = group[group['AA'] == aa]
            if x.shape[0] == 0:
                freq_dict[aa].append(0)
            else:
                freq_dict[aa].append(list(x['Percent'])[0])
    return freq_dict

def freqToDataframe(freq_dict):
    col_order = ['Patient','Visit','Prot','AAPos','Coverage',
             'A','R','N','D','C','Q','E','G','H','I',
             'L','K','M','F','P','S','T','W','Y','V']
    freq_df = pd.DataFrame(freq_dict)[col_order]
    return freq_df

In [8]:
def makeCountDict(df):
    #intialize dictionary
    aminoacids = 'ARNDCQEGHILKMFPSTWYV'
    freq_dict = {'Patient':[], 'Visit':[], 'Prot':[], 'AAPos':[], 'Coverage':[],
                 'A':[],'R':[],'N':[],'D':[],'C':[],'Q':[],'E':[],'G':[],'H':[],'I':[],
                 'L':[],'K':[],'M':[],'F':[],'P':[],'S':[],'T':[],'W':[],'Y':[],'V':[],}
    for i, group in df.groupby('AAPos'):
        total = sum(group['Count'])
        #get values for each position
        patient = list(group['Patient'])[0]
        visit = list(group['Visit'])[0]
        orf = list(group['Prot'])[0]
        pos = list(group['AAPos'])[0]
        cov = total
        #fill up dictionary with values
        freq_dict['Patient'].append(patient)
        freq_dict['Visit'].append(visit)
        freq_dict['Prot'].append(orf)
        freq_dict['AAPos'].append(pos)
        freq_dict['Coverage'].append(cov)  
        #fill up dictionary with frequencies
        for aa in aminoacids:
            x = group[group['AA'] == aa]
            if x.shape[0] == 0:
                freq_dict[aa].append(0)
            else:
                freq_dict[aa].append(list(x['Count'])[0])
    return freq_dict

# Process Tat 1

In [3]:
t = time.time()

tsv_files = glob.glob('/Users/greg/Desktop/FullNeuroIllumina/Counts/Tat1/*.tsv')

for tsv_file in tsv_files:
    
    out_file = tsv_file.replace('/Counts/','/Frequency/').replace('.AAcounts.tsv','.AAfreq.csv')
    df = tsvfile_to_df(tsv_file)
    nogap_df = remove_gaps(df)
    freq_dict = makeFreqDict(nogap_df)
    freq_df = freqToDataframe(freq_dict)
    freq_df.to_csv(out_file, index=False)
    
elapsed = time.time() - t
print elapsed

182.921114922


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Process Tat 2

In [None]:
t = time.time()

tsv_files = glob.glob('/Users/greg/Desktop/FullNeuroIllumina/Counts/Tat2/*.tsv')

for tsv_file in tsv_files:
    
    out_file = tsv_file.replace('/Counts/','/Frequency/').replace('.AAcounts.tsv','.AAfreq.csv')
    df = tsvfile_to_df(tsv_file)
    nogap_df = remove_gaps(df)
    freq_dict = makeFreqDict(nogap_df)
    freq_df = freqToDataframe(freq_dict)
    freq_df.to_csv(out_file, index=False)
    
elapsed = time.time() - t
print elapsed

# Diversity

In [9]:
t = time.time()
tsv_files = glob.glob('/Users/greg/Desktop/FullNeuroIllumina/Counts/Tat1/*.tsv')

for tsv_file in tsv_files:
    
    out_file = tsv_file.replace('/Counts/','/Diversity/').replace('.AAcounts.tsv','.AAfreqcount.csv')
    df = tsvfile_to_df(tsv_file)
    nogap_df = remove_gaps(df)
    count_dict = makeCountDict(nogap_df)
    count_df = freqToDataframe(count_dict)
    count_df.to_csv(out_file, index=False)
    
elapsed = time.time() - t
print elapsed

50.8182630539


In [10]:
t = time.time()
tsv_files = glob.glob('/Users/greg/Desktop/FullNeuroIllumina/Counts/Tat2/*.tsv')

for tsv_file in tsv_files:
    
    out_file = tsv_file.replace('/Counts/','/Diversity/').replace('.AAcounts.tsv','.AAfreqcount.csv')
    df = tsvfile_to_df(tsv_file)
    nogap_df = remove_gaps(df)
    count_dict = makeCountDict(nogap_df)
    count_df = freqToDataframe(count_dict)
    count_df.to_csv(out_file, index=False)
    
elapsed = time.time() - t
print elapsed

22.0378270149


# Consensus

In [None]:
freq_files = glob.glob('/Users/greg/Desktop/FullNeuroIllumina/Frequency/Tat1/*.csv')

cols = ['A','R','N','D','C','Q','E','G','H','I',
        'L','K','M','F','P','S','T','W','Y','V']

for freq_file in freq_files:
    df = pd.read_csv(freq_file)
    if df.shape[0] > 70:
        a = df[cols].idxmax(axis=1, skipna=True)
        print ''.join(list(a))

In [None]:
freq_files = glob.glob('/Users/greg/Desktop/FullNeuroIllumina/Frequency/Tat2/*.csv')

cols = ['A','R','N','D','C','Q','E','G','H','I',
        'L','K','M','F','P','S','T','W','Y','V']

for freq_file in freq_files:
    df = pd.read_csv(freq_file)
    if df.shape[0] > 12:
        a = df[cols].idxmax(axis=1, skipna=True)
        print ''.join(list(a))