In [394]:
# !/usr/bin/env python

# [Future imports]
# "print" function compatibility between Python 2.x and 3.x
from __future__ import print_function
# Use Python 3.x "/" for division in Python 2.x
from __future__ import division

# General Python
import sys
import os
sys.path.append('./')
# Data structures
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mpl
dpi = 200
mpl.rc("savefig", dpi=dpi)
%matplotlib inline
from scipy.stats.stats import pearsonr
from scipy.stats.stats import spearmanr
from scipy import stats
from scipy.optimize import curve_fit
from scipy.optimize import leastsq
import seaborn as sns
import math
import re

from lmfit import minimize, Parameters, report_fit
from matplotlib.colors import LogNorm

#additional imports
import pandas as pd
from Bio import SeqIO, SeqRecord,Seq
import mkl_random
import random

import os
import subprocess
from Bio import Entrez, SeqIO
from copy import deepcopy
from pathlib import Path

import datetime
import time
import json
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import re


## 1. Define functions used in this notebook

In [395]:
#define a function to convert a base position within the ORF of a variant 
#to a codon position

def get_codon(pos, orf_start=None):
    """Convert a 1-based base position into a 1-based codon number.

    Parameters
    ----------
    pos : int
        1-based position of the base of interest.
    orf_start : int, optional
        0-based index of the first base of the start codon. When omitted,
        the notebook-level global ``start_pos`` is used, so existing calls
        of the form ``get_codon(pos)`` behave exactly as before.

    Returns
    -------
    int or str
        The codon number (the start codon is codon 1), or the string
        ``'none'`` when the position lies before the start codon.
        No upper bound is applied: positions past the end of the ORF
        still receive a codon number.
    """
    if orf_start is None:
        #fall back to the global start position defined elsewhere in the notebook
        orf_start = start_pos

    #subtract 1 to match the 0-based index of the genome list
    zero_based_pos = pos - 1

    #positions before the start codon are outside the ORF
    if zero_based_pos < orf_start:
        return 'none'

    #1-based offset into the ORF, grouped in threes and rounded upward
    return math.ceil((zero_based_pos - orf_start + 1) / 3)
    
    
#define a function to identify the position of the stop codon within a list of
#codons for a protein: the codon list begins with ATG at index 0
def get_stop_codon_pos(codon_list, stop_codons=('TAA',)):
    """Return the index of the first stop codon in ``codon_list``.

    Parameters
    ----------
    codon_list : sequence of str
        Codons in reading order; index 0 is the start codon.
    stop_codons : tuple of str, optional
        Codons treated as stops. Defaults to ``('TAA',)``, which matches
        the original behaviour; pass e.g. ``('TAA', 'TAG', 'TGA')`` to
        recognise other stop codons as well.

    Returns
    -------
    int
        0-based index of the first matching codon.

    Raises
    ------
    ValueError
        If no stop codon is present (previously this surfaced as an
        ``UnboundLocalError`` on the return statement).
    """
    for index, codon in enumerate(codon_list):
        if codon in stop_codons:
            #position of stop codon in codon list where item 0 = start codon
            return index

    raise ValueError(
        "no stop codon " + repr(tuple(stop_codons)) + " found in codon_list"
    )


#define a function to return amino acid if given a codon sequence (single amino acid)
def get_amino_acid(codon_seq):
    """Translate ``codon_seq`` and return only the first residue.

    ``codon_seq`` is a nucleotide string; it is wrapped in a Biopython
    ``Seq`` and translated, and the first letter of the translation is
    returned as a one-character string.
    """
    from Bio.Seq import Seq

    protein = Seq(codon_seq).translate()
    return str(protein)[0]

#define a function to translate any DNA sequence
def get_translation(codon_seq):
    """Translate a nucleotide string and return the residues as a list.

    The string is wrapped in a Biopython ``Seq`` and translated; each
    letter of the translation becomes one element of the returned list.
    """
    from Bio.Seq import Seq

    return [residue for residue in Seq(codon_seq).translate()]

#define function to return codon change of single base substitutions
def get_var_change(pos_lists, pos_identity, codon_pos):
    """Summarise the amino-acid change caused by substitutions in one codon.

    Parameters
    ----------
    pos_lists : list
        1-based positions of the substituted bases (all within one codon).
    pos_identity : list
        Replacement base for each entry of ``pos_lists`` (same order).
    codon_pos : sequence
        Codon number(s) for the group; only the first element is read.
        The string ``"none"`` (as produced by ``get_codon``) marks a
        position outside the ORF.

    Returns
    -------
    str
        ``"<WT residue>_<codon number>_<variant residue>"``, or the string
        ``'NaN'`` when the codon is ``"none"``.

    Reads the notebook-level globals ``start_pos`` and ``genome_seq_list``
    and calls ``get_amino_acid``.
    """
    codon_number = codon_pos[0]

    #"none" means the mutation lies outside of the ORF
    if codon_number == "none":
        return 'NaN'

    #0-based index (into the genome list) of the codon's first base
    first_base_idx = math.trunc(((codon_number - 1) * 3) + start_pos)

    #wild-type codon and its amino acid
    WT_codon = ''.join(genome_seq_list[first_base_idx:first_base_idx + 3])
    WT_amino_acid = get_amino_acid(WT_codon)

    #same position expressed in the 1-based coordinates of the variant calls
    first_base_vcf = first_base_idx + 1

    #map each 1-based position of the codon to its wild-type base ...
    bases_by_position = {
        first_base_vcf + offset: WT_codon[offset] for offset in range(3)
    }

    #... then overwrite the substituted positions with their variant bases
    #(the remaining one or two positions keep the wild-type base)
    bases_by_position.update(zip(pos_lists, pos_identity))

    #read the bases back in insertion (i.e. positional) order and translate
    var_codon = ''.join(bases_by_position.values())
    var_amino_acid = get_amino_acid(var_codon)

    #string summarising the amino acid mutation
    return "_".join([WT_amino_acid, str(codon_number), var_amino_acid])

#define function to get amino acid change (could probably do this function and above in one instead)
#input dataframe for this function is the "barcode_codon_index" grouped dataframe
#therefore, only substitions within the same codon will be processed together
def get_amino_acid_change(df):
    """Return a copy of ``df`` with an added ``amino_acid_sub`` column.

    ``df`` is expected to be one group of the substitutions dataframe
    grouped by ``barcode_codon_index``, so every row belongs to the same
    codon. The ``codon``, ``position`` and ``altSeq`` columns are handed to
    ``get_var_change`` and its single result is written to every row.
    """
    annotated = df.copy()

    #pull the inputs for get_var_change out of the relevant columns
    codon_values = df.codon.values
    positions = list(df.position.values)
    alt_bases = list(df.altSeq.values)

    #one amino-acid summary for the whole group
    annotated['amino_acid_sub'] = get_var_change(positions, alt_bases, codon_values)
    return annotated

## 2. Define variant call directories and import corresponding genome fasta

In [396]:
##directories holding the variant call files (purexpress entire genome):
##one for plates 1-13 and one for plate 14
dirName_1to13_purexp_cov = '/Users/mja/Appel_MutSequencing/2021_mut_seq_workup_II/variant_calls/20201222_SU011_MJA_1-13_with_R1_R2_variant_calls'
dirName_14_purexp_cov = '/Users/mja/Appel_MutSequencing/2021_mut_seq_workup_II/variant_calls/20201219_SU012_MJA_14_with_R1_R2_variant_calls'

#generate list of entry names in each directory
#NOTE: os.listdir returns every entry in arbitrary order; the parsing cell
#further down reads barcode fields from each name, so any entry that is not
#a variant call file (e.g. a hidden file) would break it - TODO confirm the
#directories contain only variant call files
file_1to13_purexp = os.listdir(dirName_1to13_purexp_cov)
file_14_purexp = os.listdir(dirName_14_purexp_cov)

In [397]:
#import genome (a fasta file with header line and seq line)
#the context manager closes the file handle as soon as the lines are read
with open('/Users/mja/Appel_MutSequencing/snakemake_files/genomes/spap_with_egfp/spap_with_egfp.fa',"r") as genome_file:
    #process file into a list of lines
    genome_lines=genome_file.readlines()

#pull out sequence line as string
#only the second line is used, so the sequence must sit on a single line;
#readlines() keeps line terminators, so this string still carries the
#line's trailing newline if the file has one
genome_seq=str(genome_lines[1])

#convert string to list (one element per character) for reference
genome_seq_list=list(genome_seq)

In [398]:
#generate a 'genome' listed in codon format

#identify absolute (0-based) position of the first base of the first ATG in this genome
start_str="ATG"
start_pos = ''.join(genome_seq_list).find(start_str)

#fail with a clear message when there is no ATG at all
#(the former index-based scan ran off the end of the list in that case)
if start_pos == -1:
    raise ValueError("no " + start_str + " start codon found in genome_seq_list")

#select subset of genome file residing in coding region (here, reference genome
#spans 5'UTR, all of SpAP ORF, and some bases beyond SpAP ORF)
#copy from position of first base in start codon up to, but excluding, the
#final element of the list
#NOTE(review): the excluded final element is presumably the newline kept from
#the fasta sequence line - confirm; if the line has no trailing newline this
#drops a real base
genome_seq_coding_list=genome_seq_list[start_pos:len(genome_seq_list)-1]

#convert coding list back to a string for splitting into triplets
genome_seq_coding_str=''.join(genome_seq_coding_list)

#use regex to split string into list of every 3 characters
#(a trailing remainder of 1-2 characters is not included)
genome_seq_codon_list=re.findall('...', genome_seq_coding_str)

## 3. Process entire directory of vcf files and output dictionary with variant data

In [399]:
# Build a nested dictionary of variant records from every file in the directory:
#   dataD[i7 barcode (int)][(position, refSeq, altSeq)] -> dict of per-variant fields
dirName=dirName_1to13_purexp_cov
fileL=file_1to13_purexp

dataD = {}

for a in fileL:
    #get file name without extension
    name=a.split('.')

    #pull out i7+i5 from the dash-separated fields of the file name
    #(i5 is parsed but only i7 is used as the dictionary key)
    name_fields=name[0].split('-')
    i7=name_fields[3]
    i5=name_fields[7].split('_')[0]

    # create a dictionary entry for that barcode name (i7)
    # NOTE(review): a second file with the same i7 would replace the first
    # file's entry - confirm i7 is unique within this directory
    bcName=int(i7)
    dataD[bcName] = {}

    # open each file; the context manager closes it even if parsing fails
    with open(dirName + '/'+a,'r') as f:

        #read it line by line
        for line in f:
            # ignore header/comment lines (starting with #) and blank lines
            if line[0] == '#' or not line.strip():
                continue

            # create a tab-delimited list of things
            tempL = line.split('\t')
            # get the position number of a potential variant
            posNum = int(tempL[1])
            # get the reference sequence at that position
            refSeq = tempL[3]
            # get the variant sequence at that position
            altSeq = tempL[4]
            # get the filter score (sixth tab-separated field)
            filterScore = float(tempL[5])

            #get codon number from reference sequence
            codonNum=get_codon(posNum)

            #create tuple to uniquely identify variants even if they share the same position
            pos_unique=(posNum, refSeq, altSeq)

            # find the DP4 entry holding FW/RV reference and alternate read counts;
            # it sits at a variable spot in the ';'-separated list, so search by key.
            # The counts are reset for every record so that a record without DP4
            # can never silently reuse the previous record's numbers.
            dp4_counts = None
            for b in tempL[7].split(';'):
                if b.startswith('DP4='):
                    dp4_counts = [int(x) for x in b.split('=')[1].split(',')]
                    break

            if dp4_counts is None:
                raise ValueError('no DP4 entry for position ' + str(posNum) + ' in file ' + a)

            fw_WT, rv_WT, fw_alt, rv_alt = dp4_counts[:4]

            # now add to nested dictionary (key order sets the dataframe column order later)
            dataD[bcName][pos_unique] = {
                'refSeq': refSeq,
                'altSeq': altSeq,
                'filter': filterScore,
                'fw_WT': fw_WT,
                'rv_WT': rv_WT,
                'fw_alt': fw_alt,
                'rv_alt': rv_alt,
                'codon': codonNum,
                'position': posNum,
                'i7_barcode': bcName,
            }

len(dataD)

4646

## 4. Data filtering step: identify all i7 barcodes containing 10X read depth at >= 95% of positions (and at least 1 read at all positions)
-Coverage stat dataframes contain this information, import and make a list of barcodes failing to meet this depth threshold  
-In next step, converting dictionary to dataframe, remove these barcodes

In [400]:
#load the depth statistics (from notebook 'II_Coverage_Stats'), discard the
#unnamed column written with the csv, and treat missing values as zero
coverageDF = (
    pd.read_csv('/Users/mja/Appel_MutSequencing/2021_mut_seq_workup_II/analyses_from_py_notebooks/with_unpaired/depth_results/20210602_plates_1-13_read_stats_allgenomes_cov_stats.csv')
    .drop(columns=['Unnamed: 0'])
    .fillna(0)
)

#keep barcodes covered at least once at every position ...
covered_everywhere = coverageDF['frac_pos_cov>=1'].eq(1.00)
#... and covered at least 10x at more than 0.94 of positions
#NOTE(review): the section heading says ">= 95%"; this strict "> 0.94" also
#admits values between 0.94 and 0.95 unless the fractions are rounded to two
#decimals - confirm
covered_10x = coverageDF['frac_pos_cov>=10'].gt(0.94)

barcodesToKeepDF = coverageDF.loc[covered_everywhere & covered_10x]

#i7 barcodes passing both thresholds
barcodesKeepL = barcodesToKeepDF['i7'].tolist()

len(barcodesKeepL)

3530

## 5. Convert dictionary (for barcodes containing variants only) to dataframe
-Remove all barcodes not in keep list from step 4  
-after that, find empty vcf files, these are WT  
-Results 6/3/21:  
-3530 meeting depth threshold, 327 are WT, 3203 have variants (of any type)  
-Of 3203, 174 barcodes have indels (at least 1 indel, and any other type of sub), and 3029 have only single base substitutions (but could be single, double, triple or higher mutants)

In [401]:
#convert dictionary into dataframe for downstream steps
dataframeflatdict = []   # one dict per variant record, for barcodes passing the depth filter
BCs_with_empty_vcf=[]    # barcodes passing the depth filter with no variant records
skipped=[]               # barcodes not in the keep list

#set gives constant-time membership tests instead of scanning the list per barcode
barcodes_keep_set = set(barcodesKeepL)

#create flat list of records from nested dictionary above
for k, v in dataD.items():

    #barcodes failing the depth thresholds are only recorded, not flattened
    if k not in barcodes_keep_set:
        skipped.append(k)
        continue

    #no entries means the vcf file was empty; store barcode to reference later
    if not v:
        BCs_with_empty_vcf.append(k)

    #each value is the innermost per-variant dictionary
    dataframeflatdict.extend(v.values())

#make new dataframe from flattened records
dataDF=pd.DataFrame(dataframeflatdict)

#add "indel flag" if either the reference or the variant sequence is not a single base
dataDF['indel_flag'] = (dataDF.altSeq.str.len() != 1) | (dataDF.refSeq.str.len() != 1)

#index by (indel_flag, codon) so that .loc[True] / .loc[False] select indels / substitutions
dataDF = dataDF.set_index(['indel_flag','codon'])

dataDF.loc[True]

Unnamed: 0_level_0,refSeq,altSeq,filter,fw_WT,rv_WT,fw_alt,rv_alt,position,i7_barcode
codon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
168,CT,CTT,43.9865,6,5,2,4,576,3666
582,CGGCGAGGGCGAGGGCGA,CGGCGAGGGCGA,221.0000,2,0,4,3,1818,1142
267,AG,A,223.0000,0,0,14,6,872,4324
490,AGG,AG,225.0000,0,0,29,29,1542,6938
44,ATT,AT,225.0000,0,0,16,7,203,1242
...,...,...,...,...,...,...,...,...,...
486,GTTTT,GTTT,225.0000,0,0,69,82,1530,4412
357,TGA,T,225.0000,0,0,42,25,1142,6025
203,CG,C,225.0000,0,0,15,14,680,3941
11,CCCTGCGGCACGTTCCATCGCAGCTACGCCTCCTAAACTGATCGTG...,C,30.4183,0,0,1,0,105,860


#### 5.1 Export DFs containing barcodes tossed for failing depth thresholds, and WT barcodes

In [402]:
#export two single-column ('i7_barcode') csv files for later reference; the
#number of barcodes in each list is appended to the file name
# - skipped: barcodes NOT meeting depth thresholds -- cannot be categorized as WT or variant
# - BCs_with_empty_vcf: WT barcodes, i.e. meeting depth thresholds but having empty vcf files
for barcode_list, csv_prefix in (
    (skipped, '/Users/mja/Appel_MutSequencing/2021_mut_seq_workup_II/analyses_from_py_notebooks/with_unpaired/mutant_calling/csv_files/20210603_plates_1-13_barcodes_depth_failed_'),
    (BCs_with_empty_vcf, '/Users/mja/Appel_MutSequencing/2021_mut_seq_workup_II/analyses_from_py_notebooks/with_unpaired/mutant_calling/csv_files/20210603_plates_1-13_barcodes_WT_'),
):
    pd.DataFrame({'i7_barcode': barcode_list}).to_csv(csv_prefix + str(len(barcode_list)) + '.csv')

## 6. Separate indel variants from single base substitutions
Here, want to:
1) calculate quantity of BCs containing indels and store list of these BCs
2) Select entries only containing non-indels
3) Remove any additional BCs that contained non-indels but also indel entries from step 1

In [403]:
#store indels in own dataframe (rows whose indel_flag index level is True)
indels=dataDF.loc[True]

#get list of BCs containing indels
indelBCsL = indels.i7_barcode.values.tolist()

# get DF containing only non indel substitutions
dataDF_non_indels=dataDF.loc[False]

#select only entries with barcodes that are not in the list of BCs containing at least one indel;
#reset_index returns a new dataframe (moving 'codon' from the index back to a column), so the
#column added below is written to an independent object rather than to a slice of dataDF
#(this avoids pandas' SettingWithCopyWarning)
substitutions=(
    dataDF_non_indels
    .loc[~dataDF_non_indels.i7_barcode.isin(indelBCsL)]
    .reset_index(drop=False)
)

#key identifying one codon within one barcode, e.g. "774_24"
substitutions['barcode_codon_index'] = substitutions['i7_barcode'].astype(str) + "_" +substitutions['codon'].astype(str)

#get unique number of i7 barcodes
substitutions['i7_barcode'].nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  substitutions['barcode_codon_index'] = substitutions['i7_barcode'].astype(str) + "_" +substitutions['codon'].astype(str)


3029

## 7. Get amino acid change for single base substitution

In [404]:
#cast codon column as object (as opposed to int) if needed here
#(get_codon returns either an integer codon number or the string 'none')
substitutions=substitutions.astype({'codon': 'object'})
#not the last expression of the cell, so this is not displayed
substitutions.dtypes

#add amino acid substitution value to dataframe with single base substitutions only:
#rows sharing a barcode_codon_index (same barcode, same codon) are passed together to
#get_amino_acid_change, which returns them with an added 'amino_acid_sub' column.
#In the recorded output the result is indexed by (barcode_codon_index, original row
#label) while 'barcode_codon_index' also remains a regular column.
df_with_subs=substitutions.groupby(['barcode_codon_index']).apply(get_amino_acid_change)
#spot-check one barcode
df_with_subs.loc[df_with_subs['i7_barcode']==774]

Unnamed: 0_level_0,Unnamed: 1_level_0,codon,refSeq,altSeq,filter,fw_WT,rv_WT,fw_alt,rv_alt,position,i7_barcode,barcode_codon_index,amino_acid_sub
barcode_codon_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
774_24,5048,24,C,G,225.0,0,0,403,257,142,774,774_24,L_24_V
774_24,5049,24,G,T,225.0,0,0,400,256,144,774,774_24,L_24_V


## 8. Add column containing ratio of variant:WT reads for each codon sub  
EDIT: average the (F+R) sum of variant and WT reads across all nt substitutions for a given codon sub 
-This will preserve the effects of having slightly different read counts for nt subs within the same AA sub 
-Add columns containing sums of F and R reads for WT and var  
-Add column with ratio of var:WT

In [405]:
#copy DF from step 7
df_with_subs_ratio=df_with_subs.copy()

#total each read-count column over all rows sharing a barcode+codon and broadcast
#the totals back onto every row (one grouped pass instead of four .loc lookups per row)
read_cols=['fw_alt','rv_alt','fw_WT','rv_WT']
codon_read_totals=(
    df_with_subs[read_cols]
    .groupby(df_with_subs['barcode_codon_index'].values)
    .transform('sum')
)

#halve the FWD and REV totals (for variant and WT sequences), add them, and truncate to int
var_sum_ave=(codon_read_totals['fw_alt']/2 + codon_read_totals['rv_alt']/2).astype(int)
WT_sum_ave=(codon_read_totals['fw_WT']/2 + codon_read_totals['rv_WT']/2).astype(int)

#account for round down averages that produce variant sum averages that =0
var_sum_ave=var_sum_ave.mask(var_sum_ave == 0, 1)

#keep the per-row values as plain lists (row order matches df_with_subs_ratio)
aveVarSumL=var_sum_ave.tolist()
aveWTSumL=WT_sum_ave.tolist()

#add averaged sum columns to DF
df_with_subs_ratio['WT_sum']=aveWTSumL
df_with_subs_ratio['var_sum']=aveVarSumL

#add column containing ratio
#NOTE: WT_sum is not floored like var_sum, so a codon with no WT reads gets ratio inf
df_with_subs_ratio['ratio_var']=round((df_with_subs_ratio['var_sum']/df_with_subs_ratio['WT_sum']),2)


df_with_subs_ratio.loc[df_with_subs_ratio['i7_barcode']==6135]

Unnamed: 0_level_0,Unnamed: 1_level_0,codon,refSeq,altSeq,filter,fw_WT,rv_WT,fw_alt,rv_alt,position,i7_barcode,barcode_codon_index,amino_acid_sub,WT_sum,var_sum,ratio_var
barcode_codon_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
6135_384,1640,384,A,C,7.30814,0,0,1,0,1223,6135,6135_384,Y_384_S,0,1,inf


## 9. Drop the extra rows for codons changed at more than one position; keep one row per barcode and codon change

In [406]:
#a codon substitution arising from base changes at more than one position has one
#row per position; keep only the first row for each barcode+codon
df_with_subs_culled_BC_C_index = df_with_subs_ratio.drop_duplicates(
    subset='barcode_codon_index',
    keep='first',
)

#spot-check one barcode
df_with_subs_culled_BC_C_index[df_with_subs_culled_BC_C_index['i7_barcode'].eq(774)]

Unnamed: 0_level_0,Unnamed: 1_level_0,codon,refSeq,altSeq,filter,fw_WT,rv_WT,fw_alt,rv_alt,position,i7_barcode,barcode_codon_index,amino_acid_sub,WT_sum,var_sum,ratio_var
barcode_codon_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
774_24,5048,24,C,G,225.0,0,0,403,257,142,774,774_24,L_24_V,0,658,inf


## Add plate and well information to each entry

In [387]:
# ##import csv containing i7/i5 barcodes, experimental plate number, and wells
# ##plates 1-13
# barcode_key1to13=pd.read_csv('/Users/mja/Appel_MutSequencing/Barcodes/20201212_MJA_1-13_plate_well_key.csv')
# ##plates 14
# # barcode_key14=pd.read_csv('/Users/mja/Appel_MutSequencing/Barcodes/20201212_MJA_14_plate_well_key.csv')

# #select only plates 1-13 (since 14 contains duplicate barcodes and was sequenced separately)
# plates1to13_barcodes=barcode_key1to13.loc[barcode_key1to13['plate'].isin(range(1,14))]
# plates1to13_barcodes_i7_only=plates1to13_barcodes[['i7','plate','well']].copy()

# #select barcodes for plate 14 only
# barcodes14=barcode_key14.loc[barcode_key14['plate'] == 14]
# plates14_barcodes_i7_only=barcodes14[['i7','plate','well']].copy()

# ##merge plate1-13 barcode list with summary dataframe
# df_with_subs_culled_BC_C_index = pd.merge(left=df_with_subs_culled_BC_C_index, right=plates1to13_barcodes_i7_only, left_on='i7_barcode', right_on='i7')

# ##repeat for plate 14
# # plate14_df_wells = pd.merge(left=plate14_df, right=plates14_barcodes_i7_only, left_on='i7', right_on='i7')

# df_with_subs_culled_BC_C_index

## 10. Add plate and well data to aggregated Plates 1-13 dataframes, export, then split into plates and export

In [407]:
#import csv containing i7/i5 barcodes, experimental plate number, and wells
barcode_key=pd.read_csv('/Users/mja/Appel_MutSequencing/Barcodes/20201212_MJA_1-13_plate_well_key.csv')

#select only plates 1-13 (the end of range() is exclusive)
plates=range(1,14)
barcodes=barcode_key.loc[barcode_key['plate'].isin(plates)]

#keep barcode, plate and well; rename the barcode column to match the column name
#used for barcodes in the dataframes within this notebook
barcodes_i7_only=barcodes[['i7','plate','well']].rename(columns={"i7": "i7_barcode"})

#list of dataframes to add plate and well information to
list_of_summary_DFs=[dataDF,indels,df_with_subs_culled_BC_C_index]

#specify list of names to use as keys (also used in the exported file names)
list_of_df_names=["mutant_calling_initial_dataframe_all_vars",
                  "mutant_calling_indels",
                  "mutant_calling_all_AA_subs"]

#pair each name with its dataframe
df_dict= dict(zip(list_of_df_names, list_of_summary_DFs))

#dict of plate+well annotated dataframes, keyed by name
df_dict_plate_well={}

#dict of the same dataframes split by plate, keyed by "plate_<n>_<name>"
split_df_dict_plate_well={}

#output directories (fixed for the whole cell, so built once)
#aggregated plates 1-13 files
fullDF_P = Path('/Users/mja/Appel_MutSequencing/2021_mut_seq_workup_II/analyses_from_py_notebooks/with_unpaired/mutant_calling/csv_files/')
#per-plate files
p = Path('/Users/mja/Appel_MutSequencing/2021_mut_seq_workup_II/analyses_from_py_notebooks/with_unpaired/mutant_calling/csv_files/plate-well/')

#iterate through dictionary of dataframes
for df_name, df in df_dict.items():

    #attach plate and well to every row by matching on i7_barcode
    #NOTE(review): merging on a column discards the left frame's index, so the
    #'indel_flag'/'codon' index levels of dataDF and indels are not carried into
    #the merged (and exported) frames - confirm this is intended
    df_with_plate_well= pd.merge(left=df, right=barcodes_i7_only, on='i7_barcode')

    #add df to dict
    df_dict_plate_well[df_name]=df_with_plate_well

    #export plate aggregated DF
    df_with_plate_well.to_csv(Path(fullDF_P, "plates_1-13_"+df_name+'.csv'))

    #for each plate number, create new dataframe containing only that plate's rows
    for i in plates:
        split_df=df_with_plate_well.loc[df_with_plate_well['plate'] == i]

        #key name under which this dataframe is stored, also used as the file name
        df_name_with_plate="plate_"+str(i)+"_"+df_name

        #add to dictionary to access later if needed
        split_df_dict_plate_well[df_name_with_plate]=split_df

        #export dataframe to csv using dynamic naming to specify plate
        split_df.to_csv(Path(p, df_name_with_plate+'.csv'))