In [1]:
import os
import random
from collections import defaultdict
import numpy as np
import copy
import multiprocessing

In [2]:
#replace PATH with the conda and system bin directories, searched in this order
os.environ['PATH'] = ':'.join([
    '/opt/anaconda3/bin',
    '/opt/anaconda3/condabin',
    '/usr/local/bin',
    '/usr/bin',
    '/bin',
    '/usr/sbin',
    '/sbin',
])

In [3]:
import pickle

In [4]:
#load in dictionary of normal variation with allele freq > .001
#NOTE: pickle.load can execute arbitrary code - only load a pickle file you trust
os.chdir('/Users/beth/Documents/SNV project/')
with open('gnomad_dictionary_2.pickle', mode='rb') as pickle_handle:
    normal_var_dict = pickle.load(pickle_handle)


In [5]:
#create the functions for base change, insertion, deletion
def update_sequence(changes_list, seq, offset, start, chrom, current_organ,
                    fasta_path='/Volumes/BethMac/hg38/hg38.fa'):
    """Apply base changes, insertions and deletions to a 100bp window sequence.

    Every entry of ``changes_list`` is applied once, in list order.

    Parameters
    ----------
    changes_list : list
        Each entry is either the string "skip" (ignored) or a 5-item row
        ``[chrom, position, expected, candidate, last]``:
        index 1 is the position (converted with ``int`` and reduced by 1),
        index 2 is the allele expected in the sequence, index 3 and index 4
        are the candidate replacement alleles. "-" marks an absent allele.
        Index 4 may also be a float (the callers in this notebook pass
        normal-variation rows whose index 4 they read as a frequency); in
        that case only index 3 is considered as replacement.
    seq : str
        Current sequence of the window.
    offset : sequence of numbers, length 100
        Per-position shift introduced by indels that were already applied.
    start : int
        Coordinate of the first base of the window.
    chrom : str
        Chromosome name, used only when extra bases must be fetched.
    current_organ : str
        Used to name the temporary BED file ``temp_location_<organ>.bed``.
    fasta_path : str, optional
        Reference FASTA handed to ``bedtools getfasta``; the default is the
        path that was previously hard-coded.

    Returns
    -------
    (seq, offset) : the updated sequence and the updated offset array.

    Raises
    ------
    subprocess.CalledProcessError
        If ``bedtools getfasta`` exits with an error while refilling a
        window after a deletion.
    """
    import subprocess  #local import: only the refill after a deletion needs it

    for change in changes_list:

        #do nothing if it is a skip
        if change == "skip":
            continue

        expected, candidate, last = change[2], change[3], change[4]
        last_is_float = isinstance(last, float)
        same_length = len(expected) == len(candidate)

        #if it's a base change do this:
        #no "-" anywhere, index 2 and 3 equally long, and index 4 is either a
        #float or a string as long as index 2
        if expected != "-" and last != "-" and candidate != "-" and same_length and \
        (last_is_float or len(expected) == len(str(last))):

            #the replacement is whichever of index 3 / index 4 differs from index 2
            new_base = candidate if expected != candidate else last
            ref_pos = int(change[1]) - 1
            adjust = int(offset[ref_pos - start])
            cut = ref_pos + adjust - start

            seq = seq[0:cut] + new_base + seq[cut + len(new_base):]

        #if it's an insertion do this:
        elif expected == "-" or (len(expected) < len(str(last)) and not last_is_float):

            new_base = candidate if expected != candidate else last
            ref_pos = int(change[1]) - 1
            adjust = int(offset[ref_pos - start])
            cut = ref_pos + adjust - start

            #rows that spell out the expected base (instead of "-") repeat it
            #at the front of the inserted allele, so that base is overwritten
            if expected == "-":
                expected_base = ""
                genie_fix = 0
            else:
                expected_base = expected
                genie_fix = 1

            seq = seq[0:cut + 1 - genie_fix] + new_base + seq[cut + 1: 100 - len(new_base) + genie_fix]

            if len(seq) > 100:
                seq = seq[:100]

            #positions after the insertion are shifted right by the net gain
            offset = np.add(offset, np.concatenate((
                np.zeros(ref_pos - start + 1 - genie_fix),
                np.repeat(len(new_base) - len(expected_base), start + 100 - ref_pos - 1 + genie_fix))))

        #if it's a deletion do this:
        elif last == "-" or candidate == "-" or \
        (len(str(last)) < len(expected) and not last_is_float):

            #a partial deletion keeps the bases given in index 4
            if last != "-" and candidate != "-":
                new_base = last
            else:
                new_base = ""

            ref_pos = int(change[1]) - 1
            adjust = int(offset[ref_pos - start])
            expected_base = expected
            cut = ref_pos + adjust - start

            seq = seq[0:cut] + new_base + seq[cut + len(expected_base): 100]

            if len(seq) < 100:
                #refill the window with as many reference bases as were removed
                #NOTE(review): the coordinate uses the offset from before this
                #deletion - confirm the fetched bases are the ones intended
                current_end = len(seq) + start - int(offset[len(seq)]) - 1

                temp = 'temp_location_{}.bed'.format(current_organ)
                with open(temp, 'w') as file:
                    file.write(chrom + "\t" + str(current_end) + "\t" +
                               str(current_end + len(expected_base) - len(new_base)) + "\n")
                try:
                    #argument list (no shell): file names are never re-parsed
                    fetched = subprocess.run(
                        ['bedtools', 'getfasta', '-fi', fasta_path, '-bed', temp],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                        universal_newlines=True, check=True)
                finally:
                    #remove the temp file even when bedtools fails
                    os.remove(temp)
                #line 0 is the FASTA header, line 1 the bases
                seq = seq + fetched.stdout.splitlines()[1]

            #positions from the deletion onwards are shifted left by the net loss
            offset = np.add(offset, np.concatenate((
                np.zeros(ref_pos - start),
                np.repeat(-(len(expected_base) - len(new_base)), start + 100 - ref_pos))))

    return seq, offset
            

In [6]:
#get the samples list from the TCGA MAF file - here liver cancer
os.chdir("/Users/beth/Desktop/MetisProject5data")
TCGA_file_liver = "TCGA.LIHC.mutect.a630f0a0-39b3-4aab-8181-89c1dde8d3e2.DR-10.0.somatic.maf"

samples_liver = !tail -n +7 $TCGA_file_liver | cut -f16 | sort | uniq



In [10]:
def generate_sequences(TCGA_file,samples,current_organ,normal_var_dict):
    #looping over each sample
    for i in range(95,len(samples)):
        current_sample = samples[i]
        sample_mutations = !grep $current_sample $TCGA_file | tail -n +2 | cut -f5,6,11,12,13
        sample_mutations = list(map(lambda x: tuple(x.split('\t')), sample_mutations))
        sample_mutations_dict = defaultdict(list)

        #creating sample_mutations_dict
        for i in range(len(sample_mutations)):
            sample_mutations_dict[sample_mutations[i][0]].append([int(sample_mutations[i][1]), sample_mutations[i][2], sample_mutations[i][3], sample_mutations[i][4]])

        window_starts=[]
        skip_list=[]
        #looping over each mutation in sample_mutations list of tuples 
        for i in range(len(sample_mutations)):
            current_location = sample_mutations[i]

            #to get windows of 100bp around it (note: hg38 is one position behind both MAF and VCF)
            len_mutation = max(len(current_location[2]),len(current_location[4]))
            if len_mutation > 45:
                skip_list.append(current_location)
                window_starts.append('skip')
                continue
            for j in range(1):
                window = range(int(current_location[1])-100+len_mutation,int(current_location[1])-len_mutation)
                start = random.choice(list(window))
                end = start + 100
                chrom = current_location[0]
                window_starts.append(start)
                with open("sample_locations_{}.tmp".format(current_organ), "a") as file:
                    file.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n")
        
        locations = "sample_locations_{}.tmp".format(current_organ)
        #getting the hg38 sequences for the windows     
        #lock.acquire()
        ref_seqs = !bedtools getfasta -fi /Volumes/BethMac/hg38/hg38.fa -bed $locations
        #lock.release()

        #loop once for each window
        for i in range(1,len(ref_seqs),2):
            #getting the seq
            current_location=sample_mutations[i//2]
            if current_location in skip_list:
                continue
            ref_seq= ref_seqs[i]
            chrom = current_location[0]
            seq=copy.copy(ref_seq)
            start=window_starts[i//2]

            #getting a list of tuples for each mutation within the window
            nearby_mutations=[current_location]
            for i in range(len(sample_mutations_dict[chrom])):
                x = int(sample_mutations_dict[chrom][i][0])
                if x > int(start) and x < (int(start) + 100) and x != int(current_location[1]):
                    nearby_mutations.append(sample_mutations_dict[chrom][i])

            #add chrom names to mutation lists
            for i in range(1,len(nearby_mutations)):
                nearby_mutations[i] = [chrom]+nearby_mutations[i]

            #getting a list of tuples for possible normal variation within the window
            nearby_variation=[]
            for i in range(len(normal_var_dict[chrom])):
                x = int(normal_var_dict[chrom][i][0])
                if x > start and x < (start + 100):
                    nearby_variation.append(normal_var_dict[chrom][i])

            #add chrom names to normal variation lists
            for i in range(len(nearby_variation)):
                nearby_variation[i] = [chrom]+nearby_variation[i]

            #selecting which normal variation to include for the window
            nearby_variation_random=[] 
            for i in range(len(nearby_variation)):
                freq = nearby_variation[i][4]
                if freq > 0.5:
                    included = random.choices([nearby_variation[i], "skip"], weights=[freq, 1-freq], k=1)
                else:
                    included = random.choices([nearby_variation[i], "skip"], weights=[0.5, 0.5], k=1)
                nearby_variation_random.append(included[0])

            #make changes and append sequence to files

            #the offset keeps track of effects of indels
            offset=np.zeros(100).astype(int)

            #add in the nearby normal variation
            for i in range(len(nearby_variation_random)):
                seq, offset = update_sequence(nearby_variation_random, seq, offset, start, chrom, current_organ)

            #if a big insertion leads to other variation outside the 100bp window those will 
            #append to the end and can be removed
            if len(seq)>100:
                seq=seq[:100]

            with open('{}_normal.csv'.format(current_organ), 'a') as file:
                file.write(current_sample + "," + seq + "," + ref_seq + "\n")

            #add in the mutations on top of normal variation
            for i in range(len(nearby_mutations)):
                seq, offset = update_sequence(nearby_mutations, seq, offset, start, chrom, current_organ)

            #if big insertion made a mutation was outside of the 100bp window it will have
            #appeneded to the end. So there may be no mutation in the 100bp window and it should
            #be discarded if over 100bp
            if len(seq)==100:
                with open('{}_tumor.csv'.format(current_organ), 'a') as file:
                    file.write(current_sample + "," + seq + "," + ref_seq + "\n")
        
        #clear the sample locations file
        
        !rm $locations





In [7]:
#move into the GENIE data directory; relative file names used afterwards resolve from here
os.chdir("/Users/beth/Desktop/MetisProject5data/GENIE")

In [8]:
#GENIE data requires a different function (generate_sequences_genie reads it as comma-separated)
#file name is relative to the current working directory
genie_file = "genie_maf_processed.maf"

In [19]:
import re

In [24]:
def generate_sequences_genie(genie_file,samples,normal_var_dict):
    #looping over each sample
    for i in range(len(samples)):
        current_sample = samples[i][0]
        current_organ = samples[i][1]
        if "/" in current_organ:
            current_organ = re.search("^([A-Z]*)/", current_organ)[1]
        #sample_mutations = !grep $current_sample $TCGA_file | tail -n +2 | cut -f5,6,11,12,13
        sample_mutations = !grep $current_sample $genie_file | cut -f2,3,4,5,6 -d ","
        #sample_mutations = list(map(lambda x: tuple(x.split('\t')), sample_mutations))
        sample_mutations = list(map(lambda x: tuple(x.split(',')), sample_mutations))
        sample_mutations_dict = defaultdict(list)

        #creating sample_mutations_dict
        for i in range(len(sample_mutations)):
            sample_mutations_dict[sample_mutations[i][0]].append([int(sample_mutations[i][1]), sample_mutations[i][2], sample_mutations[i][3], sample_mutations[i][4]])

        window_starts=[]
        skip_list=[]
        #looping over each mutation in sample_mutations list of tuples 
        for i in range(len(sample_mutations)):
            current_location = sample_mutations[i]

            #to get windows of 100bp around it (note: hg38 is one position behind both MAF and VCF)
            len_mutation = max(len(current_location[2]),len(current_location[4]))
            if len_mutation > 45:
                skip_list.append(current_location)
                window_starts.append('skip')
                continue
            for j in range(1):
                window = range(int(current_location[1])-100+len_mutation,int(current_location[1])-len_mutation)
                start = random.choice(list(window))
                end = start + 100
                chrom = current_location[0]
                window_starts.append(start)
                with open("sample_locations_{}.tmp".format(current_organ), "a") as file:
                    file.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n")
        
        locations = "sample_locations_{}.tmp".format(current_organ)
        #getting the hg38 sequences for the windows     
        #lock.acquire()
        ref_seqs = !bedtools getfasta -fi /Volumes/BethMac/hg38/hg38.fa -bed $locations
        #lock.release()

        #loop once for each window
        for i in range(1,len(ref_seqs),2):
            #getting the seq
            current_location=sample_mutations[i//2]
            if current_location in skip_list:
                continue
            ref_seq= ref_seqs[i]
            chrom = current_location[0]
            seq=copy.copy(ref_seq)
            start=window_starts[i//2]

            #getting a list of tuples for each mutation within the window
            nearby_mutations=[current_location]
            for i in range(len(sample_mutations_dict[chrom])):
                x = int(sample_mutations_dict[chrom][i][0])
                if x > int(start) and x < (int(start) + 100) and x != int(current_location[1]):
                    nearby_mutations.append(sample_mutations_dict[chrom][i])

            #add chrom names to mutation lists
            for i in range(1,len(nearby_mutations)):
                nearby_mutations[i] = [chrom]+nearby_mutations[i]

            #getting a list of tuples for possible normal variation within the window
            nearby_variation=[]
            for i in range(len(normal_var_dict[chrom])):
                x = int(normal_var_dict[chrom][i][0])
                if x > start and x < (start + 100):
                    nearby_variation.append(normal_var_dict[chrom][i])

            #add chrom names to normal variation lists
            for i in range(len(nearby_variation)):
                nearby_variation[i] = [chrom]+nearby_variation[i]

            #selecting which normal variation to include for the window
            nearby_variation_random=[] 
            for i in range(len(nearby_variation)):
                freq = nearby_variation[i][4]
                if freq > 0.5:
                    included = random.choices([nearby_variation[i], "skip"], weights=[freq, 1-freq], k=1)
                else:
                    included = random.choices([nearby_variation[i], "skip"], weights=[0.5, 0.5], k=1)
                nearby_variation_random.append(included[0])

            #make changes and append sequence to files

            #the offset keeps track of effects of indels
            offset=np.zeros(100).astype(int)

            #add in the nearby normal variation
            for i in range(len(nearby_variation_random)):
                seq, offset = update_sequence(nearby_variation_random, seq, offset, start, chrom, current_organ)

            #if a big insertion leads to other variation outside the 100bp window those will 
            #append to the end and can be removed
            if len(seq)>100:
                seq=seq[:100]

            with open('{}_normal.csv'.format(current_organ), 'a') as file:
                file.write(current_sample + "," + seq + "," + ref_seq + "\n")

            #add in the mutations on top of normal variation
            for i in range(len(nearby_mutations)):
                seq, offset = update_sequence(nearby_mutations, seq, offset, start, chrom, current_organ)

            #if big insertion made a mutation was outside of the 100bp window it will have
            #appeneded to the end. So there may be no mutation in the 100bp window and it should
            #be discarded if over 100bp
            if len(seq)==100:
                with open('{}_tumor.csv'.format(current_organ), 'a') as file:
                    file.write(current_sample + "," + seq + "," + ref_seq + "\n")
        
        #clear the sample locations file
        
        !rm $locations




Test zone: scratch cells used while debugging `update_sequence` and the window selection

In [728]:
str(current_location) #excluded changes > 45bp as it caused errors in code

"('chr6', '52056727', 'TTAGACCAAAGGGGTTCCAGTTTGCATTTTACTGCAAGTAACTCCTCAATGGTTGTTTGAATCTA', 'TTAGACCAAAGGGGTTCCAGTTTGCATTTTACTGCAAGTAACTCCTCAATGGTTGTTTGAATCTA', '-')"

In [729]:
len_mutation = max(len(current_location[2]),len(current_location[4]))

In [730]:
len_mutation

65

In [737]:
int(current_location[1])-100+50

52056677

In [738]:
int(current_location[1])-50

52056677

In [731]:
range(int(current_location[1])-100+len_mutation,int(current_location[1])-len_mutation)

range(52056692, 52056662)

In [605]:
window_starts[-1]

154442011

In [714]:
nearby_mutations=[['chr1', 151198956, 'X', '-', 0.222]]

In [680]:
nearby_mutations[0]

['chr1', 151198956, 'X', 'X', '-']

In [688]:
chrom = nearby_mutations[0][0]
start = 151198856 + 10
end = start + 100

In [689]:
!rm temp_location.bed

In [690]:
with open('temp_location.bed', 'w') as file:
    file.write(chrom + "\t" + str(start) + "\t" + str(end))
additional_sequence = !bedtools getfasta -fi /Volumes/BethMac/hg38/hg38.fa -bed temp_location.bed

In [691]:
seq = additional_sequence[1]

In [692]:
seq

'CCGGAGGGGGCGGGGAGGTGGCCCACAGAACGCGGGTTCTGTAAAGAGACGTTGGGAAGATTCGATTCCGAGAAGAGGAAGAACCGGATTGAAAGAGAGC'

In [715]:
offset=np.zeros(100).astype(int)

In [718]:
update_sequence(nearby_mutations, seq, offset, start)

deletion


('CCGGAGGGGGCGGGGAGGTGGCCCACAGAACGCGGGTTCTGTAAAGAGACGTTGGGAAGATTCGATTCCGAGAAGAGGAAGAACCGGATGAAAGAGAGCG',
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1.]))

In [585]:
len('TACCTGGTGAAAGACTAATGAATTCATCGGACATGTTACTGTTTTTCCTCCCTGATGTACCACCAACTTTACGTTTGCATGAAGGTGGTxxxAACATTCT')

100

In [503]:
len('')

100

In [705]:
changes_list=nearby_mutations

In [625]:
changes_list = [['chrX', 154442101, 'C', '-', 0.00798276]]

In [621]:
if changes_list[i][2] != "-" and \
    (len(changes_list[i][2]) == len(changes_list[i][3]) and isinstance(changes_list[i][4],float)==True) or\
    (len(changes_list[i][2]) == len(changes_list[i][3])) and \
    (len(changes_list[i][2]) == len(str(changes_list[i][4])) and isinstance(changes_list[i][4],float) == False) :
    print('ok')


ok


In [623]:
if changes_list[i][2] == "-" or ( len(changes_list[i][2]) < len(str(changes_list[i][4])) and \
        isinstance(changes_list[i][4],float) == False):
    print('ok')



ok


In [626]:
if changes_list[i][4]=="-" or changes_list[i][3]=="-" or (len(str(changes_list[i][4])) < len(changes_list[i][2]) and \
        isinstance(changes_list[i][4],float) == False) :
    print('ok')



ok


In [637]:
for i in range(len(nearby_variation)):
                freq = nearby_variation[i][4]
                if freq > 0.5:
                    included = random.choices([nearby_variation[i], "skip"], weights=[freq, 1-freq], k=1)
                else:
                    included = random.choices([nearby_variation[i], "skip"], weights=[0.5, 0.5], k=1)
                nearby_variation_random.append(included[0])

In [638]:
nearby_variation_random

['skip',
 ['chrX', 154442101, 'C', 'T', 0.00798276],
 ['chrX', 154442101, 'C', 'T', 0.00798276],
 ['chrX', 154442101, 'C', 'T', 0.00798276],
 'skip']

In [707]:
changes_list

[['chr1', 151198956, 'X', 'X', '-']]

In [709]:
i=0

In [711]:
if changes_list[i][2] != "-" and \
            changes_list[i][4] != "-" and \
            ((len(changes_list[i][2]) == len(changes_list[i][3]) and isinstance(changes_list[i][4],float)==True) or\
            (len(changes_list[i][2]) == len(changes_list[i][3])) and \
            (len(changes_list[i][2]) == len(str(changes_list[i][4])) and isinstance(changes_list[i][4],float) == False)) :
    print('ok')



In [708]:
changes_list[0][4]

'-'