In [1]:
import os
import random
from collections import defaultdict
import numpy as np
import copy

In [35]:
os.environ['PATH'] = '/opt/anaconda3/bin:/opt/anaconda3/condabin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin'

In [2]:
import pickle

In [30]:
with open('gnomad_dictionary_2.pickle','rb') as read_file:
    normal_var_dict=pickle.load(read_file)

In [906]:
#create the functions for base change, insertion, deletion
def update_sequence(changes_list, seq, offset, start):
    
    #if list doesn't contain any changes just return the same seq and offset
    if len(changes_list) == 1 and changes_list[0] == "skip":
        return seq, offset
            
    
    else:
        #loop over changes list
        for i in range(len(changes_list)):
            
            #do nothing if it is a skip
            if changes_list[i] == "skip":
                continue

            #if it's a base change do this:
            elif len(changes_list[i][2]) == 1 and changes_list[i][2] != "-":
                if changes_list[i][2]!=changes_list[i][3]:
                    new_base = changes_list[i][3]
                else:
                    new_base = changes_list[i][4]
                ref_pos = int(changes_list[i][1]) -1
                adjust = int(offset[ref_pos - start])

                seq = seq[0:ref_pos + adjust - start] + new_base + seq[ref_pos + adjust - start + 1:]

            #if it's an insertion do this:
            elif changes_list[i][2] == "-":
                if changes_list[i][2]!=changes_list[i][3]:
                    new_base = changes_list[i][3]
                else:
                    new_base = changes_list[i][4]
                ref_pos = int(changes_list[i][1]) -1
                adjust = int(offset[ref_pos - start])


                seq = seq[0:ref_pos + adjust - start + 1] + new_base + seq[ref_pos + adjust - start : 100-len(new_base)]

                if len(seq) > 100:
                    seq = seq[:100]
            
                offset= np.add(offset,  np.concatenate((np.zeros(ref_pos-start+1),np.repeat(len(new_base), start+100-ref_pos-1))))
                
            #if it's a deletion do this:
            elif changes_list[i][4]=="-" or changes_list[i][3]=="-":
                new_base = ""
                
                ref_pos = int(changes_list[i][1]) # no -1 here
                adjust = int(offset[ref_pos - start])
                expected_base = changes_list[i][2]

                seq = seq[0:ref_pos + adjust - start] + new_base + seq[ref_pos + adjust -start+len(expected_base):(100-len(new_base)+1)] 
                
                if len(seq) < 100:
                    current_end = len(seq) + start - int(offset[len(seq)]) -1
                    
                    with open('temp_location.bed', 'w') as file:
                        file.write(chrom + "\t" + str(current_end) + "\t" + str(current_end+len(expected_base)-len(new_base)))
                    additional_sequence = !bedtools getfasta -fi /Volumes/BethMac/hg38/hg38.fa -bed temp_location.bed
                    seq = seq + additional_sequence[1]
                    !rm temp_location.bed

                offset=np.add(offset, np.concatenate((np.zeros(ref_pos-start),np.repeat(-len(expected_base), start+100-ref_pos))))

                
                
        return seq, offset
            

In [12]:
#get the samples list from the TCGA MAF file - here breast cancer
os.chdir("/Users/beth/Documents/SNV project/gdc_download_20200914_095313.532292/995c0111-d90b-4140-bee7-3845436c3b42")

TCGA_file = "TCGA.BRCA.mutect.995c0111-d90b-4140-bee7-3845436c3b42.DR-10.0.somatic.maf"

samples = !tail -n +7 $TCGA_file | cut -f16 | sort | uniq

current_organ = "breast"

In [707]:
#could run the loop once for each tumor type TCGA MAF file ... need to parrallelize 

#looping over each sample
for i in range(len(samples[0:2])):
    current_sample = samples[i]
    sample_mutations = !grep $current_sample $TCGA_file | tail -n +2 | cut -f5,6,11,12,13
    sample_mutations = tuple(map(lambda x: tuple(x.split('\t')), sample_mutations))
    sample_mutations_dict = defaultdict(list)
    
    #creating sample_mutations_dict
    for i in range(len(sample_mutations)):
        sample_mutations_dict[sample_mutations[i][0]].append([int(sample_mutations[i][1]), sample_mutations[i][2], sample_mutations[i][3], sample_mutations[i][4]])
        
    
    #looping over each mutation in sample_mutations list of tuples 
    for i in range(len(sample_mutations)):
        current_location = sample_mutations[i]
        
        #to get 10 windows of 100bp around it (note: hg38 is one position behind both MAF and VCF)
        window_starts=[]
        for j in range(10):
            window = range(int(current_location[1])-100,int(current_location[1])-1)
            start = random.choice(list(window))
            end = start + 100
            chrom = current_location[0]
            window_starts.append(start)
            with open("sample_locations.tmp", "a") as file:
                file.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n")
            
        #getting the hg38 sequences for the 10 windows        
        ref_seqs = !bedtools getfasta -fi /Volumes/BethMac/hg38/hg38.fa -bed sample_locations.tmp
        
        #loop once for each window
        for i in range(1,len(ref_seqs),2):
            #getting the seq
            ref_seq= ref_seqs[i]
            seq=copy.copy(ref_seq)
            start=window_starts[i//2]
            
            #getting a list of tuples for each mutation within the window
            nearby_mutations=[current_location]
            for i in range(len(sample_mutations_dict[chrom])):
                x = int(sample_mutations_dict[chrom][i][0])
                if x > int(start) and x < (int(start) + 100) and x != int(current_location[1]):
                    nearby_mutations.append(sample_mutations_dict[chrom][i])
            
            #add chrom names to mutation lists
            for i in range(1,len(nearby_mutations)):
                nearby_mutations[i] = [chrom]+nearby_mutations[i]
                
            #getting a list of tuples for possible normal variation within the window
            nearby_variation=[]
            for i in range(len(normal_var_dict[chrom])):
                x = int(normal_var_dict[chrom][i][0])
                if x > start and x < (start + 100):
                    nearby_variation.append(normal_var_dict[chrom][i])
            
            #add chrom names to normal variation lists
            for i in range(len(nearby_variation)):
                nearby_variation[i] = [chrom]+nearby_variation[i]
                    
            #selecting which normal variation to include for the window
            nearby_variation_random=[] 
            for i in range(len(nearby_variation)):
                freq = nearby_variation[i][4]
                if freq > 0.2:
                    included = random.choices([nearby_variation[i], "skip"], weights=[freq, 1-freq], k=1)
                else:
                    included = random.choices([nearby_variation[i], "skip"], weights=[0.2, 0.8], k=1)
                nearby_variation_random.append(included[0])
            
            #make changes and append sequence to files
            
            #the offset keeps track of effects of indels
            offset=np.zeros(100).astype(int)
        
            #add in the nearby normal variation
            for i in range(len(nearby_variation_random)):
                seq, offset = update_sequence(nearby_variation_random, seq, offset, start)
            
            #if a big insertion leads to other variation outside the 100bp window those will 
            #append to the end and can be removed
            if len(seq)>100:
                seq=seq[:100]

            with open('{}_normal.csv'.format(current_organ), 'a') as file:
                file.write(current_sample + "," + seq + "," + ref_seq + "\n")
            
            #add in the mutations on top of normal variation
            for i in range(len(nearby_mutations)):
                seq, offset = update_sequence(nearby_mutations, seq, offset, start)
                
            #if big insertion made a mutation was outside of the 100bp window it will have
            #appeneded to the end. So there may be no mutation in the 100bp window and it should
            #be discarded if over 100bp
            if len(seq)==100:
                with open('{}_tumor.csv'.format(current_organ), 'a') as file:
                    file.write(current_sample + "," + seq + "," + ref_seq + "\n")

        !rm sample_locations.tmp
        
     
            
        

test zone

In [937]:
sample_mutations[250:270]

(('chr6', '47626538', 'C', 'C', 'A'),
 ('chr6', '52420447', 'C', 'C', 'T'),
 ('chr6', '53001844', 'G', 'G', 'A'),
 ('chr6', '53011913', 'C', 'C', 'G'),
 ('chr6', '56562135', 'C', 'C', 'A'),
 ('chr6', '73621919', 'G', 'G', 'C'),
 ('chr6', '90008607', 'C', 'C', 'G'),
 ('chr6', '93242465', 'G', 'G', 'T'),
 ('chr6', '96615760', 'C', 'C', 'A'),
 ('chr6', '108168060', 'G', 'G', 'C'),
 ('chr6', '112099347', 'G', 'G', 'A'),
 ('chr6', '129580081', 'C', 'C', 'G'),
 ('chr6', '136346031', 'C', 'C', 'G'),
 ('chr6', '137206225', 'G', 'G', 'C'),
 ('chr6', '143433832', 'C', 'C', 'G'),
 ('chr6', '143843376', 'G', 'G', 'A'),
 ('chr6', '144751813', 'G', 'G', 'C'),
 ('chr6', '151350204', 'G', 'G', 'A'),
 ('chr6', '154107953', 'TTTTTA', 'TTTTTA', '-'),
 ('chr6', '160046523', 'C', 'C', 'A'))

In [938]:
nearby_mutations=[('chr6', '154107953', 'TTTTTA', 'TTTTTA', '-')]

In [939]:
nearby_mutations[0]

('chr6', '154107953', 'TTTTTA', 'TTTTTA', '-')

In [940]:
chrom = nearby_mutations[0][0]
start = int(nearby_mutations[0][1]) -10
end = start + 100

In [941]:
!rm temp_location.bed

rm: temp_location.bed: No such file or directory


In [942]:
with open('temp_location.bed', 'w') as file:
    file.write(chrom + "\t" + str(start) + "\t" + str(end))
additional_sequence = !bedtools getfasta -fi /Volumes/BethMac/hg38/hg38.fa -bed temp_location.bed

In [943]:
seq = additional_sequence[1]

In [944]:
seq

'acactgatttttttattttattttattttattttattttattttattgccattcattcaaccgtttgcacagagagaaagaagacagaaatctgactggt'

In [945]:
offset=np.zeros(100).astype(int)

In [946]:
update_sequence(nearby_mutations, seq, offset, start)

('acactgattttttattttattttattttattttattttattgccattcattcaaccgtttgcacagagagaaagaagacagaaatctgactggtgactgg',
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -6., -6., -6.,
        -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
        -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
        -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
        -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
        -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
        -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
        -6., -6., -6., -6., -6., -6., -6., -6., -6.]))