# makeconfigs-samples-orderbygroup
## This notebook parses our input data to determine the similarity between samples and uses it to create the karyotype, links, and config files for a Circos (circos.ca) figure

### first load the file into a data structure

In [1]:
import csv
import random

# Load the input matrix: one header row (empty first cell, then feature names,
# then a final label column) followed by one row per sample.
inpdat = "../../../3ormoreDATLOW.csv"
# newline='' is what the csv module asks for so that newlines embedded in
# quoted fields are handed to the reader untouched.
with open(inpdat, 'r', newline='') as f:
    it = csv.reader(f)
    # csv.reader yields [] for completely blank lines; drop them so the
    # row[0] lookups below cannot fail with an IndexError.
    listit = [row for row in it if row]

if not listit:
    raise ValueError("no rows found in %s" % inpdat)

# index of the last column of the header row; rows whose value in this column
# equals 1.0 are the foreground samples
labelcol = len(listit[0]) - 1

sampnames = [] # sample names stored here for quicker access
cimphigh = [] # header row plus only the foreground sample rows
for row in listit:
    if row[0] == "":
        # a row with an empty first cell is kept as-is (this is how the header row gets in)
        cimphigh.append(row)
        continue
    if float(row[labelcol]) == 1.0:
        cimphigh.append(row)
        sampnames.append(row[0])

# print number of samples in foreground
numsamps = len(cimphigh)-1   # every row except the header
numfeats = len(cimphigh[0])-2   # every column except the sample name and the label
print("number of samples:",numsamps)
print("number of features:",numfeats)
cimphigh

number of samples: 32
number of features: 17


[['',
  'BRAF_GRCh37_7:140453136-140453136_Missense-Mutation-SNP-A-A-T_Missense-Mutation-SNP-A-T-T',
  'GRB14_GRCh37_2:165365288-165365296_In-Frame-Del-DEL-TTTTTTTTT----',
  'MBD4_GRCh37_3:129155548-129155557_Frame-Shift-Del-DEL-TTTTTTTTTT----',
  'MSH6_GRCh37_2:48030640-48030647_Frame-Shift-Del-DEL-CCCCCCCC----',
  'PRDM2_GRCh37_1:14108749-14108757_In-Frame-Del-DEL-AAAAAAAAA----',
  'HMMR_GRCh37_5:162917426-162917434_In-Frame-Del-DEL-AAAAAAAAA----',
  'SEC63_GRCh37_6:108214755-108214764_Frame-Shift-Del-DEL-TTTTTTTTTT----',
  'KRAS_GRCh37_12:25398284-25398284_Missense-Mutation-SNP-C-T-T',
  'PIK3CA_GRCh37_3:178952085-178952085_Missense-Mutation-SNP-A-G-G',
  'RAD50_GRCh37_5:131931452-131931460_In-Frame-Del-DEL-AAAAAAAAA----',
  'ACVR2A_GRCh37_2:148683686-148683693_Frame-Shift-Del-DEL-AAAAAAAA----',
  'CASP5_GRCh37_11:104878041-104878050_Frame-Shift-Del-DEL-TTTTTTTTTT----',
  'MSH3_GRCh37_5:79970915-79970922_Frame-Shift-Del-DEL-AAAAAAAA----',
  'ATR_GRCh37_3:142274740-142274749_Frame-Sh

### next, parse through the features and generate "similarity scores" (for each pair of samples, the number of features present in both)

In [2]:
# Pairwise similarity scores, keyed by (sample name, sample name).
# The score of a pair is the number of feature columns in which both samples hold 1.
similarities = {}
for sample_a in cimphigh[1:]:          # row 0 is the header, so start at row 1
    for sample_b in cimphigh[1:]:
        name_a, name_b = sample_a[0], sample_b[0]
        # a sample is never paired with itself, and a pair already stored in
        # the opposite orientation is not scored a second time
        if name_a == name_b or (name_b, name_a) in similarities:
            continue
        # feature columns are everything between the name column and the last column
        similarities[(name_a, name_b)] = sum(
            1
            for k in range(1, len(cimphigh[0]) - 1)
            if float(sample_a[k]) == 1.0 and float(sample_b[k]) == 1.0
        )

similarities

{('TCGA-A6-2672', 'TCGA-A6-2676'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3516'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3518'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3525'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3543'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3664'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3672'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3710'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3715'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3821'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3833'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3837'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3845'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3870'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3877'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3941'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3947'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3949'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3994'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-A00A'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-A00D'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-A00E'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-A00J'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-A01P'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-A022'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-A029'): 0,
 ('TCGA-A6-2

### open and set up a karyotype file.
### start by defining the "chromosomes", i.e. the nodes of the Circos plot

In [3]:
# obtain n colors, algorithm written by Vivek Anandan user on Quora
# (start from a random RGB triple and advance every channel by the same step,
# wrapping at 256). NOTE(review): `colors` is not used by any cell visible in
# this notebook, and the random module is not seeded.
colors = []
r = int(random.random() * 256)
g = int(random.random() * 256)
b = int(random.random() * 256)
step = 256/numsamps
for i in range(numsamps):
    r += step
    g += step
    b += step
    r = int(r) % 256
    g = int(g) % 256
    b = int(b) % 256
    colors.append((r,g,b))

# get full names of KRAS and BRAF (because they can change from data set to data set)
# note this assumes you only have one kras or braf feature in the set (you should check if unsure);
# if there are several, the last one found wins
krasname = None
brafname = None
for featname in cimphigh[0][1:len(cimphigh[0])-1]:
    gene = featname.split("_")[0]
    if gene == "KRAS":
        krasname = featname
    if gene == "BRAF":
        brafname = featname
# fail loudly instead of hitting a NameError (or silently reusing a stale
# value left in the kernel by an earlier run) further down
if krasname is None or brafname is None:
    raise ValueError("the header row must contain one KRAS_... and one BRAF_... feature")

# column positions of the two marker features, looked up once
krascol = cimphigh[0].index(krasname)
brafcol = cimphigh[0].index(brafname)

# oddball yellows: samples that are placed in front of the KRAS group even
# though they carry neither marker (they are still colored yellow below)
separate = ["TCGA-AA-3837","TCGA-AA-A029","TCGA-AG-3600","TCGA-AG-A02X"]

# create groupings of samples to use
braf = []
both = []
kras = []
kras2 = []
neither = []
# sort our samples into respective groups first
for row in cimphigh[1:]:
    hasbraf = float(row[brafcol]) == 1.0
    haskras = float(row[krascol]) == 1.0
    if hasbraf and haskras:
        both.append(row)
    elif hasbraf:
        braf.append(row)
    elif haskras:
        kras.append(row)
    elif row[0] in separate:
        kras2.append(row)
    else:
        neither.append(row)
kras = kras2+kras

# sanity check: every foreground sample landed in exactly one group
assert len(both)+len(braf)+len(kras)+len(neither) == numsamps

# order the "neither" group so that samples with no feature set at all
# (which therefore cannot take part in any link) are at the front
n1 = []   # no feature set
n2 = []   # at least one feature set
for row in neither:
    haslinks = any(float(value) == 1.0 for value in row[1:-1])
    if haslinks:
        n2.append(row)
    else:
        n1.append(row)
neither = n1+n2
print(kras)

# order of groups: only BRAF, both KRAS and BRAF, only KRAS (oddballs first), neither
neworder = [cimphigh[0]] + braf + both + kras + neither
newsampnames = [row[0] for row in neworder[1:]]

# The file is written inside a with-block so the handle is always closed, even
# if a write fails; `kary` stays bound afterwards and calling kary.close()
# again later is a harmless no-op.
with open("karyotype.txt","w") as kary:
    kary.write("# initialization of samples\n")
    # format: chr - ID LABEL START END COLOR
    for position, row in enumerate(neworder[1:]):
        hasbraf = float(row[brafcol]) == 1.0
        haskras = float(row[krascol]) == 1.0
        # color by both/kras/braf/neither
        if hasbraf and haskras:
            color = "purple"
        elif haskras:
            color = "red"
        elif hasbraf:
            color = "blue"
        else:
            color = "yellow"
        # each sample gets a segment numfeats wide, laid out back to back
        start = position*numfeats
        kary.write("chr - %s %s %d %d %s\n" % (row[0], row[0], start, start+numfeats, color)) # defines a "chromosome"

n1

[['TCGA-AA-3837', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['TCGA-AA-A029', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['TCGA-AG-3600', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['TCGA-AG-A02X', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['TCGA-AA-3870', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['TCGA-AA-3941', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['TCGA-AA-3994', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['TCGA-AA-A02O', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['TCGA-AA-A03F', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1']]


[]

### create file with links (the connections between samples)

In [4]:
# close up the karyotype file (a no-op if it has already been closed) and make a links file
kary.close()

# start coordinate of every sample's segment, computed once instead of calling
# newsampnames.index() four times per link; setdefault keeps the first
# occurrence of a name, which is what list.index() returns
segstart = {}
for position, name in enumerate(newsampnames):
    segstart.setdefault(name, position*numfeats)

count = 0
# the with-block guarantees the file is closed even if a write fails; `links`
# stays bound afterwards, so a later links.close() is a harmless no-op
with open("links.txt","w") as links:
    links.write("# make links\n")
    # iterate over similarity scores and create a link if the score is at least 1
    for (t1, t2), val in similarities.items():
        if val > 0:
            count += 1
            # one line per end of the link, both carrying the same link id;
            # link thickness will be number of similarity
            for name in (t1, t2):
                links.write("link%d %s %d %d z=0,color=blue\n" % (count, name, segstart[name], segstart[name]+val))

### next up is creating the config file. load up data and set rules for links

In [5]:
# close up the link file (a no-op if it has already been closed) and make a config file
links.close()

# Pieces of circos.conf, written in this exact order.
conflines = [
    # required config lines
    # NOTE(review): etc/image.conf is included here at top level and again
    # inside the <image> block further down - confirm against the Circos
    # documentation whether the top-level include is wanted.
    "\n\n\n# required files from circos.ca\n",
    "<<include etc/image.conf>>\n",
    "<<include etc/colors_fonts_patterns.conf>>\n",
    "<<include etc/housekeeping.conf>>\n",

    # link karyotype
    "karyotype = karyotype.txt\n\n",

    # setup links
    "# add links\n",
    "<links>\n",
    "show = yes\n",
    "ribbon = yes\n",
    "flat = yes\n",
    "radius = 0.975r\n",
    "bezier_radius = 0r\n",
    "color = black_a5\n",
    "thickness = 2\n",
    "<link>\n",
    "file = links.txt\n",
    "</link>\n",
    "</links>\n",

    # setup image
    "\n# setup image\n",
    "<image>\n",
    "<<include etc/image.conf>>\n",
    "</image>\n",

    # setup ideogram
    "\n# setup ideogram\n",
    "<ideogram>\n",
    "show_label = yes\n",
    "label_font = default\n",
    "label_radius = dims(ideogram,radius) + 0.075r\n",
    "label_size = 28\n",
    "label_parallel = yes\n",
    "<spacing>\n",
    "default = .07r\n",
    "break = .5r\n",
    "</spacing>\n\n",
    "thickness = 20p\n",
    "fill = yes\n",
    "radius = 0.90r\n",
    "</ideogram>",
]

# the with-block closes the file even if a write fails
with open("circos.conf","w") as conf:
    conf.writelines(conflines)