# makeconfigs-samples
## This notebook parses our input data to determine the similarity between samples, and uses that to create the karyotype, links and config files for a Circos (circos.ca) figure

### first load the file into a data structure

In [1]:
import csv
import random

# Load the whole CSV into memory, then keep only the header row and the
# foreground rows (rows whose last column parses to 1.0).
inpdat = "../../../3ormoreDATLOW.csv"
# newline="" is what the csv module asks for when it is handed a file object,
# so that newlines embedded inside quoted fields are parsed correctly.
with open(inpdat, "r", newline="") as f:
    it = csv.reader(f)
    listit = list(it)

# The class label is read from the column at the header row's last index.
labelcol = len(listit[0]) - 1

sampnames = []  # foreground sample names stored here for quicker access
cimphigh = []   # header row(s) followed by the foreground rows only
for row in listit:
    if not row:
        # csv.reader yields [] for a blank line; there is nothing to classify
        continue
    if row[0] == "":
        # an empty first cell marks a header row; it is kept so that column
        # positions in cimphigh line up with the data rows
        cimphigh.append(row)
        continue
    if float(row[labelcol]) == 1.0:
        cimphigh.append(row)
        sampnames.append(row[0])

# cimphigh[0] is the header row, so it is not counted as a sample; the first
# (name) and last (label) columns are not counted as features.
numsamps = len(cimphigh) - 1
numfeats = len(cimphigh[0]) - 2
print("number of samples:", numsamps)
print("number of features:", numfeats)

number of samples: 32
number of features: 17


### next, parse through the features and generate "similarity scores" (for each pair of samples, the number of features present in both)

In [2]:
# Pairwise similarity counts, keyed by (sample name, other sample name).
similarities = {}
sample_rows = cimphigh[1:]  # everything after the header row
for row_a in sample_rows:
    for row_b in sample_rows:
        name_a, name_b = row_a[0], row_b[0]
        # A sample is never paired with itself, and a pair that is already
        # stored in the opposite order is not stored again.
        if name_a == name_b or (name_b, name_a) in similarities:
            continue
        # Count the feature columns (first and last columns excluded) where
        # both samples hold the value 1.0.
        similarities[(name_a, name_b)] = sum(
            1
            for col in range(1, len(cimphigh[0]) - 1)
            if float(row_a[col]) == 1.0 and float(row_b[col]) == 1.0
        )

similarities

{('TCGA-A6-2672', 'TCGA-A6-2676'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3516'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3518'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3525'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3543'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3664'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3672'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3710'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3715'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3821'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3833'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3837'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3845'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3870'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3877'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3941'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-3947'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3949'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-3994'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-A00A'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-A00D'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-A00E'): 0,
 ('TCGA-A6-2672', 'TCGA-AA-A00J'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-A01P'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-A022'): 1,
 ('TCGA-A6-2672', 'TCGA-AA-A029'): 0,
 ('TCGA-A6-2

### open and set up a karyotype file.
### start by defining "chromosomes", i.e. the nodes of the Circos plot

In [3]:
# obtain n colors, algorithm written by Vivek Anandan user on Quora
# NOTE: colors is built here but is not used below -- every chromosome line
# is written with the fixed color "green".
colors = []
r = int(random.random() * 256)
g = int(random.random() * 256)
b = int(random.random() * 256)
# With no foreground samples the loop below never runs, so any step works;
# the guard only avoids dividing by zero.
step = 256 / numsamps if numsamps else 0
for i in range(numsamps):
    r = int(r + step) % 256
    g = int(g + step) % 256
    b = int(b + step) % 256
    colors.append((r, g, b))

# Write the karyotype inside a with-block so the handle is closed even if a
# write fails. Calling kary.close() again afterwards is a harmless no-op.
with open("karyotype.txt", "w") as kary:
    kary.write("# initialization of samples\n")
    # format: chr - ID LABEL START END COLOR
    for x, row in enumerate(cimphigh[1:]):
        name = row[0]
        start = x * 6
        # each "chromosome" spans numfeats units, so the widest possible link
        # (all features shared) fits on it
        kary.write("chr - %s %s %d %d green\n" % (name, name, start, start + numfeats))

### create file with links (the connections between samples)

In [4]:
# close up the karyotype file and make a links file
kary.close()

# Iterate over the similarity scores and write a link for every pair whose
# score is at least 1. Each link is two lines sharing one id: one line per
# endpoint. The with-block closes the file even if a lookup or write fails;
# calling links.close() again afterwards is a harmless no-op.
count = 0
with open("links.txt", "w") as links:
    links.write("# make links\n")
    for (t1, t2), val in similarities.items():
        if val <= 0:
            continue
        count += 1
        # The endpoint starts at the sample's position in sampnames times 6,
        # and the link thickness (end - start) is the similarity count.
        start1 = sampnames.index(t1) * 6
        links.write("link%d %s %d %d z=0,color=blue\n" % (count, t1, start1, start1 + val))
        start2 = sampnames.index(t2) * 6
        links.write("link%d %s %d %d z=0,color=blue\n" % (count, t2, start2, start2 + val))

### next up is creating the config file. load up data and set rules for links

In [5]:
# Finish with the links file, then emit circos.conf in one pass.
links.close()

# The config text, grouped by section. The pieces are concatenated exactly as
# listed, so every newline below is significant.
# NOTE(review): etc/image.conf is included once at top level here and again
# inside the <image> block further down -- confirm both are intended.
required_includes = (
    "\n\n\n# required files from circos.ca\n",
    "<<include etc/image.conf>>\n",
    "<<include etc/colors_fonts_patterns.conf>>\n",
    "<<include etc/housekeeping.conf>>\n",
)

# Points the figure at the karyotype file.
karyotype_section = (
    "karyotype = karyotype.txt\n\n",
)

# Link drawing settings; the single <link> block reads links.txt.
# No per-link thickness rules are written inside <link>.
links_section = (
    "# add links\n",
    "<links>\n",
    "show = yes\n",
    "ribbon = yes\n",
    "flat = yes\n",
    "radius = 0.975r\n",
    "bezier_radius = 0r\n",
    "color = black_a5\n",
    "thickness = 2\n",
    "<link>\n",
    "file = links.txt\n",
    "</link>\n",
    "</links>\n",
)

image_section = (
    "\n# setup image\n",
    "<image>\n",
    "<<include etc/image.conf>>\n",
    "</image>\n",
)

# The closing </ideogram> tag is written without a trailing newline.
ideogram_section = (
    "\n# setup ideogram\n",
    "<ideogram>\n",
    "show_label = yes\n",
    "label_font = default\n",
    "label_radius = dims(ideogram,radius) + 0.075r\n",
    "label_size = 28\n",
    "label_parallel = yes\n",
    "<spacing>\n",
    "default = .07r\n",
    "break = .5r\n",
    "</spacing>\n\n",
    "thickness = 20p\n",
    "fill = yes\n",
    "radius = 0.90r\n",
    "</ideogram>",
)

with open("circos.conf", "w") as conf:
    for section in (
        required_includes,
        karyotype_section,
        links_section,
        image_section,
        ideogram_section,
    ):
        conf.write("".join(section))