In [1]:
import pandas as pd
import yaml
import csv

# Load the two input tables; the first CSV column becomes the row index of each.
genedat = pd.read_csv(
    "/home/cjulia/biosproj/firstproj/mutations.csv",
    index_col=0,
)
mutdat = pd.read_csv(
    "/home/cjulia/biosproj/firstproj/mutfeats_Ovarian_and_uterine_cell_lines.csv",
    index_col=0,
)

genedat

BTBD11_GRCh38_12:107544001-107544001_Frame-Shift-Del_DEL_C-C--
DDX3X_GRCh38_X:41347847-41347847_3'UTR_DEL_T-T--
LMAN1_GRCh38_18:59346053-59346053_Splice-Site_DEL_T-T--
ZMIZ1_GRCh38_10:79312689-79312689_Frame-Shift-Del_DEL_C-C--
PLXND1_GRCh38_3:129555548-129555548_3'UTR_DEL_A-A--
ARID1A_GRCh38_1:26779439-26779440_Frame-Shift-Ins_INS_----G
SRRT_GRCh38_7:100881710-100881711_Frame-Shift-Ins_INS_----G
TTC3_GRCh38_21:37151943-37151943_Frame-Shift-Del_DEL_A-A--
KCNA4_GRCh38_11:30012016-30012016_Frame-Shift-Del_DEL_A-A--


In [2]:
# make sample labels
def _cimp_label(class_value):
    """Translate a value of the "class" column into its CIMP class label."""
    if class_value == 1:
        return "CIMPclass: CIMP+"
    if class_value == 2:
        return "CIMPclass: CIMPi"
    return "CIMPclass: CIMP-"


# Two empty leading cells per header row make room for the gene labels.
label_pad = ["", ""]

# One header row each for sample name, tumor type and CIMP class, in mutdat row order.
samplabs = [
    label_pad + ["Sample: " + sample for sample in mutdat.index],
    label_pad + [
        "Tumor: Uterine" if sample[0] == "K" else "Tumor: Ovarian"
        for sample in mutdat.index
    ],
    label_pad + [_cimp_label(mutdat.at[sample, "class"]) for sample in mutdat.index],
]

# The class column is only needed for the labels above, so drop it from the data.
mutdat = mutdat.drop(columns="class")

samplabs

[['',
  '',
  'Sample: SJ11',
  'Sample: SJ7',
  'Sample: KF2',
  'Sample: SJ6',
  'Sample: SJ10',
  'Sample: KF6',
  'Sample: KF1',
  'Sample: SJ5',
  'Sample: SJ2',
  'Sample: KF7',
  'Sample: SJ9',
  'Sample: KF9',
  'Sample: KF3',
  'Sample: KF8',
  'Sample: KF4',
  'Sample: SJ1',
  'Sample: SJ4',
  'Sample: KF5',
  'Sample: SJ8',
  'Sample: SJ3'],
 ['',
  '',
  'Tumor: Ovarian',
  'Tumor: Ovarian',
  'Tumor: Uterine',
  'Tumor: Ovarian',
  'Tumor: Ovarian',
  'Tumor: Uterine',
  'Tumor: Uterine',
  'Tumor: Ovarian',
  'Tumor: Ovarian',
  'Tumor: Uterine',
  'Tumor: Ovarian',
  'Tumor: Uterine',
  'Tumor: Uterine',
  'Tumor: Uterine',
  'Tumor: Uterine',
  'Tumor: Ovarian',
  'Tumor: Ovarian',
  'Tumor: Uterine',
  'Tumor: Ovarian',
  'Tumor: Ovarian'],
 ['',
  '',
  'CIMPclass: CIMP+',
  'CIMPclass: CIMP+',
  'CIMPclass: CIMP+',
  'CIMPclass: CIMP+',
  'CIMPclass: CIMP+',
  'CIMPclass: CIMP+',
  'CIMPclass: CIMPi',
  'CIMPclass: CIMPi',
  'CIMPclass: CIMPi',
  'CIMPclass: CIMP-',


In [3]:
# get DDR labels, chrom remod and wnt labels
# Each file is opened in a `with` block so the handle is closed as soon as it is read.
# NOTE(review): yaml.Loader can construct arbitrary Python objects; if DDR_genes.yml
# is not fully trusted, switch to yaml.SafeLoader / yaml.safe_load.
with open('/home/cjulia/biosproj/firstproj/DDR_genes.yml') as ddr_file:
    DDRgenelabels = yaml.load(ddr_file, Loader=yaml.Loader)
with open("/home/cjulia/biosproj/firstproj/wnthumanlist.txt") as wnt_file:
    wntgenes = wnt_file.read().splitlines()
# NOTE(review): every whole line of kmeans.csv is treated as one gene name -
# confirm the file has a single column and no header row.
with open("/home/cjulia/biosproj/firstproj/kmeans.csv") as cr_file:
    crgenes = cr_file.read().splitlines()

# function to get gene types for a list of genes
def gettypes(genelist, ddr_labels=None, wnt_genes=None, cr_genes=None):
    """Return exactly one "Gene Type: ..." label per gene, in input order.

    Each gene is classified by the first rule that matches:
      1. it belongs to a DDR category  -> "Gene Type: DDR-<category>"
         (the first matching category, in mapping order, wins)
      2. it is a Wnt gene              -> "Gene Type: Wnt"
      3. it is a chromatin remodeller  -> "Gene Type: Chromatin-Remodelling"
      4. otherwise                     -> "Gene Type: N/A"

    Parameters
    ----------
    genelist : iterable of str
        Gene names to classify.
    ddr_labels : mapping, optional
        Maps a DDR category name to a container of gene names.
        Defaults to the notebook-level ``DDRgenelabels``.
    wnt_genes : container of str, optional
        Wnt gene names. Defaults to the notebook-level ``wntgenes``.
    cr_genes : container of str, optional
        Chromatin-remodelling gene names. Defaults to the notebook-level ``crgenes``.

    Returns
    -------
    list of str
        One label per input gene, so the result can be zipped safely against
        ``genelist``.
    """
    # Fall back to the notebook globals only when no explicit set was supplied.
    if ddr_labels is None:
        ddr_labels = DDRgenelabels
    if wnt_genes is None:
        wnt_genes = wntgenes
    if cr_genes is None:
        cr_genes = crgenes

    out = []
    for gene in genelist: # make Gene Type label
        for category, members in ddr_labels.items(): # check if dna repair gene
            if gene in members:
                out.append("Gene Type: " + "DDR-" + category)
                # Stop at the first DDR hit so the gene gets a single label.
                break
        else:
            # Reached only when no DDR category matched (the loop did not break).
            if gene in wnt_genes:
                out.append("Gene Type: Wnt")
            elif gene in cr_genes:
                out.append("Gene Type: Chromatin-Remodelling")
            else:
                out.append("Gene Type: N/A")
    return out

# make gene labels
# Each entry is the pair ["Gene: <name>", "<gene type>"], in genedat column order.
# NOTE(review): the column names are passed to gettypes unchanged - confirm that
# genedat columns are bare gene symbols.
gene_names = genedat.columns
typestmp = gettypes(gene_names)
genelabs = [
    ["Gene: " + name, kind]
    for name, kind in zip(gene_names, typestmp)
]

# make feature labels
# Same idea as the gene labels, but for the feature table: the gene type is
# looked up from the part of the column name before the first underscore.
feature_cols = list(mutdat.columns)
typestmp = gettypes([col.split("_")[0] for col in feature_cols])

# clustergrammer doesn't like underscores, so features are renamed with dashes
featnames = [col.replace("_", "-") for col in feature_cols]

mutlabs = [
    ["Feature: " + name, kind]
    for name, kind in zip(featnames, typestmp)
]

# make data for gene muts and feature muts
def makecgdat(myvalues,mylabels):
    """Assemble the output rows: sample header rows followed by labelled data rows.

    The result starts with the rows of the notebook-level ``samplabs`` and then
    holds one row per (values, label) pair, built as ``label + list(values)``.

    Parameters
    ----------
    myvalues : iterable of row-like
        One sequence of values per output row.
    mylabels : iterable of list
        The label cells to put in front of each row of values; paired with
        ``myvalues`` positionally (extra items on either side are ignored by zip).

    Returns
    -------
    list of list
        A new outer list; the header rows themselves are shared with
        ``samplabs`` (shallow copy), only the outer list is fresh.
    """
    rows = list(samplabs)  # shallow copy so samplabs itself is not extended
    rows.extend(label + list(values) for values, label in zip(myvalues, mylabels))
    return rows

# Build the output tables: transposing puts one gene/feature per row and one
# sample per column, matching the sample header rows.
# NOTE(review): the header rows come from mutdat's index; this assumes genedat
# lists the same samples in the same order - confirm.
geneout = makecgdat(genedat.T.values,genelabs)
featout = makecgdat(mutdat.T.values,mutlabs)

# write outputs
# newline="" lets csv.writer control line endings itself, as the csv module
# requires; without it, platforms that translate "\n" emit "\r\r\n" row endings.
with open("/home/cjulia/biosproj/firstproj/nci-genedat.txt", "w", newline="") as csv_file:
    write = csv.writer(csv_file, delimiter ="\t")
    write.writerows(geneout)
with open("/home/cjulia/biosproj/firstproj/nci-featdat.txt", "w", newline="") as csv_file:
    write = csv.writer(csv_file, delimiter ="\t")
    write.writerows(featout)

print("done")

done
