Read data

In [1]:
# pandas is used from this first cell onward; import it here so the notebook
# runs on a fresh kernel (Restart & Run All).
import pandas as pd

# Load one liftover table per comparison species and stack them.
# Each file is a headerless, tab-separated table that is given six column
# names: the corn interval and the matching interval in the other species.
l = []
for sp in ["Crotalus_viridis", "Naja_naja"]:
    df = pd.read_csv("../liftover/corn.{}.tab.gz".format(sp),
                     sep="\t", header=None)
    df.columns = ["corn_scaf", "corn_start", "corn_end",
                  "sp2_scaf", "sp2_start", "sp2_end"]
    # Remember which species this table came from.
    df["sp2"] = sp
    l.append(df)

# NOTE: concat keeps each file's own row labels, so index labels repeat
# across species; downstream cells rely on position/grouping, not labels.
lift = pd.concat(l)

# Length of each interval on the corn side (end minus start).
lift["corn_width"] = lift.corn_end - lift.corn_start

Convert sp2 scaffold names to chromosome names

In [2]:
# Renaming table for the non-Naja species: headerless and tab-separated.
# Column 2 supplies the lookup key and column 0 the replacement name.
r2rdf = pd.read_csv(
    "../../gcstar/misc_files/rattle2rattle_offset.txt",
    sep="\t",
    header=None,
)
r2r = r2rdf.set_index(2)[0].to_dict()

# Distinct Naja_naja scaffold names present in the liftover table.
naja_chroms = lift[lift.sp2 == "Naja_naja"].sp2_scaf.unique()


def _naja_alias(scaf):
    """Return the 'scaffold-mi…' / 'scaffold-ma…' alias for a Naja scaffold.

    Names starting with "MIC" become "scaffold-mi" plus the text after the
    last underscore; every other name becomes "scaffold-ma" plus the full
    original name.
    """
    if scaf.startswith("MIC"):
        return "scaffold-mi{}".format(scaf.split("_")[-1])
    return "scaffold-ma{}".format(scaf)


# Scaffolds whose name contains "SOZ" get no alias and keep their own name
# when the mapping is applied.
n2n = {scaf: _naja_alias(scaf) for scaf in naja_chroms if "SOZ" not in scaf}

In [3]:
# Translate every sp2 scaffold name through the species-specific renaming
# table: `n2n` for Naja_naja rows, `r2r` for all other rows. Names missing
# from the table are kept unchanged (dict.get falls back to the name itself).
# Zipping the two columns avoids DataFrame.iterrows, which builds a full
# Series object for every row.
new_scafs = [
    (n2n if species == "Naja_naja" else r2r).get(scaf, scaf)
    for species, scaf in zip(lift["sp2"], lift["sp2_scaf"])
]
lift["sp2_chrom"] = new_scafs

Get the best-matching sp2 chromosome(s) for each corn scaffold

In [4]:
# Total corn bases lifted from each corn scaffold onto each sp2 chromosome,
# per species.
bps = (
    lift.groupby(["corn_scaf", "sp2_chrom", "sp2"])["corn_width"]
    .sum()
    .reset_index()
    .rename(columns={"corn_width": "bps"})
)

# Bases lifted per (corn scaffold, species) pair across all chromosomes.
_per_pair = bps.groupby(["corn_scaf", "sp2"])["bps"]
# Dict keyed by (corn_scaf, sp2); kept available for any later lookups.
totals = _per_pair.sum().to_dict()

# transform("sum") broadcasts each pair's total back onto its rows, replacing
# two row-by-row iterrows passes with a single vectorized step.
_pair_total = _per_pair.transform("sum")
bps["frac"] = bps["bps"] / _pair_total   # share of the pair's bases on this chromosome
bps["total"] = _pair_total

In [5]:
# Keep only chromosome matches that carry more than 10% of the bases lifted
# for their (corn scaffold, species) pair.
high_match = bps[(bps.frac>0.1)]


def _chrom_nature(chroms):
    """Label a set of matched chromosome names.

    Returns "macro" when every name contains "-ma", "micro" when every name
    contains "-mi", and "unk" otherwise (mixed or unlabelled names). The
    "-ma" test is tried first.
    """
    if all("-ma" in chrom for chrom in chroms):
        return "macro"
    if all("-mi" in chrom for chrom in chroms):
        return "micro"
    return "unk"


# One record per (species, corn scaffold): the retained chromosome names
# joined with ":", their summed fraction, the pair's total bases, and the
# macro/micro/unk label.
classification_l = []
for (sp, scaf), df in high_match.groupby(["sp2", "corn_scaf"]):
    sp2_chroms = df.sp2_chrom.values
    classification_l.append([scaf,
                             sp,
                             ":".join(sp2_chroms),
                             df.frac.sum(),
                             df.total.values[0],
                             _chrom_nature(sp2_chroms)])

# Passing the column names to the constructor also works when no group passed
# the threshold; assigning `.columns` on an empty frame afterwards would raise.
classif = pd.DataFrame(
    classification_l,
    columns=["scaf", "sp2", "sp2_chroms", "frac", "total", "nature"],
)

In [142]:
# For every (scaffold, nature) pair, count how many species rows received
# that label. After reset_index the count lives in the column named "sp2".
_label_groups = classif.groupby(["scaf", "nature"])["sp2"]
collapse_sp2 = _label_groups.count().reset_index()

In [143]:
# Reference scaffold lists, written as numeric suffixes of "Super_scaffold_<n>".
# The comparison cell further down pairs `smallchroms` with the label "micro"
# and `largechroms` with "macro". Provenance of the lists is not recorded in
# this notebook -- TODO document where they come from.
_SCAFFOLD_NAME = "Super_scaffold_{}"

_SMALL_IDS = (
    353, 360, 368, 373, 378, 383, 393, 396, 397, 405, 406, 415, 422,
)

# Order follows the original list (string-sorted names, hence 4 after 398
# and 69/91 at the end).
_LARGE_IDS = (
    100, 106, 110, 140, 141, 147, 167, 177, 226, 251, 252,
    337, 343, 344, 347, 349, 350, 351, 352, 354, 355, 356,
    357, 358, 361, 364, 367, 369, 371, 372, 375, 379, 380,
    385, 386, 388, 391, 392, 398, 4, 400, 401, 403, 412,
    414, 416, 419, 424, 425, 428, 429, 436, 438, 69, 91,
)

smallchroms = [_SCAFFOLD_NAME.format(n) for n in _SMALL_IDS]
largechroms = [_SCAFFOLD_NAME.format(n) for n in _LARGE_IDS]

In [144]:
# Keep (scaffold, nature) pairs whose label was counted twice, i.e. the same
# label was assigned in two species rows.
agreement = collapse_sp2[collapse_sp2.sp2 == 2]

# Labels for the two reference lists, in the order they are compared below.
carla_k = ["micro", "macro"]

# For each nature label, report: label, reference-list label, number of
# agreeing scaffolds, reference-list size, and size of their overlap.
for nat, members in agreement.groupby("nature"):
    scafs = list(members.scaf.values)
    for ref_label, ref_list in zip(carla_k, [smallchroms, largechroms]):
        shared = set(scafs).intersection(ref_list)
        print(nat, ref_label, len(scafs), len(ref_list), len(shared))

macro micro 141 13 0
macro macro 141 55 33
micro micro 63 13 12
micro macro 63 55 0
unk micro 66 13 0
unk macro 66 55 5


In [146]:
# Write the agreeing scaffold/label pairs as a tab-separated file (header row
# included, row index omitted).
_export_cols = ["scaf", "nature"]
agreement[_export_cols].to_csv("corn_macro_micro.tab", sep="\t", index=False)