# Prelim Viz

Run cd-hit, about the data, get FastTrees. 
_Faire fonctionner `cd-hit`, à propos des données, obtenir `RapideArbre`._

### About the data

* Get distributions by continent/country over time (year), saved in `/figures`

### cd-hit

* For each country and year, if that country during that year has > 300 records, do cd-hit at `-c 1.0` to remove duplicates. 
* This means that if some country in some year, say, Siberia 2001, has only 10 records, 5 of which are identical copies of the same sequence, then nothing is done. But that leaves only 4 redundant records, so the impact is minimal. 

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import subprocess
import os
import json

import xio
import wrapomatic as wrp


In [2]:
# Input alignment. Each FASTA header is split on "|" by xio.read_fasta; the
# resulting fields are labelled with `fas_cols`, followed by the sequence.
fn_in = "d1_e_5806_msa.fas"
fas_cols = ["id", "host", "geoloc", "gtype", "cdate"]

contents = xio.read_fasta("data/"+fn_in, delimiter="|", preview=0)
d0 = pd.DataFrame(data=contents, columns=fas_cols+["seq"])

# Lookup table handed to xio.lookup_key to resolve a country to its continent.
with open('data/cc_dict.json') as handle:
    cc_dict = json.load(handle)

# Derive the grouping keys with plain iteration instead of a row-wise
# DataFrame.apply(axis=1): same values, far less overhead, and it still yields
# a column (rather than a DataFrame) when `d0` has no rows.
# country = text before the first "/" in geoloc; cyear = first 4 chars of cdate.
country_ls = [str(geoloc).split("/")[0] for geoloc in d0["geoloc"]]
d0["continent"] = [xio.lookup_key(cc_dict, country) for country in country_ls]
d0["country"] = country_ls
d0["cyear"] = [str(cdate)[:4] for cdate in d0["cdate"]]

# Write CSVs
def _count_by_year(df, group_col):
    """Return a table of record counts with one row per `group_col` value
    and one column per `cyear` value; combinations with no records are 0.

    The count column is named through ``reset_index(name=...)``. Assigning a
    nested list to ``.columns`` (as this cell used to do) builds a MultiIndex
    in current pandas, after which ``pivot`` cannot find the columns by name.
    """
    counts = df.groupby([group_col, "cyear"]).size().reset_index(name="counts")
    return counts.pivot(index=group_col, columns="cyear", values="counts").fillna(0)


# NOTE(review): fn_in[2:] drops the leading "d1", so the files are written as
# "_e_5806_msa.fas_e_*_tbl.csv"; fn_in[:2] may have been intended -- confirm
# before renaming, since other code may read these paths.
fn_out1 = fn_in[2:]+"_e_cc_tbl.csv"
d1 = _count_by_year(d0, "continent")
d1.to_csv("figures/"+fn_out1)

fn_out2 = fn_in[2:]+"_e_cnty_tbl.csv"
d1 = _count_by_year(d0, "country")
d1.to_csv("figures/"+fn_out2)

In [None]:
# Kept as notebook-level names in case other cells rely on them.
cnty_ls = list(set(d0["country"]))
cyears_ls = sorted(set(d0["cyear"]))

print("Writing out initial temp files...", end="")
# One FASTA file per (country, year) that actually has records. A single
# groupby pass replaces re-scanning the whole frame for every country/year
# pair; it only yields non-empty groups, so no length check is needed.
# The record count is the last "_"-separated field of the file name, which is
# where the cd-hit step reads it back from.
for (cnty, cy), d_temp in d0.groupby(["country", "cyear"]):
    fn_out = "temp_"+cnty+"_"+cy+"_"+str(len(d_temp))
    xio.write_fasta("temp/"+fn_out+".fas", d_temp, fas_cols, verbose=False)
print("Done!")

# cd-hit if more than 300
# The directory is addressed relative to the working directory, exactly like
# the "temp/" paths used when the files are written, instead of through a
# machine-specific absolute path.
temp_dir = "temp"
cdh_th = 300
fn_ls = sorted(fn for fn in os.listdir(temp_dir) if fn.startswith("temp"))

fn_to_remove_ls = []
for fn in fn_ls:
    # splitext only strips the final extension, so names containing other
    # dots are not truncated.
    fn_prefix, fn_ext = os.path.splitext(fn)
    n_records0 = fn_prefix.split("_")[-1]

    # Only "<prefix>_<count>.fas" files are inputs. Leftovers such as
    # "*_cdh.fas" or "*.clstr" have no numeric count field and are skipped
    # rather than crashing int().
    if fn_ext != ".fas" or not n_records0.isdigit():
        continue

    ncounts = int(n_records0)
    if ncounts > cdh_th:
        in_path = os.path.join(temp_dir, fn)
        out_path = os.path.join(temp_dir, fn_prefix+"_cdh.fas")

        # Argument list (no shell): file names containing spaces or shell
        # metacharacters are passed through intact.
        cmd = ["cd-hit", "-i", in_path, "-o", out_path, "-c", "1.0"]
        print(" ".join(cmd))
        # check=True: stop if cd-hit fails, so the input is never queued for
        # deletion without a clustered replacement.
        subprocess.run(cmd, check=True)
        fn_to_remove_ls.append(fn_prefix)

        # Count FASTA records (header lines start with ">") in the output.
        with open(out_path) as handle:
            n_records1 = sum(1 for line in handle if line.startswith(">"))
        print("Reduced from %s to %s" % (n_records0, n_records1))

# Remove unwanted files: for every input that went through cd-hit, delete the
# pre-clustering .fas and the .clstr file written next to the cd-hit output.
# os.remove is used instead of a shell-built "rm ..." string so file names
# containing spaces or shell metacharacters are handled correctly.
for fn_p in fn_to_remove_ls:
    for fn_suffix in (".fas", "_cdh.fas.clstr"):
        path = os.path.join("temp", fn_p+fn_suffix)
        # As with the former `rm`, an already-missing file does not stop the loop.
        if os.path.exists(path):
            os.remove(path)
    
# Concatenate every remaining temp/temp* file into the combined FASTA.
# The shell is needed here: it expands the glob and performs the redirect.
subprocess.run("cat temp/temp* > data/d1_cdh.fas", shell=True)
print("Writing out final output file: d1_cdh.fas")

# Remove all the rest of the temp files
# Previously the "rm" command string was built but never executed, so nothing
# was deleted. The directory is also addressed relative to the working
# directory, matching the "temp/" paths used everywhere else.
for fn in os.listdir("temp"):
    path = os.path.join("temp", fn)
    if fn.startswith("temp") and os.path.isfile(path):
        os.remove(path)

In [13]:
# NOTE(review): judging by the log printed below, this appears to perform the
# same split / cd-hit / concatenate steps as the inline cell above -- confirm
# against wrapomatic before deleting either one.
wrp.spatiotemporal_subsample(
    d0,
    "country",
    "cyear",
    "data/d1_cdh.fas",
    "temp/",
    out_cols_ls=fas_cols,
    cdh_th=300,
)

Writing out initial temp files...Done!
cd-hit -i temp/temp_China_2014_360.fas -o temp/temp_China_2014_360_cdh.fas -c 1.0
Reduced from 360 to 174
cd-hit -i temp/temp_Singapore_2013_674.fas -o temp/temp_Singapore_2013_674_cdh.fas -c 1.0
Reduced from 674 to 235
cd-hit -i temp/temp_Singapore_2014_878.fas -o temp/temp_Singapore_2014_878_cdh.fas -c 1.0
Reduced from 878 to 214
cd-hit -i temp/temp_Vietnam_2007_351.fas -o temp/temp_Vietnam_2007_351_cdh.fas -c 1.0
Reduced from 351 to 208
cd-hit -i temp/temp_Vietnam_2008_335.fas -o temp/temp_Vietnam_2008_335_cdh.fas -c 1.0
Reduced from 335 to 218
Removing unwanted files...Done!
Final no. of sequences = 
Wrote out final output to temp/d1_cdh.fas
