## Notebook for generating the Python figures in 'RNA viral communities are structured by host plant phylogeny in oak and conifer leaves'
Anneliek ter Horst

- Viral abundance with dendrogram
- Number of viral contigs vs clean sequencing reads

In [2]:
# import 
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
from numpy import inf
import math
from scipy.stats import ranksums
import scipy.spatial as sp, scipy.cluster.hierarchy as hc
import matplotlib.pylab as plt
import re

In [None]:
# Viral abundance with dendrogram
# Load the coverage table (described by the original author as
# presence-absence data), treat missing values as 0, and use the 'Contig'
# column as the row index so that only the remaining columns are correlated.
abundance_df = (
    pd.read_csv('210615_coverM_0_1.csv')
    .fillna(0)
    .set_index('Contig')
)

# Set figure params (larger font scale for all text in the figure)
sns.set(font_scale=1.5)

# Draw the full plot: a clustered heatmap of the pairwise correlation
# between the columns of the table (DataFrame.corr() correlates columns,
# so no transpose of the table is needed here).
g = sns.clustermap(abundance_df.corr(), cmap="YlGnBu", center=0,
                   dendrogram_ratio=(.2, .1),
                   cbar_pos=(.02, .32, .03, .2),
                   linewidths=.75, figsize=(12, 13), xticklabels=False)

# Use the same line width for the row and the column dendrograms
for dendrogram_ax in (g.ax_row_dendrogram, g.ax_col_dendrogram):
    for line_collection in dendrogram_ax.collections:
        line_collection.set_linewidth(1)

# Save the figure
g.savefig('viral_abundance_heatmap_trees_noQd.pdf')



In [None]:
# Number of viral contigs vs clean sequencing reads
# Open the coverage table, add a scaled read-count column (so that 1e5 can
# go in the axis label) and sort the samples by total number of reads.
df = (
    pd.read_csv('viral_mapping_abundance.csv', sep=',')
    .assign(num_reads=lambda table: table['absolute_num_reads'] / 100000)
    .sort_values(by=['absolute_num_reads'])
)

# Back-to-back horizontal bar chart: reads to the right of zero,
# recovered viral contigs to the left of zero.
f, ax = plt.subplots()

# Plot number of reads per sample (in units of 1e5 reads)
ax.barh(df.sci_name_short, df.num_reads, align='center', color='#bdbdbd')

# Plot number of recovered viral contigs per sample the other way.
# NOTE: these values are negated to draw the bars leftwards, so the x tick
# labels on that side of the axis are negative numbers.
ax.barh(df.sci_name_short, -df.num_vir_rec, align='center', color='#525252')

# Set x and y axes names
ax.set_ylabel("Tree species")
ax.set_xlabel("Number of viral species per host | number of clean sequencing reads 1e5")

plt.show()

# Save the figure (uses the kept figure reference, so it still works after show())
f.savefig("foo.pdf", bbox_inches='tight')