# <span style='color:deeppink'> Preparing gene and transposon expression files for analysis. </span>

- The inputs for this script are mapped RNA-seq reads that were assigned to genes with featureCounts.
- The count files are processed with edgeR (Bioconductor) to normalise by the number of reads, producing CPM and RPKM matrices.

Steps:
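1. Manually run edgeR for each gene and exon file for each genome (p, h and ph).
2. Rename the file headings, convert to TSV, extract the UG (ungerminated spore) columns, and compute the per-condition mean and standard deviation across replicates :)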

In [None]:
# import modules
import rpy2
%load_ext rpy2.ipython
import os
import pandas as pd
import glob
import numpy as np
import pprint

### <span style='color:darkorchid'> 1. Manually run edgeR for each gene and exon file for each genome (p, h and ph) </span>

In [12]:
%%bash
# Gene-level normalisation: this cell and the next three simply run
# normalize.R on the gene featureCounts file found in the working directory.
# NOTE(review): the recorded run of this cell read a 17199 x 109 table and
# aborted inside edgeR's DGEList ("counts must be positive finite values").
workdir=/home/anjuni/analysis/rna_counts
Rscript normalize.R "${workdir}"

Setting WORKDIR to: /home/anjuni/analysis/rna_counts 
Finished reading featureCount into data.frame with shape:  17199 109 


Loading required package: limma
Error in .isAllZero(counts) : counts must be positive finite values
Calls: DGEList -> .isAllZero
Execution halted


In [13]:
%%bash
# Gene-level normalisation with edgeR via normalize.R.
# NOTE(review): this cell is byte-identical to the neighbouring gene cells, yet
# the recorded outputs report different table sizes (17199 rows here), so the
# genome being processed presumably depends on which featureCounts file is in
# the working directory at run time - confirm and record the input explicitly.
workdir=/home/anjuni/analysis/rna_counts
Rscript normalize.R "${workdir}"

Setting WORKDIR to: /home/anjuni/analysis/rna_counts 
Finished reading featureCount into data.frame with shape:  17199 19 

CPM matrix file written to "repCpmMatrix_featureCounts.csv"
RPKM matrix written to "repRpkmMatrix_featureCounts.csv"

Loading required package: limma


In [14]:
%%bash
# Gene-level normalisation with edgeR via normalize.R.
# NOTE(review): this cell is byte-identical to the neighbouring gene cells, yet
# the recorded outputs report different table sizes (19348 rows here), so the
# genome being processed presumably depends on which featureCounts file is in
# the working directory at run time - confirm and record the input explicitly.
workdir=/home/anjuni/analysis/rna_counts
Rscript normalize.R "${workdir}"

Setting WORKDIR to: /home/anjuni/analysis/rna_counts 
Finished reading featureCount into data.frame with shape:  19348 19 

CPM matrix file written to "repCpmMatrix_featureCounts.csv"
RPKM matrix written to "repRpkmMatrix_featureCounts.csv"

Loading required package: limma


In [15]:
%%bash
# Gene-level normalisation with edgeR via normalize.R.
# NOTE(review): this cell is byte-identical to the neighbouring gene cells, yet
# the recorded outputs report different table sizes (36547 rows here), so the
# genome being processed presumably depends on which featureCounts file is in
# the working directory at run time - confirm and record the input explicitly.
workdir=/home/anjuni/analysis/rna_counts
Rscript normalize.R "${workdir}"

Setting WORKDIR to: /home/anjuni/analysis/rna_counts 
Finished reading featureCount into data.frame with shape:  36547 19 

CPM matrix file written to "repCpmMatrix_featureCounts.csv"
RPKM matrix written to "repRpkmMatrix_featureCounts.csv"

Loading required package: limma


In [16]:
%%bash
# Exon-level run for the h genome: normalise with edgeR, then prefix every CSV
# in the working directory so the outputs of the three exon runs (h, ph, p)
# can be told apart.
set -euo pipefail  # abort before renaming if Rscript fails, so stale CSVs are never mislabelled

WORKDIR=/home/anjuni/analysis/rna_counts
PREFIX=Pst_104E_v13_h_exon_

Rscript normalize.R "$WORKDIR"
cd "$WORKDIR"
for x in *.csv; do
    [ -e "$x" ] || continue                          # the glob matched nothing
    case "$x" in Pst_104E_v13_*) continue ;; esac    # already prefixed; do not prefix twice
    mv -- "$x" "${PREFIX}${x}"
    echo "$x"
done

Setting WORKDIR to: /home/anjuni/analysis/rna_counts 
Finished reading featureCount into data.frame with shape:  17199 19 

CPM matrix file written to "repCpmMatrix_featureCounts.csv"
RPKM matrix written to "repRpkmMatrix_featureCounts.csv"featureCounts_gene_lengths.csv
featureCounts_matrix.csv
repCpmMatrix_featureCounts.csv
repRpkmMatrix_featureCounts.csv


Loading required package: limma


In [17]:
%%bash
# Exon-level run for the ph genome: normalise with edgeR, then prefix every CSV
# in the working directory with the genome/feature tag.
set -euo pipefail  # abort before renaming if Rscript fails, so stale CSVs are never mislabelled

WORKDIR=/home/anjuni/analysis/rna_counts
PREFIX=Pst_104E_v13_ph_exon_

Rscript normalize.R "$WORKDIR"
cd "$WORKDIR"
for x in *.csv; do
    [ -e "$x" ] || continue                          # the glob matched nothing
    case "$x" in Pst_104E_v13_*) continue ;; esac    # already prefixed; do not prefix twice
    mv -- "$x" "${PREFIX}${x}"
    echo "$x"
done

Setting WORKDIR to: /home/anjuni/analysis/rna_counts 
Finished reading featureCount into data.frame with shape:  36547 19 

CPM matrix file written to "repCpmMatrix_featureCounts.csv"
RPKM matrix written to "repRpkmMatrix_featureCounts.csv"featureCounts_gene_lengths.csv
featureCounts_matrix.csv
repCpmMatrix_featureCounts.csv
repRpkmMatrix_featureCounts.csv


Loading required package: limma


In [18]:
%%bash
# Exon-level run for the p genome: normalise with edgeR, then prefix every CSV
# in the working directory with the genome/feature tag.
set -euo pipefail  # abort before renaming if Rscript fails, so stale CSVs are never mislabelled

WORKDIR=/home/anjuni/analysis/rna_counts
PREFIX=Pst_104E_v13_p_exon_

Rscript normalize.R "$WORKDIR"
cd "$WORKDIR"
for x in *.csv; do
    [ -e "$x" ] || continue                          # the glob matched nothing
    case "$x" in Pst_104E_v13_*) continue ;; esac    # already prefixed; do not prefix twice
    mv -- "$x" "${PREFIX}${x}"
    echo "$x"
done

Setting WORKDIR to: /home/anjuni/analysis/rna_counts 
Finished reading featureCount into data.frame with shape:  19348 19 

CPM matrix file written to "repCpmMatrix_featureCounts.csv"
RPKM matrix written to "repRpkmMatrix_featureCounts.csv"featureCounts_gene_lengths.csv
featureCounts_matrix.csv
repCpmMatrix_featureCounts.csv
repRpkmMatrix_featureCounts.csv


Loading required package: limma


After this step, all the files were moved to the working directory: /home/anjuni/analysis/rna_counts

### <span style='color:crimson'> 2. Rename file headings and convert to tsv and get UG info only :) </span>

In [241]:
# Directory lookup used by the rest of this section, keyed by a short label.
# Every entry after 'RNA' is a sub-folder of the RNA counts directory.
DIRS = dict(BASE2='/home/anjuni/analysis')
DIRS['RNA'] = os.path.join(DIRS['BASE2'], 'rna_counts')
DIRS.update(
    edgeR=os.path.join(DIRS['RNA'], 'edgeR_output'),
    TRIALS=os.path.join(DIRS['RNA'], 'trials_tsv'),
    MEAN_STD=os.path.join(DIRS['RNA'], 'average_and_stdev'),
)

In [172]:
# Sorted paths of every edgeR featureCounts CSV (CPM and RPKM) in the edgeR output folder
rna_list = sorted(glob.glob('%s/*featureCounts.csv' % DIRS['edgeR'], recursive=True))

In [173]:
rna_list

['/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_h_exon_repCpmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_h_exon_repRpkmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_h_gene_repCpmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_h_gene_repRpkmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_p_exon_repCpmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_p_exon_repRpkmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_p_gene_repCpmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_p_gene_repRpkmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_ph_exon_repCpmMatrix_featureCounts.csv',
 '/home/anjuni/analysis/rna_counts/edgeR_output/Pst_104E_v13_ph_exon_repRpkmMatrix_fea

In [47]:
# Replacement column names for the edgeR tables: the gene identifier followed
# by replicates 1-3 of each condition. They are assigned to the CSV columns
# by position when the files are read.
headings = ['gene_ID'] + [
    '%s_%d' % (condition, replicate)
    for condition in ('GS', 'HE', 'IT0', 'IT6', 'IT9', 'UG')
    for replicate in (1, 2, 3)
]

Column indices
- GS: [1,2,3]
- HE: [4,5,6]
- IT0: [7,8,9]
- IT6: [10,11,12]
- IT9: [13,14,15]
- UG: [16,17,18]

In [77]:
#write a function to save the csv files as tsv files with proper headings
def csv_to_tsv(csv_list, header_list):
    """Re-save edgeR CSV tables as TSV files with new column names.

    For each path in ``csv_list`` the CSV is read, its own header row is
    replaced by ``header_list``, and two tab-separated files are written:

    * ``<name>.tsv``    - the full table
    * ``<name>.UG.tsv`` - columns 0, 16, 17 and 18 only (the gene ID and the
      three UG replicates when the notebook's ``headings`` list is used)

    Any ``/edgeR_output/`` component is removed from the path, so the outputs
    land one directory above the inputs.

    Parameters
    ----------
    csv_list : iterable of str
        Paths of the CSV files to convert.
    header_list : list of str
        Column names to assign, one per CSV column, in column order.

    Returns
    -------
    None
    """
    for csv_path in csv_list:
        table = pd.read_csv(csv_path, header=0, names=header_list)
        # Swap only the file extension. A blanket str.replace('.csv', ...) would
        # also rewrite '.csv' inside a directory name and point the output at a
        # folder that does not exist.
        out_root, _ = os.path.splitext(csv_path.replace('/edgeR_output/', '/'))
        # save out the whole table
        table.to_csv(out_root + '.tsv', header=True, index=None, sep='\t')
        # now save out just the gene ID and UG columns
        table.iloc[:, [0, 16, 17, 18]].to_csv(out_root + '.UG.tsv', header=True, index=None, sep='\t')

In [None]:
csv_to_tsv(rna_list, headings) # run the function to save everything out as tsv!! :)

In [199]:
# Sorted paths of the RPKM tsv files; each file holds the three replicate trials per condition.
# NOTE(review): these are read from the trials_tsv folder, whereas csv_to_tsv writes its
# output next to (not inside) that folder - presumably the files were moved by hand; confirm.
rpkm_list = sorted(glob.glob('%s/*_repRpkmMatrix_featureCounts.tsv' % DIRS['TRIALS'], recursive=True))

In [203]:
rpkm_list

['/home/anjuni/analysis/rna_counts/trials_tsv/Pst_104E_v13_h_exon_repRpkmMatrix_featureCounts.tsv',
 '/home/anjuni/analysis/rna_counts/trials_tsv/Pst_104E_v13_h_gene_repRpkmMatrix_featureCounts.tsv',
 '/home/anjuni/analysis/rna_counts/trials_tsv/Pst_104E_v13_p_exon_repRpkmMatrix_featureCounts.tsv',
 '/home/anjuni/analysis/rna_counts/trials_tsv/Pst_104E_v13_p_gene_repRpkmMatrix_featureCounts.tsv',
 '/home/anjuni/analysis/rna_counts/trials_tsv/Pst_104E_v13_ph_exon_repRpkmMatrix_featureCounts.tsv',
 '/home/anjuni/analysis/rna_counts/trials_tsv/Pst_104E_v13_ph_gene_repRpkmMatrix_featureCounts.tsv']

In [None]:
# Condition name -> positional indices of its three replicate columns
# (column 0 is the gene ID, so the replicates start at index 1).
trial_dict = {
    'GS': [1, 2, 3],
    'HE': [4, 5, 6],
    'IT0': [7, 8, 9],
    'IT6': [10, 11, 12],
    'IT9': [13, 14, 15],
    'UG': [16, 17, 18],
}

In [204]:
trial_dict

{'GS': [1, 2, 3],
 'HE': [4, 5, 6],
 'IT0': [7, 8, 9],
 'IT6': [10, 11, 12],
 'IT9': [13, 14, 15],
 'UG': [16, 17, 18]}

In [215]:
def average_runs(file, rep_dict): # rep_dict = dictionary of repeats for each condition
    """Compute the per-gene mean and standard deviation across replicate columns.

    Parameters
    ----------
    file : str
        Path of a single tab-separated table with a header row. Column 0 holds
        the gene identifier; the remaining columns hold numeric values.
    rep_dict : dict
        Maps a condition name to the positional (0-based) indices of that
        condition's replicate columns, e.g. ``{'UG': [16, 17, 18]}``. Any
        number of replicates per condition is accepted.

    Returns
    -------
    (average_dict, stdev_dict) : tuple of dict
        Each dictionary maps every condition name to a list with one value per
        row of the table, plus a ``'gene_ID'`` entry listing column 0 in the
        same row order. The standard deviation is the population form
        (numpy's default, ``ddof=0``).
        NOTE(review): with only a few replicates the sample standard deviation
        (``ddof=1``) may be what is wanted - confirm before changing.
    """
    average_dict = {}
    stdev_dict = {}
    df = pd.read_csv(file, header=0, sep='\t')
    gene_id_list = df.iloc[:, 0].tolist()
    for key, columns in rep_dict.items():
        # Take the replicate columns by position in one step instead of walking
        # every row once per condition; the statistics are then computed per row.
        replicates = df.iloc[:, list(columns)].values.astype(float)
        average_dict[key] = replicates.mean(axis=1).tolist()
        average_dict['gene_ID'] = gene_id_list
        stdev_dict[key] = replicates.std(axis=1).tolist()
        stdev_dict['gene_ID'] = gene_id_list
    return average_dict, stdev_dict

In [216]:
# Look each RPKM table up by its genome/feature tag (e.g. 'h_exon') rather than
# by its position in rpkm_list, so an extra or missing file in the folder raises
# a KeyError instead of silently shifting every assignment.
rpkm_by_tag = {
    os.path.basename(fn)[len('Pst_104E_v13_'):-len('_repRpkmMatrix_featureCounts.tsv')]: fn
    for fn in rpkm_list
}

average_h_exon, stdev_h_exon = average_runs(rpkm_by_tag['h_exon'], trial_dict)
average_h_gene, stdev_h_gene = average_runs(rpkm_by_tag['h_gene'], trial_dict)
average_p_exon, stdev_p_exon = average_runs(rpkm_by_tag['p_exon'], trial_dict)
average_p_gene, stdev_p_gene = average_runs(rpkm_by_tag['p_gene'], trial_dict)
average_ph_exon, stdev_ph_exon = average_runs(rpkm_by_tag['ph_exon'], trial_dict)
average_ph_gene, stdev_ph_gene = average_runs(rpkm_by_tag['ph_gene'], trial_dict)

In [222]:
print(*average_ph_exon)

GS gene_ID HE IT0 IT6 IT9 UG


In [239]:
def dict_to_tsv(df_dict, out_fn):
    """Save a per-condition summary dictionary as three tab-separated files.

    The files are written into ``DIRS['MEAN_STD']``, which is created if it
    does not exist yet:

    * ``out_fn``                   - gene_ID plus all six condition columns
    * ``<root>.UG<ext>``           - gene_ID and the UG (ungerminated spore) column
    * ``<root>.UG.sorted<ext>``    - the UG table sorted by UG, highest first

    where ``<root>`` and ``<ext>`` are ``out_fn`` split at its extension, so
    ``x.tsv`` yields ``x.UG.tsv`` and ``x.UG.sorted.tsv``.

    Parameters
    ----------
    df_dict : dict
        Maps 'gene_ID', 'GS', 'HE', 'IT0', 'IT6', 'IT9' and 'UG' to
        equal-length sequences (one entry per gene).
    out_fn : str
        File name (not a full path) of the full table.

    Returns
    -------
    None
    """
    out_dir = DIRS['MEAN_STD']
    os.makedirs(out_dir, exist_ok=True)
    # Derive the companion names from the extension. Substituting '.tsv' by text
    # replacement leaves the name unchanged when out_fn has another extension,
    # and the three outputs would then overwrite one another.
    root, ext = os.path.splitext(out_fn)
    # df1 saves all the columns
    df = pd.DataFrame.from_dict(df_dict)
    df1 = df[['gene_ID', 'GS', 'HE', 'IT0', 'IT6', 'IT9', 'UG']]
    df1.to_csv(os.path.join(out_dir, out_fn), header=True, index=None, sep='\t')
    # df2 saves the UG (ungerminated spore) column
    df2 = df[['gene_ID', 'UG']]
    out_ug = root + '.UG' + ext
    df2.to_csv(os.path.join(out_dir, out_ug), header=True, index=None, sep='\t')
    # df3 sorts the UG column (descending) and saves it
    df3 = df2.sort_values('UG', ascending=False)
    out_ug_sorted = root + '.UG.sorted' + ext
    df3.to_csv(os.path.join(out_dir, out_ug_sorted), header=True, index=None, sep='\t')

In [240]:
# Write every summary table to disk: the averages first, then the standard
# deviations, each paired with the file name it is saved under.
rpkm_outputs = [
    (average_ph_gene, 'Pst_104E_v13_ph_gene_rpkm_average.tsv'),
    (average_ph_exon, 'Pst_104E_v13_ph_exon_rpkm_average.tsv'),
    (average_p_gene, 'Pst_104E_v13_p_gene_rpkm_average.tsv'),
    (average_p_exon, 'Pst_104E_v13_p_exon_rpkm_average.tsv'),
    (average_h_gene, 'Pst_104E_v13_h_gene_rpkm_average.tsv'),
    (average_h_exon, 'Pst_104E_v13_h_exon_rpkm_average.tsv'),
    (stdev_ph_gene, 'Pst_104E_v13_ph_gene_rpkm_stdev.tsv'),
    (stdev_ph_exon, 'Pst_104E_v13_ph_exon_rpkm_stdev.tsv'),
    (stdev_p_gene, 'Pst_104E_v13_p_gene_rpkm_stdev.tsv'),
    (stdev_p_exon, 'Pst_104E_v13_p_exon_rpkm_stdev.tsv'),
    (stdev_h_gene, 'Pst_104E_v13_h_gene_rpkm_stdev.tsv'),
    (stdev_h_exon, 'Pst_104E_v13_h_exon_rpkm_stdev.tsv'),
]
for summary, out_name in rpkm_outputs:
    dict_to_tsv(summary, out_name)