charades

#!/usr/bin/perl
use warnings;
use strict;
use Cwd;
use Getopt::Long;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";


# This script looks at the first 100,000 sequences from a bisulfite split fastq file and then
# tries to find out how the library was created. It will use sequence composition at different
# positions in the sequence.

# File-level state shared by all subroutines below.

# Switch for the extra "if $debug" diagnostics; nothing in the visible source sets it.
my $debug;
# Input files chosen by files_to_analyse(); later steps read this list.
my @files_to_analyse;
# Command line options as re-assembled by process_commandline(), written to the log.
my @options;
# Run configuration. process_commandline() exits by itself for --help and
# --install_r_packages and dies on invalid settings.
my ($output_dir, $parent_dir, $project,$r_path, $weka_path,$training,$maverick, $force) = process_commandline();

# $output_dir is '' or ends in '/', $project is '' or ends in '.', so plain
# concatenation yields the log file path.
open (my $log_file,'>',"${output_dir}${project}charades_log.txt") or die "Cannot write log file: $!";

# Pipeline steps; each one relies on the files / variables produced by the previous ones.
log_file_basics();
files_to_analyse();
extract_100K_reads();
collect_base_composition_info();
maverick_library();
guess_library();
draw_graph();

close $log_file or die;


###################
### SUBROUTINES ###
###################


# Parses the command line, validates the external tools and settings, and
# returns the run configuration.
#
# Side effects: records the given options in the file-level @options array,
# prints the help text or installs the R packages and exits when asked to,
# creates the output directory if it does not exist, and temporarily changes
# the working directory while resolving it (it changes back afterwards).
#
# Returns the list
#   ($output_dir, $parent_dir, $project, $r_path, $weka_path, $training, $maverick, $force)
# where $parent_dir ends in '/', $output_dir is '' or an absolute path ending
# in '/', $project is '' or ends in '.', and $r_path is '' when no R was found.
#
# Dies on invalid options, missing input files, a non-working --r_path, a
# missing weka.jar, or a missing / non-ARFF training file.
sub process_commandline {

	my ($help, $output_dir, $project, $r_path, $weka_path);
	my ($install_r_packages, $training, $maverick, $force);

	my $command_line = GetOptions (
		'help'               => \$help,
		'o|output_dir=s'     => \$output_dir,
		'p|project=s'        => \$project,
		'r|r_path=s'         => \$r_path,
		'w|weka_path=s'      => \$weka_path,
		'install_r_packages' => \$install_r_packages,
		't|training=s'       => \$training,
		'm|maverick'         => \$maverick,
		'f|force'            => \$force,
	);

	die "Please respecify command line options\n\n" unless ($command_line);


	### RECREATING SPECIFIED COMMANDLINE OPTIONS (used for the log file only)

	push @options, "--output_dir $output_dir" if defined $output_dir;
	push @options, "--project $project"       if defined $project;
	push @options, "--r_path $r_path"         if defined $r_path;
	push @options, "--weka_path $weka_path"   if defined $weka_path;
	push @options, "--training $training"     if defined $training;
	push @options, "--maverick"               if defined $maverick;
	push @options, "--force"                  if defined $force;


	### PRINT HELPFILE

	if ($help) {
		print_helpfile();
		exit;
	}

	### INSTALL R PACKAGES

	if ($install_r_packages) {
		install_r_packages();
		exit;
	}

	die "\nPlease specifiy gzipped fastq files\n\n" unless (@ARGV);

	# Appends a trailing slash unless the directory already ends in one.
	my $with_slash = sub {
		my ($dir) = @_;
		$dir .= '/' unless ($dir =~ /\/$/);
		return $dir;
	};

	### PARENT DIRECTORY

	my $parent_dir = $with_slash->(getcwd());

	### OUTPUT DIRECTORY

	if (defined $output_dir and $output_dir ne '') {
		$output_dir = $with_slash->($output_dir);

		# Create the directory only when we cannot move into it.
		unless (chdir $output_dir) {
			mkdir $output_dir or die "Unable to create directory '$output_dir' $!\n";
			warn "\nCreating output directory '$output_dir' \n\n";
			chdir $output_dir or die "Failed to move to '$output_dir'\n";
		}

		$output_dir = $with_slash->(getcwd());    #  making the path absolute
		warn "\nOutput will be written into the directory: '$output_dir'\n";
	}
	else {
		$output_dir = '';
	}

	# Changing back to parent directory
	chdir $parent_dir or die "Failed to move to '$parent_dir'\n";


	### CHECK R PATH
	# The path is double-quoted for the shell, the same way draw_graph() passes it.

	if (defined $r_path) {
		if ((system "\"$r_path\" --version > /dev/null 2>&1") == 0) {
			warn "\nFound working R installation at '$r_path'\n";
		}
		else {
			die "No working R installation found at this path: '$r_path'.\nPlease respecify.\n\n";
		}
	}
	else {
		$r_path = `which R 2>/dev/null`;
		chomp $r_path;
		if ($r_path eq "") {
			warn "\nNo working R installation found. No graph will be generated.\n\n";
		}
		else {
			warn "\nFound R installation at '$r_path'\n";
		}
	}

	## Check for R packages
	## Charades needs readr, tidyr and ggplot2 for its graphical output.
	## Without R there is nothing to check (and no command to run), so the
	## "missing packages" hint is only given when R itself was found.

	if ($r_path ne "") {
		my $r_packages = system "\"$r_path\" -e 'library(readr);library(tidyr);library(ggplot2)' -q --no-save > /dev/null 2>&1";

		unless ($r_packages == 0) {
			warn "\nCharades is missing R packages to draw the probability heatmap.\nIt will run without but no graphical output will be generated.
		\nRun >>> charades --install_r_packages <<< to get the missing packages.\n";
		}
	}


	### CHECK WEKA PATH

	## if the path to weka.jar is specified with the --weka_path option

	if (defined $weka_path) {
		if (-e $weka_path) {
			warn "\nFound weka.jar at '$weka_path'\n";
		}
		else {
			die "\nNo 'weka.jar' file found at this path: '$weka_path'.\nPlease respecify.\n\n";
		}
	}

	## if the path to weka.jar is not specified

	else {
		my $weka_folder = `which weka 2>/dev/null`;
		chomp $weka_folder;

		## weka.jar is sometimes found in the weka folder (weka installations).
		## Only look there when 'which' found something; otherwise this would
		## test for a file called '.jar' in the current directory.
		if ($weka_folder ne '' and -e "${weka_folder}.jar") {
			$weka_path = "${weka_folder}.jar";
			warn "\nFound weka.jar at '$weka_path'\n";
		}

		## weka.jar is also sometimes found here (ubuntu)
		elsif (-e "/usr/share/java/weka.jar") {
			$weka_path = "/usr/share/java/weka.jar";
			warn "\nFound weka.jar at '$weka_path'.\n";
		}
		else {
			die "\nCould not find file >>> weka.jar <<<. Please specify the location of this file using --weka_path\n\n";
		}
	}


	### PROJECT NAME
	# A trailing dot lets the project name be used directly as a file name prefix.

	if (defined $project) {
		$project = $project.".";
	}
	else {
		warn "\nNo project name defined, using generic output file names.\n";
		$project = '';
	}


	### TRAINING DATA
	# A user-supplied file is looked up in the directory of this script.

	if (defined $training) {
		if (-e "${RealBin}/${training}") {
			warn "\nFound training data file '$training'\n";
		}
		else {
			die "\nCouldn't find training data file '$training' in Charades directory. Exiting.\n\n\n";
		}
	}
	else {
		$training = "training_data_20190313.arff";
		warn "\nUsing default training data file '$training'\n";
	}

	# The dot is escaped so that only a real '.arff' extension is accepted.
	unless ($training =~ /\.arff$/) {
		die "Training data file must be in ARFF format";
	}


	return ($output_dir, $parent_dir, $project,$r_path, $weka_path,$training,$maverick, $force);
}


# Prints the usage summary and the description of every command line option
# to standard output.
sub print_helpfile {

	# One entry per piece of help text, emitted in this order.
	my @help_text = (
		"\n\nCharades will try and predict which bisulfite sequencing library preparation was used. It uses (gzipped) fastq files as input.\n\n",
		">>> USAGE: ./charades.pl [options] filename(s) <<<\n\n",
		"\nOptions:\n\n",
		"--output_dir / -o [path]\tOutput directory, either relative or absolute. If not specified, output is written to the current directory.\n\n",
		"--project / -p [project name]\tProject name for all tested samples which will be added to the output file names.\n\n",
		"--r_path / -r [path]\t\tPath to the R installation. If not specified, Charades will attempt to find one.\n\n",
		"--weka_path / -w [path]\t\tLocation of 'weka.jar'. If not specified, Charades will attempt to find it.\n\n",
		"--training / -t [file] \t\tFile containing training data. Needs to be in the same directory as Charades. Default is provided.\n",
		"\t\t\t\tDo NOT change light heartedly!\n\n",
		"--maverick / -m\t\t\tBy default Charades will check if the sample looks anything like BSseq data. Use this option to skip this check,\n",
		"\t\t\t\tbut be warned that any library (eg RNAseq, ChIPseq, ATACseq, WGS etc) will produce a BSseq library prediction.\n\n",
		"--force / -f\t\t\tCharades will overwrite the directory 'charades_temp' in the output folder\n\n",
		"--install_r_packages\t\tCharades will install R packages readr, tidyr and ggplot2.\n\n\n",
		"Please report bugs to christel.krueger\@babraham.ac.uk\n\n",
	);

	print @help_text;
}


# Writes the run settings to the log file: the command line options that were
# given, the project name and the paths to R, Weka and the training data.
# Reads the file-level @options, $project, $r_path, $weka_path and $training.
sub log_file_basics {

	my $options_line = (scalar @options == 0)
		? "Using Charades with default options\n\n"
		: "Command line options specified: ".join(" ",@options)."\n\n";

	my $project_line = ($project eq "")
		? "No project name specified\n"
		: "Project name: $project\n";

	print {$log_file} $options_line;
	print {$log_file} $project_line;
	print {$log_file} "Path to R: $r_path\n",
	                  "Path to Weka: $weka_path\n",
	                  "Training data file: $training\n";
}


# Installs the R packages readr, tidyr and ggplot2 into the user's R library.
#
# Uses the file-level $r_path; when it is undefined the R executable is looked
# up with 'which' and stored in $r_path.
# NOTE(review): process_commandline() calls this before the file-level $r_path
# has been assigned, so a --r_path given together with --install_r_packages
# does not appear to reach this sub - confirm and pass the path in if needed.
#
# Dies when no working R installation can be found; warns when one of the R
# invocations exits with a non-zero status.
sub install_r_packages {

	## CHECK R PATH

	if (defined $r_path) {
		if ((system "\"$r_path\" --version > /dev/null 2>&1") == 0) {
			warn "\nFound working R installation at '$r_path'\n";
		}
		else {
			die "No working R installation found at this path: '$r_path'\nPlease respecify.\n\n";
		}
	}
	else {
		$r_path = `which R 2>/dev/null`;
		chomp $r_path;

		# Without R there is no command to run below, so stop here instead of
		# handing an empty command to the shell.
		if ($r_path eq "") {
			die "\nNo working R installation found. Please install R.\n\n";
		}
		warn "\nFound R installation at '$r_path'\n\n";
	}


	# CREATE LOCAL LIBRARY

	warn "Creating local library\n\n";
	my $library_call = system "\"$r_path\" -e 'dir.create(Sys.getenv(\"R_LIBS_USER\"), showWarnings = FALSE, recursive = TRUE)' -q --no-save > /dev/null 2>&1";
	warn "Could not create the local R library (R exit status $library_call)\n\n" unless ($library_call == 0);

	# INSTALL PACKAGES
	# The output of this call is deliberately left visible so that the
	# installation progress can be followed.

	warn "Installing R packages\n\n";
	my $install_call = system "\"$r_path\" -e 'repository<-getOption(\"CRAN\",\"http://cloud.r-project.org\");install.packages(\"readr\",repos=repository);install.packages(\"tidyr\",repos=repository);install.packages(\"ggplot2\",repos=repository)'";
	warn "\nR reported a problem while installing the packages (R exit status $install_call)\n\n" unless ($install_call == 0);

}


# Selects the input files to analyse from @ARGV and stores them in the
# file-level @files_to_analyse.
#
# Files that do not exist are skipped (with a warning and a log entry). A file
# is kept when its name contains 'R1' or contains none of 'R2', 'R3', 'R4'.
# Dies when no file is left.
sub files_to_analyse {

	## Finding out which files to analyse

	my @files = @ARGV;
	unless (@files) {
		print "Please provide one or more fastq.gz / fastq files for analysis\n\n";
	}
	my $length = @files;
	warn "\nScanning $length input file(s).\n";
	warn join("\n",@files)."\n" if $debug;

	foreach my $file (@files) {

		unless (-e $file) {
			warn "'$file' does not exist. Skipping.\n";
			print $log_file "'$file' does not exist. Skipping\n";
			next;
		}

		push @files_to_analyse, $file if ($file =~ /R1/ or $file !~ /R2|R3|R4/);
	}

	# The prediction steps match the n-th Weka result to the n-th entry of
	# @files_to_analyse, and the Weka input is written in the order of the
	# sorted file names WITHOUT their directory. Sorting by that same name
	# (full path only as tie-breaker) keeps both lists aligned when the input
	# files live in different directories.
	my $base_name = sub {
		my ($path) = @_;
		$path =~ s/^.*\///;
		return $path;
	};
	@files_to_analyse = sort { $base_name->($a) cmp $base_name->($b) or $a cmp $b } @files_to_analyse;

	if (scalar(@files_to_analyse) == 0) {
		print $log_file "No files selected for analysis.\n";
		die "\n\nNo files were selected for analysis. Exiting.\n\n";
	}

	warn "\nFiles selected for analysis are\n", join("\n",@files_to_analyse),"\n\n\n";
	print $log_file "\nFiles selected for analysis are\n", join("\n",@files_to_analyse),"\n\n";
}


# Copies the first 100,000 reads of every file in @files_to_analyse into
# '<output_dir>charades_temp/<file name>.100K.fastq'.
#
# An existing 'charades_temp' directory is removed when --force was given;
# otherwise the script dies. Files ending in '.gz' are read through
# 'gunzip -c'. The first record of every file is checked for fastq format and
# the script dies if it does not look right. An incomplete record at the end
# of a file is not copied.
sub extract_100K_reads {

	my $temp_dir = "${output_dir}charades_temp";

	if (-d $temp_dir) {
		if (defined $force) {
			# List form: the path is handed to rm as it is, without a shell.
			system('rm', '-r', '--', $temp_dir) == 0
				or die "Could not remove existing temporary directory '$temp_dir'\n";
		}
		else {
			print $log_file "Directory >>> charades_temp <<< exists. Use option --force / -f to overwrite.\n\n";
			die "A folder called >>> charades_temp <<< exists in the output directory. Use option --force / -f to overwrite.\n\n";
		}
	}

	mkdir $temp_dir or die "Could not create temporary directory: $!";

	warn "\nExtracting the first 100,000 reads from fastq files\n\n";

	foreach my $file_to_analyse (@files_to_analyse) {

		# file name without its directory
		(my $file_name = $file_to_analyse) =~ s/^.*\///;

		my $in;
		if ($file_name =~ /\.gz$/) {
			# List-form pipe: file names with spaces or shell characters are safe.
			open ($in, '-|', 'gunzip', '-c', $file_to_analyse) or die "Could not open file to analyse: $!";
		}
		else {
			open ($in, '<', $file_to_analyse) or die "Could not open file to analyse: $!";
		}

		open (my $out,'>', "$temp_dir/$file_name.100K.fastq") or die "Cannot write out first 100K reads: $!";
		my $count = 0;

		while (my $header = <$in>) {
			my $sequence       = <$in>;
			my $plusline       = <$in>;
			my $quality_scores = <$in>;
			my $complete = (defined $sequence and defined $plusline and defined $quality_scores);

			# Only the first record is used to judge the file format.
			if ($count == 0) {
				unless ($complete and $header =~ /^@/) {
					die "'$file_name' does not look like it's in fastq format. Exiting.\n\n";
				}
				unless ($sequence =~ /^[ATCGN]*$/) {
					die "Sequence in '$file_name' contains bases other than A, T, C, G, N. Exiting.\n\n";
				}
				unless ($plusline =~ /^\+/) {
					die "'$file_name' does not look like it's in fastq format. Exiting.\n\n";
				}
			}

			# The input ended in the middle of a record: drop the fragment.
			last unless ($complete);

			print $out $header, $sequence, $plusline, $quality_scores;
			++$count;

			last if ($count == 100000);
		}

		# The status is deliberately not checked: gunzip is usually still
		# writing when we stop after 100,000 reads, so closing the pipe early
		# reports a failure that is none.
		close $in;
		close $out or die "Problem writing out first 100K reads: $!";
	}
}


# Summarises the base composition of the extracted reads and writes it as
# '<project>sequence_composition_stats.csv' and '.arff' into the output
# directory.
#
# For every '*.100K.fastq' file in '<output_dir>charades_temp' the percentage
# (rounded down) of A, C, T and G is determined at read positions
# 1,2,3,4,8,9,10,11,12,20 and 30. Only reads longer than 30 bases are used.
# A file is skipped with a warning when it is not in fastq format, contains
# bases other than A, T, C, G, N, or has no read that is long enough.
# Afterwards the temporary files and the temporary directory are deleted and
# the working directory is set back to $parent_dir.
sub collect_base_composition_info {

	my @interesting_positions = (1,2,3,4,8,9,10,11,12,20,30);
	my @bases = qw(A C T G);		## column order within every position

	## One column per position and base: pos1_A, pos1_C, ... pos30_G
	my @columns;
	foreach my $position (@interesting_positions) {
		push @columns, map("pos${position}_$_", @bases);
	}

	open (my $out, '>', "${output_dir}${project}sequence_composition_stats.csv") or die "Cannot write to file: $!";
	open (my $out_arff, '>', "${output_dir}${project}sequence_composition_stats.arff") or die "Cannot write to file: $!";

	print $out join(",", "file_name", @columns, "method"), "\n";

	print $out_arff "\@relation base_signature\n\n";
	print $out_arff "\@attribute $_ numeric\n" foreach @columns;
	print $out_arff "\@attribute method {Swift,6N_PBAT,6N_sc,9N_sc,WGBS,RRBS,scNOME,UMI_RRBS,9N_PBAT,Amplicon,NOMEseq,Truseq,RNA,ChIP,ATAC,WGS}\n\n\n";
	print $out_arff "\@data\n";

	chdir "${output_dir}charades_temp" or die "Cannot move to temporary folder: $!";

	my @file_heads = <*.100K.fastq>;
	my %info;			## base composition per file: hash of array (one entry per position) of hashes

	unless (@file_heads) {
		die "Cannot find files for first 100,000 reads.Exiting.\n";
	}

	FILE: foreach my $file_head (@file_heads) {

		## Original file name: strip exactly the '.100K.fastq' suffix
		(my $file_name = $file_head) =~ s/\.100K\.fastq$//;
		warn $file_head, "\n" if $debug;
		warn $file_name, "\n" if $debug;

		## Read sequences in the fastq file

		open (my $in, '<', $file_head) or die "Cannot open fastq file: $! ";

		my @reads;
		warn "Summarising sequence composition for '$file_name'\n";

		while (my $header = <$in>) {

			unless ($header =~ /^@/) {
				warn "'$file_head' does not look like it's in fastq format. Skipping.\n";
				next FILE;
			}

			my $sequence       = <$in>;
			my $plusline       = <$in>;
			my $quality_scores = <$in>;

			## An incomplete record at the end of the file is ignored
			last unless (defined $sequence and defined $plusline and defined $quality_scores);

			unless ($sequence =~ /^[ATCGN]*$/) {
				warn "Sequence in '$file_head' contains bases other than A, T, C, G, N. Skipping.\n";
				next FILE;
			}
			unless ($plusline =~ /^\+/) {
				warn "'$file_head' does not look like it's in fastq format. Skipping.\n";
				next FILE;
			}

			chomp $sequence;
			push @reads, $sequence;
		}
		close $in or die "Cannot close fastq file: $!";

		# Extracting the base composition at different positions in the read.
		# Base composition is very different between different library preps.

		## Short reads are reported once and then left out for all positions.
		## There shouldn't be any unless the runs are really old.
		my @usable_reads;
		foreach my $read (@reads) {
			if (length($read) > 30) {
				push @usable_reads, $read;
				next;
			}
			warn "A read in '$file_head' is shorter than expected with only ", length($read), " bases. 
					\nSkipping Sequence: ", $read, "\n";
		}

		my $total = scalar @usable_reads;
		if ($total == 0) {
			warn "File '$file_head' does not seem to contain a (long enough)sequence.\n";
			next FILE;
		}

		foreach my $interesting_position (@interesting_positions) {

			## Every read was checked above, so only A, C, T, G and N can occur
			my %count = (A => 0, C => 0, T => 0, G => 0, N => 0);
			++$count{ substr($_, $interesting_position - 1, 1) } foreach @usable_reads;

			my %percent;
			$percent{$_} = int($count{$_} / $total * 100) foreach (keys %count);

			if ($percent{N} > 5) {
				warn "WARNING: There are more than 5% Ns at position $interesting_position.\n";
			}

			warn "Percentages for A,C,T,G at position $interesting_position are $percent{A}, $percent{C}, $percent{T}, $percent{G}\n" if $debug;

			## Generating a data structure containing the composition info
			push @{$info{$file_name}}, {position => $interesting_position,
										percentA => $percent{A},
										percentC => $percent{C},
										percentT => $percent{T},
										percentG => $percent{G}};
		}
	}

	## Removing the temporary files and folder
	foreach my $temp_file (@file_heads) {
		unlink $temp_file or die "Couldn't delete file: $!";
	}
	chdir $parent_dir or die "Cannot move away from temporary folder: $!";
	rmdir ("${output_dir}charades_temp") or die "Cannot delete temporary folder: $!";

	## One line per file, values in the order of the header columns; the
	## library method is unknown at this point and written as '?'
	foreach my $file (sort keys %info) {
		my @values;
		foreach my $position_info (@{$info{$file}}) {
			push @values, map($position_info->{"percent$_"}, @bases);
		}
		my $row = join(",", @values);

		print $out "$file,$row,?\n";
		print $out_arff "$row,?\n";
	}

	close $out or die "Could not close output file:$!\n";
	close $out_arff or die "Could not close output file:$!\n";
	print $log_file "Collected base composition information\n";
}


# Checks whether the samples look like bisulfite libraries at all.
#
# Unless --maverick was given, Weka is run with the training data that also
# contains non-bisulfite libraries. The script dies when a sample is
# classified as ATAC, ChIP, RNA or WGS, when the Weka call fails, or when Weka
# returns no prediction. The temporary Weka output '<output_dir>weka.maverick'
# is deleted again.
sub maverick_library {

	return if (defined $maverick);

	warn "\nChecking for non-bisulfite libraries\n\n";

	# Wraps a value in single quotes for the shell (embedded quotes included).
	my $shell_quote = sub {
		my ($text) = @_;
		$text =~ s/'/'\\''/g;
		return "'$text'";
	};

	my $weka_result = "${output_dir}weka.maverick";

	## Call Weka
	my $command = join(" ",
		"java -cp", $shell_quote->($weka_path),
		"weka.classifiers.functions.Logistic -R 1",
		"-t", $shell_quote->("$RealBin/training_data_20190313_including_non_bisulfite.arff"),
		"-T", $shell_quote->("${output_dir}${project}sequence_composition_stats.arff"),
		"-distribution -p 0 >", $shell_quote->($weka_result));
	my $weka_call = system $command;

	# A failed Weka run leaves no predictions behind, which must not be
	# mistaken for "no non-bisulfite libraries found".
	unless ($weka_call == 0) {
		unlink($weka_result) if (-e $weka_result);
		print $log_file "Weka check for non-bisulfite libraries unsuccessful\n";
		die "Weka check for non-bisulfite libraries unsuccessful (exit status $weka_call).\nUse the --maverick / -m option to skip this check.\n\n";
	}

	open (my $in, '<', $weka_result) or die "Cannot open Weka maverick output file: $!";

	my $count = 0;				# samples that look like non-bisulfite libraries
	my $predictions = 0;		# prediction lines found in the Weka output
	my $in_predictions = 0;		# true between the 'inst#' header and the next empty line
	my $i = 0;					# index of the current prediction within its table

	while (my $line = <$in>) {

		unless ($in_predictions) {
			if ($line =~ /\s+inst#/) {
				$in_predictions = 1;
				$i = 0;
			}
			next;
		}

		warn $line, "\n" if $debug;

		## the prediction table ends with an empty line
		unless ($line =~ /\S+/) {
			$in_predictions = 0;
			next;
		}

		## the line starts with whitespace, so the first field is empty
		my (undef,undef,undef, $class) = split(/\s+/,$line);
		next unless (defined $class);
		$class =~ s/^\d+://;
		++$predictions;

		# remove the path from the file name
		my $sample = defined $files_to_analyse[$i] ? $files_to_analyse[$i] : "sample ".($i + 1);
		$sample =~ s/^.*\///;

		if ($class =~ /ATAC|ChIP|RNA|WGS/) {
			warn "$sample looks more like $class-seq than a bisulfite library.\n";
			++$count;
		}
		++$i;
	}
	close $in;
	unlink($weka_result) if (-e $weka_result);

	if ($predictions == 0) {
		print $log_file "Weka check for non-bisulfite libraries unsuccessful\n";
		die "No sequence composition data was collected. Exiting.\n";
	}

	if ($count == 0) {
		print "None found\n";
		print $log_file "No non-bisulfite libraries detected\n";
	}
	else {
		warn "\n\nSome samples ($count) don't look like they are bisulfite libraries.Exiting.\n";
		print $log_file "Some samples ($count) don't look like they are bisulfite libraries. Exiting.\n";
		print $log_file "If you want to run the predictions anyway, use the --maverick / -m option to override.\n\n";
		warn "If you want to run the predictions anyway, use the --maverick / -m option to override.\n";
		warn "Be warned though that Charades will then always predict a bisulfite library type (even if it is, say, RNAseq).\n";
		die "\n";
	}

}


# Predicts the library preparation method of every sample with Weka's
# Logistic classifier.
#
# The Weka output is written to '<output_dir>weka.output'. The predicted class
# of every sample is printed and logged, and the class probabilities are
# written to '<output_dir><project>class.probabilities.txt' (tab separated,
# one line per sample) for draw_graph().
# Dies when the training data is missing or the Weka call fails or produces
# no output.
sub guess_library {

	warn "\n\nUsing Weka Logistic Regression classifier to predict library method\n\n";
	unless (-e "${RealBin}/${training}") {
		print $log_file "Could not find training data.\n";
		die "Couldn't find file '$training'. Please add it to the same directory as charades.\n\n\n";
	}

	# Wraps a value in single quotes for the shell (embedded quotes included).
	my $shell_quote = sub {
		my ($text) = @_;
		$text =~ s/'/'\\''/g;
		return "'$text'";
	};

	my $weka_output = "${output_dir}weka.output";

	## Call Weka
	my $command = join(" ",
		"java -cp", $shell_quote->($weka_path),
		"weka.classifiers.functions.Logistic -R 1",
		"-t", $shell_quote->("$RealBin/$training"),
		"-T", $shell_quote->("${output_dir}${project}sequence_composition_stats.arff"),
		"-distribution -p 0 >", $shell_quote->($weka_output));
	my $weka_call = system $command;

	# Reading the Weka output once; it is needed for the check and the parsing
	open (my $in, '<', $weka_output) or die "Couldn't open weka.output: $!";
	my @weka_lines = <$in>;
	close $in or die "Couldn't close filehandle to weka.output";

	if (($weka_call == 0) and (scalar @weka_lines != 0)) {
		print "Weka classification successful\n\n";
		print $log_file "Weka classification successful\n\n";
	}
	else {
		print $log_file "Weka classification unsuccessful\n";
		# system() reports the failure in its return value, not in $!
		die "Weka classification unsuccessful (exit status $weka_call)\n";
	}

	open (my $out, '>', "${output_dir}${project}class.probabilities.txt") or die "Cannot create class probability file: $!";

	## making the header for the class probability file which is then to be used for the graphical representation
	print $out "sample\tSwift\tPBAT_6N\tsc_6N\tsc_9N\tWGBS\tRRBS\tscNOME\tUMI_RRBS\tPBAT_9N\tAmplicon\tNOMEseq\tTruseq\n";
	print "\nPredictions\n-------------------------------------------------------\n\nSample\tLibrary\n\n";
	print $log_file "\nPredictions\n-------------------------------------------------------\n\nSample\tLibrary\n\n";

	my $in_predictions = 0;		# true between the 'inst#' header and the next empty line
	my $i = 0;					# index of the current prediction within its table

	foreach my $line (@weka_lines) {

		unless ($in_predictions) {
			if ($line =~ /\s+inst#/) {
				$in_predictions = 1;
				$i = 0;
			}
			next;
		}

		warn $line, "\n" if $debug;

		## the prediction table ends with an empty line
		unless ($line =~ /\S+/) {
			$in_predictions = 0;
			next;
		}

		## the line starts with whitespace, so the first field is empty
		my (undef,undef,undef, $class, $probabilities) = split(/\s+/,$line);
		next unless (defined $class and defined $probabilities);
		$class =~ s/^\d+://;

		# One probability per class, in this order:
		# {Swift,6N_PBAT,6N_sc,9N_sc,WGBS,RRBS,scNOME,UMI_RRBS,9N_PBAT,Amplicon,NOMEseq,Truseq}
		# THIS NEEDS TO BE THE SAME ORDER AS THE CLASS ATTRIBUTE IN THE ARFF FILE!!
		my @class_probabilities = map { defined $_ ? $_ : '' } (split(/,/,$probabilities))[0..11];

		# removing the star from the predicted class to make them all numeric
		s/\*// foreach @class_probabilities;

		# remove the path from the file name
		my $sample = defined $files_to_analyse[$i] ? $files_to_analyse[$i] : "sample ".($i + 1);
		$sample =~ s/^.*\///;

		print $sample."\t".$class."\n";
		print $log_file $sample."\t".$class."\n";
		print $out join("\t", $sample, @class_probabilities), "\n";

		++$i;
	}
	print "\n\n";
	close $out or die "Could not close filehandle for class probabilities";
}


# Draws the class probabilities as a heatmap ('<project>probabilities.png' in
# the output directory) by piping a short script into R.
#
# Does nothing but warn and log when no R is available ($r_path is empty).
# When R fails (for example because the packages readr, tidyr or ggplot2 are
# missing) a warning is given and logged, but the run is not aborted: the
# predictions have already been written at this point.
# Side effect: an empty $output_dir is replaced by the current directory.
sub draw_graph {

	if ($r_path eq "") {
		warn "\n\nNo graphical output generated. Please set an R path if you want a probability heatmap.\nA path to R can be set using the option --r_path\n\n";
		print $log_file "R not found. No probability heatmap generated\n";
		# Return rather than exit so that the caller can still close the log file.
		return;
	}

	my $infile = "${output_dir}${project}class.probabilities.txt";
	my $graph = "${project}probabilities.png";

	if ($output_dir eq "") {
		$output_dir = getcwd();
	}

	unless ($output_dir =~ /\/$/){
		$output_dir =~ s/$/\//;
	}

	my $r_script = <<"END_SCRIPT";
	
			directory <- "$output_dir";
			file <- "$infile";
			graph <- "$graph";
		
			setwd(directory)
		
			library(readr)
			library(tidyr)
			library(ggplot2)

				read_tsv(file) -> probabilities
			
				gather(probabilities,prediction, probability, 2:13) -> probabilities.long
				graph <- ggplot(probabilities.long,aes(prediction, sample)) + geom_tile(aes(fill = probability)) + scale_fill_gradient (low = "white", high = "steelblue") +
				labs(y = "sample", x = "prediction") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
				ggsave("$graph")
		
		
END_SCRIPT

	open (my $filehandle_to_R, "| \"$r_path\" --no-save > /dev/null 2>&1") or die "Can't open pipe to '$r_path'";

	print $filehandle_to_R $r_script;

	# Closing the pipe waits for R and fails when R exits with an error.
	unless (close $filehandle_to_R) {
		warn "\n\nR could not draw the probability heatmap. No graphical output generated.\nRun >>> charades --install_r_packages <<< if R packages are missing.\n\n";
		print $log_file "R failed. No probability heatmap generated\n";
	}

	unlink("${output_dir}Rplots.pdf") if (-e "${output_dir}Rplots.pdf");

}