Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
547 lines (440 sloc) 14 KB
#!/usr/bin/perl
use strict;
use File::Temp;
use Getopt::Long;
Getopt::Long::Configure qw(pass_through no_ignore_case);
$SIG{INT} = \&sigint;
$SIG{TERM} = \&sigterm;
my @argv = @ARGV;
# key1, key2, delimiter, keep
my ($q, $k1, $k2) = ('', 1, 1);
my ($d, $fp, $overwrite, $keep, $presort, $make, $n, $header, $prefix, $multi, $dup, $debug, $uniq);
my ($hf1, $hf2);
my $kalt;
# these apply to the whole shebang
Getopt::Long::Configure qw(permute);
my @PERMUTE_OPTIONS = ("x"=>\$multi, "debug|D"=>\$debug, "K"=>\$keep, "x"=>\$multi, "m"=>\$make, "o"=>\$overwrite);
GetOptions(@PERMUTE_OPTIONS) || die usage();
# these apply to a file
Getopt::Long::Configure qw(require_order);
my @ORDERED_OPTIONS = ("k=i"=>\$kalt, "1=i"=>\$k1, "2=i"=>\$k2, "q=s"=>\$q, "d=s"=>\$d, "f"=>\$fp, "s:i"=>\$presort, "p"=>\$prefix, "n"=>\$n, "h:i"=>\$header, "u:i"=>\$uniq, "r=i"=>\$dup);
GetOptions(@ORDERED_OPTIONS) || die usage();
my $op = shift @ARGV;
$op=~s/^://;
Getopt::Long::Configure qw(no_pass_through);
GetOptions(@ORDERED_OPTIONS) || die usage();
my $f1 = shift @ARGV;
my $f2 = shift @ARGV;
# remaining options, arguments are stored!
my @fx = @ARGV;
$k1=$k2=$kalt if ($kalt);
# if file number is not set, then operate on both input files
$header = 3 if defined($header) && $header==0;
$uniq = 3 if defined($uniq) && $uniq==0;
$header = 0 if defined($header) && $header==-1;
# invert the option
# defined? sort the other, defined and zero? sort both (3-0 = 3)
my $dosort = 3-(0+(defined $presort ? ($presort==0?3:0) : 0));
my $of1=$f1;
my $of2=$f2;
die "Option -k doesn't make sense with -s or multifile" if (!$dosort && !@fx && !$multi) && $keep;
die usage() unless $op =~ /^inn?e?r?|exc?l?u?d?e?|xle?f?t?|out?e?r?$|xri?g?h?t?|lef?t?|rig?h?t?/;
die usage() if !$f2;
$op = substr($op, 0, 2);
my @tmps;
my $tmerg;
if (@fx) {
$tmerg = "$f2.tmerg";
open(OUT, ">$tmerg") || die "Can't create $f2.tmerg\n";
} else {
*OUT=*STDOUT;
}
if ($multi && $f1 =~ /\.tmerg$/) {
push @tmps, $f1;
}
my $sortop;
$sortop .= "-t '$d'" if $d;
$sortop .= '-n' if $n;
my ($f1hashead, $f2hashead);
($f1, $hf1, $f1hashead) = preproc_file($f1, $k1, $header == 3 || $header == 1, $dosort == 3 || $dosort == 1);
($f2, $hf2, $f2hashead) = preproc_file($f2, $k2, $header == 3 || $header == 2, $dosort == 3 || $dosort == 2);
warn "# f1:$f1, f2: $f2\n" if $debug;
$d = "\t" if !defined $d;
open(I1, $f1) || die "$f1: $!\n";
open(I2, $f2) || die "$f2: $!\n";
my (@B1, @B2, $l);
# buffer in
push @B1, scalar <I1>;
push @B2, scalar <I2>;
# create "undef" rows for outer/left joins
# left side is an empty array, with room for the key field
my $l=$B1[0]; chomp $l;
my @udt1 = split(/$d/, $l);
grep {$_=""} @udt1;
# right side is all delimiter
$l=$B2[0]; chomp $l;
my @t = split(/$d/, $l);
splice @t, $k2, 1; # don't replicate key field
my $ud2 = $d x ((scalar @t) - 1); # undef line 2
if ($f1hashead) {
# when you sort, the header gets stripped
warn "# discard row 1 file 1\n" if $debug;
$hf1=$B1[0]; chomp $hf1;
@B1=();
}
if ($f2hashead) {
# when you sort, the header gets stripped
$hf2=$B2[0]; chomp $hf2;
warn "# discard row 1 file 2\n" if $debug;
@B2=();
}
if ($debug) {
my $nm = $ud2 =~ s/$d/$d/g;
warn "# UD2: --$nm delims--\n";
}
$k1 -= 1;
$k2 -= 1;
my $outhead = 1;
$outhead=0 if $op eq 'ex';
$outhead=1 if $tmerg;
if ($outhead && $header && ($hf1 || $hf2)) {
# header line handline
# header on one side? mock up the other
if ($header==1) {
$hf2=$ud2;
} elsif ($header == 2) {
my $ud1 = join "\t", @udt1;
$hf1=$ud1;
}
# pull out key field from header when pasting
if ($op !~ /^xl|xr|ex$/) {
my @t = split(/$d/, $hf2);
splice @t, $k2, 1;
$hf2 = join $d, @t;
}
if ($prefix) {
# paste only
if ($op =~ /^ex|xr|xl/) {
die "Option -p makes no sense for 'ex', 'xr' or 'xl'" . ($op eq 'ex' ? ", maybe you mean -f" : "") . "\n";
}
my $p1 = $f1;
my $p2 = $f2;
# remove path and file extension
$p1 =~ s/.*\///;
$p2 =~ s/.*\///;
$p1 =~ s/\..*$//;
$p2 =~ s/\..*$//;
if (!$multi) {
my $cnt=0;
if ($op !~ /^ex|xr|xl/) {
$hf1 = join $d, map {s/^/$p1\// if $cnt++ != $k1; $_;} split(/$d/, $hf1);
}
}
if ($op !~ /^ex|xr|xl/) {
$hf2 = join $d, map {s/^/$p2\//; $_;} split(/$d/, $hf2);
}
}
# exclusive? no header for other file
$hf2="" if ($op =~ /^xl$/);
$hf1="" if ($op =~ /^xr|ex$/);
# tab if needed
$hf2="\t" . $hf2 if $hf2&&$hf1;
my $h = "$hf1$hf2\n";
if ($fp) {
$h="$d$h";
}
print OUT $h;
}
if ($fp && !($op eq 'ex')) {
die "Option -f only makes sense with ex\n";
}
my ($l1, $l2, $v1, $v2, @t, $cnt);
my ($pv1, $pv2, $pl1, $pl2); # previous values
my ($fp1); # file prefix 1
while(1) {
++$cnt;
if (! defined $l1) {
$l1 = @B1 ? shift @B1 : <I1>;
if ($multi && $fp && defined $l1) {
$l1 =~ s/^([^$d]+)$d//;
$fp1=$1;
} else {
$fp1=$f1;
}
# left join or inner... stop if run out of left file
last if ! defined $l1 && ( $op eq 'in' || $op eq 'xl' || $op eq 'le' );
if (defined $l1) {
chomp $l1;
# remove quote chars... this is DUMB... use quoted delim parser Regex::Common
@t = split(/$d/, $l1);
if ($q) {
grep s/^$q(.*)$q$/$1/, @t;
}
# key field for file 1
$v1 = $t[$k1];
} else {
$v1 = undef;
}
}
if (! defined $l2) {
# inner join or right join... stop if run out of right file
$l2 = @B2 ? shift @B2 : <I2>;
last if ! defined $l2 && ( $op eq 'in' || $op eq 'xr' || $op eq 'ri' );
if (defined $l2) {
chomp $l2;
# remove quote chars... this is DUMB... use quoted delim parser Regex::Common
@t = split(/$d/, $l2);
if ($q) {
grep s/^$q(.*)$q$/$1/, @t;
}
# key field for file 2
$v2 = $t[$k2];
if ($op !~ /^ex|xr|xl/) {
splice @t, $k2, 1; # don't replicate key field
$l2 = join $d, @t;
}
} else {
$v2 = undef;
}
}
# both empty? done.
last if ! defined $l2 && ! defined $l1;
# numeric sort?
my $c = $n ? ($v1 <=> $v2) : ($v1 cmp $v2);
# undef is always "less than"
$c = 1 if ! defined $l1 && defined $l2;
$c = -1 if ! defined $l2 && defined $l1;
# debug process
warn "# $v1<=>$v2 : $c\n" if $debug;
# comparison was equal
if ($c == 0) {
if ($op eq 'in' || $op eq 'ou' || $op eq 'le' || $op eq 'ri') {
# output the rows
print OUT $l1, "\t", $l2, "\n";
}
$pl1=$l1;
$pv1=$v1;
$pl2=$l2;
$pv2=$v2;
undef $l1 unless $dup == 2; # 1st can contain dups
undef $l2 unless $dup == 1; # 2nd can contain dups
next;
}
if ($c > 0) { # v1 > v2
if ($pv1 eq $v2) {
if ($uniq==2||$uniq==3) {
# skip dup row always
undef $l2;
next;
} else {
if ($op eq 'ou' || $op eq 'le' || $op eq 'ri') {
print OUT $pl1, "\t", $l2, "\n";
undef $l2;
next;
}
}
}
if (($uniq==2||$uniq==3) && ($pv2 eq $v2)) {
# no dups in file 2
} else {
if ($op eq 'ou' || $op eq 'ri') {
$udt1[$k1]=$v2;
my $ud1 = join "\t", @udt1;
print OUT $ud1, "\t", $l2, "\n";
}
if ($op eq 'ex' || $op eq 'xr') {
print OUT ($fp ? "$f2\t" : '') . $l2 . "\n" if defined $l2;
}
}
$pl2=$l2;
$pv2=$v2;
undef $l2; # read f2 next
}
if ($c < 0) { # v1 < v2
# prev would have matched
if ($pv2 eq $v1) {
if ($uniq==1||$uniq==3) {
# skip dup row always
undef $l1;
next;
} else {
if ($op eq 'ou' || $op eq 'le' || $op eq 'ri') {
# print pasted dup row
print OUT $l1, "\t", $pl2, "\n";
undef $l1;
next;
}
}
}
if (($uniq==1||$uniq==1) && ($pv1 eq $v1)) {
# no dups in file 1
} else {
if ($op eq 'ou' || $op eq 'le') {
print OUT $l1, "\t", $ud2, "\n";
}
if ($op eq 'ex' || $op eq 'xl') {
print OUT ($fp ? "$fp1\t" : '') . $l1 . "\n" if defined $l1;
}
}
$pl1=$l1;
$pv1=$v1;
undef $l1; # read f2 next
}
}
if (@fx) {
@argv = grep !/^$of1$/, @argv;
@argv = grep !/^$of2$/, @argv;
@ARGV = @argv;
my $pk1=$k1+1;
$k1=1;
$presort=undef;
# simulate the options processing WITHOUT those files
Getopt::Long::Configure qw(pass_through);
GetOptions(@PERMUTE_OPTIONS);
GetOptions(@ORDERED_OPTIONS);
shift @ARGV; # strip op
GetOptions(@ORDERED_OPTIONS);
my $of3 = shift @ARGV;
GetOptions(@ORDERED_OPTIONS);
my @addop;
if ($k1 == $pk1 & $presort eq undef) {
push @addop, "-s=1";
}
if ($of3 =~ /^:(.*)/) {
my $nop=$1;
$of3 = shift @ARGV;
for (my $i=0;$i<@argv;++$i) {
# remove :nop
if ($argv[$i] eq ":$nop") {
splice @argv, $i, 1;
last;
}
}
for (my $i=0;$i<@argv;++$i) {
if ($argv[$i] eq $op) {
splice @argv, $i, 1, $nop;
last;
}
}
}
unshift @argv, @addop;
my $nox=0;
for (my $i=0;$i<@argv;++$i) {
if ($argv[$i] eq $of3) {
splice @argv, $i, 1, ($tmerg, $of3);
last;
}
if ($argv[$i] eq "-x") {
$nox=1;
}
}
unshift @argv, "-x" unless $nox;
warn "# + $0 @argv\n" if $debug;
cleanup();
exec($0, @argv);
}
sub sigint {
$SIG{INT} = undef;
cleanup();
kill 2, $$;
}
sub sigterm {
$SIG{TERM} = undef;
cleanup();
kill 15, $$;
}
END {
cleanup();
}
sub cleanup {
if (!$keep) {
warn "# cleanup @tmps\n" if $debug;
for (@tmps) {
unlink $_;
}
}
}
sub preproc_file {
my ($file, $key, $dohead, $dosort) = @_;
die "Won't overwrite $file.sorted\n" if !$make && -e "$file.sorted" && !$overwrite && $dosort;
my $head;
my $of="cat '$file'";
# todo... use tee, fifo to handle stream input
if ($dohead) {
$of.="|tail -n +2";
if ($dosort) {
$head=`head -n 1 '$file'`;
}
chomp $head;
}
if (!$make || (stat("$file.sorted"))[9] <= (stat($file))[9]) {
$of.="|sort $sortop -k $key,$key" if ($dosort);
if ($dosort) {
warn("# + $of > '$file.sorted'\n") if $debug;
system("$of > '$file.sorted'\n") && exit (1);
push @tmps, "$file.sorted";
}
}
my $has_head=$dohead;
if ($dosort) {
$file="$file.sorted";
$has_head = 0;
}
return ($file, $head, $has_head);
}
sub usage {
<<EOF
Usage: xjoin [options] [:]<operator> <f1> <f2> [...*]
Joins file 1 and file 2 by the first column, suitable
for arbitratily large files (disk-based sort).
Operator is one of:
# Pasted ops, combines rows:
in[ner] return rows in common
le[ft] return rows in common, left joined
ri[ght] return rows in common, right joined
ou[ter] return all rows, outer joined
# Exclusive (not pasted) ops, only return rows from 1 file:
ex[clude] return only those rows with nothing in common (see -f)
xl[eft] return left file rows that are not in right file
xr[ight] return right file rows that are not in left file
Common options:
-1,-2=N per file, column number to join on (def 1)
-k=N set the key column to N (for both files)
-d STR column delimiter (def tab)
-q STR quote char (def none)
-h [N] files have headers (optionally, N is the file number)
-u [N] files may contain duplicate entries, only output first match
-s [N] files are already sorted, don't sort first
-n numeric sort key columns
-p prefix headers with filename/
-f prefix rows with the input file name (op:ex only)
Debugging options:
-D debug mode
-K keep sorted files (named file.sorted)
-o always overwrite temp files if present (see -k)
-m use make-semantics with sorted files (see -k)
Options -1,-2, -s, -r, -h are optionally file specific. They can be
called with a number. 1 refers to the left file, 2 refers to the right,
3 (or a missing number) refers to both, -1 means "turn this option off".
Multi file operation:
More than 2 files will result in multiple pairwise calls to
xjoin. The use of the file-specific options can be confusing.
The base, common case... where all the option stay the same, will
not change in future versions.
Multi mode operation:
It's possible to merge two with an inner join, then merge with a left
join, by sticking a colon-prefixed operator in the file list. You
can also change the options at that time.
EXAMPLE1 (common operation):
xjoin in file1 file2 file3
EXAMPLE2 (sort is assumed for 3rd file, but not the first 2):
xjoin in file1 file2 -s file3
EXAMPLE3 (inner join with headers on column 1, then inner join the result using column 3)
xjoin in -h test1.txt test2.txt -k 3 test3.txt
EXAMPLE4 (inner join header on first 2, then outer join, no header on first file)
xjoin in -h test1.txt test2.txt -h 2 :ou test3.txt
TODO: Restructure the options using a custom parser that allows positional
options.... perl's GetOpt isn't flexible enough to capture overriding on
multiple file merges.
EOF
}