Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/perl | |
| use strict; | |
| use File::Temp; | |
| use Getopt::Long; | |
| Getopt::Long::Configure qw(pass_through no_ignore_case); | |
| $SIG{INT} = \&sigint; | |
| $SIG{TERM} = \&sigterm; | |
| my @argv = @ARGV; | |
| # key1, key2, delimiter, keep | |
| my ($q, $k1, $k2) = ('', 1, 1); | |
| my ($d, $fp, $overwrite, $keep, $presort, $make, $n, $header, $prefix, $multi, $dup, $debug, $uniq); | |
| my ($hf1, $hf2); | |
| my $kalt; | |
| # these apply to the whole shebang | |
| Getopt::Long::Configure qw(permute); | |
| my @PERMUTE_OPTIONS = ("x"=>\$multi, "debug|D"=>\$debug, "K"=>\$keep, "x"=>\$multi, "m"=>\$make, "o"=>\$overwrite); | |
| GetOptions(@PERMUTE_OPTIONS) || die usage(); | |
| # these apply to a file | |
| Getopt::Long::Configure qw(require_order); | |
| my @ORDERED_OPTIONS = ("k=i"=>\$kalt, "1=i"=>\$k1, "2=i"=>\$k2, "q=s"=>\$q, "d=s"=>\$d, "f"=>\$fp, "s:i"=>\$presort, "p"=>\$prefix, "n"=>\$n, "h:i"=>\$header, "u:i"=>\$uniq, "r=i"=>\$dup); | |
| GetOptions(@ORDERED_OPTIONS) || die usage(); | |
| my $op = shift @ARGV; | |
| $op=~s/^://; | |
| Getopt::Long::Configure qw(no_pass_through); | |
| GetOptions(@ORDERED_OPTIONS) || die usage(); | |
| my $f1 = shift @ARGV; | |
| my $f2 = shift @ARGV; | |
| # remaining options, arguments are stored! | |
| my @fx = @ARGV; | |
| $k1=$k2=$kalt if ($kalt); | |
| # if file number is not set, then operate on both input files | |
| $header = 3 if defined($header) && $header==0; | |
| $uniq = 3 if defined($uniq) && $uniq==0; | |
| $header = 0 if defined($header) && $header==-1; | |
| # invert the option | |
| # defined? sort the other, defined and zero? sort both (3-0 = 3) | |
| my $dosort = 3-(0+(defined $presort ? ($presort==0?3:0) : 0)); | |
| my $of1=$f1; | |
| my $of2=$f2; | |
| die "Option -k doesn't make sense with -s or multifile" if (!$dosort && !@fx && !$multi) && $keep; | |
| die usage() unless $op =~ /^inn?e?r?|exc?l?u?d?e?|xle?f?t?|out?e?r?$|xri?g?h?t?|lef?t?|rig?h?t?/; | |
| die usage() if !$f2; | |
| $op = substr($op, 0, 2); | |
| my @tmps; | |
| my $tmerg; | |
| if (@fx) { | |
| $tmerg = "$f2.tmerg"; | |
| open(OUT, ">$tmerg") || die "Can't create $f2.tmerg\n"; | |
| } else { | |
| *OUT=*STDOUT; | |
| } | |
| if ($multi && $f1 =~ /\.tmerg$/) { | |
| push @tmps, $f1; | |
| } | |
| my $sortop; | |
| $sortop .= "-t '$d'" if $d; | |
| $sortop .= '-n' if $n; | |
| my ($f1hashead, $f2hashead); | |
| ($f1, $hf1, $f1hashead) = preproc_file($f1, $k1, $header == 3 || $header == 1, $dosort == 3 || $dosort == 1); | |
| ($f2, $hf2, $f2hashead) = preproc_file($f2, $k2, $header == 3 || $header == 2, $dosort == 3 || $dosort == 2); | |
| warn "# f1:$f1, f2: $f2\n" if $debug; | |
| $d = "\t" if !defined $d; | |
| open(I1, $f1) || die "$f1: $!\n"; | |
| open(I2, $f2) || die "$f2: $!\n"; | |
| my (@B1, @B2, $l); | |
| # buffer in | |
| push @B1, scalar <I1>; | |
| push @B2, scalar <I2>; | |
| # create "undef" rows for outer/left joins | |
| # left side is an empty array, with room for the key field | |
| my $l=$B1[0]; chomp $l; | |
| my @udt1 = split(/$d/, $l); | |
| grep {$_=""} @udt1; | |
| # right side is all delimiter | |
| $l=$B2[0]; chomp $l; | |
| my @t = split(/$d/, $l); | |
| splice @t, $k2, 1; # don't replicate key field | |
| my $ud2 = $d x ((scalar @t) - 1); # undef line 2 | |
| if ($f1hashead) { | |
| # when you sort, the header gets stripped | |
| warn "# discard row 1 file 1\n" if $debug; | |
| $hf1=$B1[0]; chomp $hf1; | |
| @B1=(); | |
| } | |
| if ($f2hashead) { | |
| # when you sort, the header gets stripped | |
| $hf2=$B2[0]; chomp $hf2; | |
| warn "# discard row 1 file 2\n" if $debug; | |
| @B2=(); | |
| } | |
| if ($debug) { | |
| my $nm = $ud2 =~ s/$d/$d/g; | |
| warn "# UD2: --$nm delims--\n"; | |
| } | |
| $k1 -= 1; | |
| $k2 -= 1; | |
| my $outhead = 1; | |
| $outhead=0 if $op eq 'ex'; | |
| $outhead=1 if $tmerg; | |
| if ($outhead && $header && ($hf1 || $hf2)) { | |
| # header line handline | |
| # header on one side? mock up the other | |
| if ($header==1) { | |
| $hf2=$ud2; | |
| } elsif ($header == 2) { | |
| my $ud1 = join "\t", @udt1; | |
| $hf1=$ud1; | |
| } | |
| # pull out key field from header when pasting | |
| if ($op !~ /^xl|xr|ex$/) { | |
| my @t = split(/$d/, $hf2); | |
| splice @t, $k2, 1; | |
| $hf2 = join $d, @t; | |
| } | |
| if ($prefix) { | |
| # paste only | |
| if ($op =~ /^ex|xr|xl/) { | |
| die "Option -p makes no sense for 'ex', 'xr' or 'xl'" . ($op eq 'ex' ? ", maybe you mean -f" : "") . "\n"; | |
| } | |
| my $p1 = $f1; | |
| my $p2 = $f2; | |
| # remove path and file extension | |
| $p1 =~ s/.*\///; | |
| $p2 =~ s/.*\///; | |
| $p1 =~ s/\..*$//; | |
| $p2 =~ s/\..*$//; | |
| if (!$multi) { | |
| my $cnt=0; | |
| if ($op !~ /^ex|xr|xl/) { | |
| $hf1 = join $d, map {s/^/$p1\// if $cnt++ != $k1; $_;} split(/$d/, $hf1); | |
| } | |
| } | |
| if ($op !~ /^ex|xr|xl/) { | |
| $hf2 = join $d, map {s/^/$p2\//; $_;} split(/$d/, $hf2); | |
| } | |
| } | |
| # exclusive? no header for other file | |
| $hf2="" if ($op =~ /^xl$/); | |
| $hf1="" if ($op =~ /^xr|ex$/); | |
| # tab if needed | |
| $hf2="\t" . $hf2 if $hf2&&$hf1; | |
| my $h = "$hf1$hf2\n"; | |
| if ($fp) { | |
| $h="$d$h"; | |
| } | |
| print OUT $h; | |
| } | |
| if ($fp && !($op eq 'ex')) { | |
| die "Option -f only makes sense with ex\n"; | |
| } | |
| my ($l1, $l2, $v1, $v2, @t, $cnt); | |
| my ($pv1, $pv2, $pl1, $pl2); # previous values | |
| my ($fp1); # file prefix 1 | |
| while(1) { | |
| ++$cnt; | |
| if (! defined $l1) { | |
| $l1 = @B1 ? shift @B1 : <I1>; | |
| if ($multi && $fp && defined $l1) { | |
| $l1 =~ s/^([^$d]+)$d//; | |
| $fp1=$1; | |
| } else { | |
| $fp1=$f1; | |
| } | |
| # left join or inner... stop if run out of left file | |
| last if ! defined $l1 && ( $op eq 'in' || $op eq 'xl' || $op eq 'le' ); | |
| if (defined $l1) { | |
| chomp $l1; | |
| # remove quote chars... this is DUMB... use quoted delim parser Regex::Common | |
| @t = split(/$d/, $l1); | |
| if ($q) { | |
| grep s/^$q(.*)$q$/$1/, @t; | |
| } | |
| # key field for file 1 | |
| $v1 = $t[$k1]; | |
| } else { | |
| $v1 = undef; | |
| } | |
| } | |
| if (! defined $l2) { | |
| # inner join or right join... stop if run out of right file | |
| $l2 = @B2 ? shift @B2 : <I2>; | |
| last if ! defined $l2 && ( $op eq 'in' || $op eq 'xr' || $op eq 'ri' ); | |
| if (defined $l2) { | |
| chomp $l2; | |
| # remove quote chars... this is DUMB... use quoted delim parser Regex::Common | |
| @t = split(/$d/, $l2); | |
| if ($q) { | |
| grep s/^$q(.*)$q$/$1/, @t; | |
| } | |
| # key field for file 2 | |
| $v2 = $t[$k2]; | |
| if ($op !~ /^ex|xr|xl/) { | |
| splice @t, $k2, 1; # don't replicate key field | |
| $l2 = join $d, @t; | |
| } | |
| } else { | |
| $v2 = undef; | |
| } | |
| } | |
| # both empty? done. | |
| last if ! defined $l2 && ! defined $l1; | |
| # numeric sort? | |
| my $c = $n ? ($v1 <=> $v2) : ($v1 cmp $v2); | |
| # undef is always "less than" | |
| $c = 1 if ! defined $l1 && defined $l2; | |
| $c = -1 if ! defined $l2 && defined $l1; | |
| # debug process | |
| warn "# $v1<=>$v2 : $c\n" if $debug; | |
| # comparison was equal | |
| if ($c == 0) { | |
| if ($op eq 'in' || $op eq 'ou' || $op eq 'le' || $op eq 'ri') { | |
| # output the rows | |
| print OUT $l1, "\t", $l2, "\n"; | |
| } | |
| $pl1=$l1; | |
| $pv1=$v1; | |
| $pl2=$l2; | |
| $pv2=$v2; | |
| undef $l1 unless $dup == 2; # 1st can contain dups | |
| undef $l2 unless $dup == 1; # 2nd can contain dups | |
| next; | |
| } | |
| if ($c > 0) { # v1 > v2 | |
| if ($pv1 eq $v2) { | |
| if ($uniq==2||$uniq==3) { | |
| # skip dup row always | |
| undef $l2; | |
| next; | |
| } else { | |
| if ($op eq 'ou' || $op eq 'le' || $op eq 'ri') { | |
| print OUT $pl1, "\t", $l2, "\n"; | |
| undef $l2; | |
| next; | |
| } | |
| } | |
| } | |
| if (($uniq==2||$uniq==3) && ($pv2 eq $v2)) { | |
| # no dups in file 2 | |
| } else { | |
| if ($op eq 'ou' || $op eq 'ri') { | |
| $udt1[$k1]=$v2; | |
| my $ud1 = join "\t", @udt1; | |
| print OUT $ud1, "\t", $l2, "\n"; | |
| } | |
| if ($op eq 'ex' || $op eq 'xr') { | |
| print OUT ($fp ? "$f2\t" : '') . $l2 . "\n" if defined $l2; | |
| } | |
| } | |
| $pl2=$l2; | |
| $pv2=$v2; | |
| undef $l2; # read f2 next | |
| } | |
| if ($c < 0) { # v1 < v2 | |
| # prev would have matched | |
| if ($pv2 eq $v1) { | |
| if ($uniq==1||$uniq==3) { | |
| # skip dup row always | |
| undef $l1; | |
| next; | |
| } else { | |
| if ($op eq 'ou' || $op eq 'le' || $op eq 'ri') { | |
| # print pasted dup row | |
| print OUT $l1, "\t", $pl2, "\n"; | |
| undef $l1; | |
| next; | |
| } | |
| } | |
| } | |
| if (($uniq==1||$uniq==1) && ($pv1 eq $v1)) { | |
| # no dups in file 1 | |
| } else { | |
| if ($op eq 'ou' || $op eq 'le') { | |
| print OUT $l1, "\t", $ud2, "\n"; | |
| } | |
| if ($op eq 'ex' || $op eq 'xl') { | |
| print OUT ($fp ? "$fp1\t" : '') . $l1 . "\n" if defined $l1; | |
| } | |
| } | |
| $pl1=$l1; | |
| $pv1=$v1; | |
| undef $l1; # read f2 next | |
| } | |
| } | |
| if (@fx) { | |
| @argv = grep !/^$of1$/, @argv; | |
| @argv = grep !/^$of2$/, @argv; | |
| @ARGV = @argv; | |
| my $pk1=$k1+1; | |
| $k1=1; | |
| $presort=undef; | |
| # simulate the options processing WITHOUT those files | |
| Getopt::Long::Configure qw(pass_through); | |
| GetOptions(@PERMUTE_OPTIONS); | |
| GetOptions(@ORDERED_OPTIONS); | |
| shift @ARGV; # strip op | |
| GetOptions(@ORDERED_OPTIONS); | |
| my $of3 = shift @ARGV; | |
| GetOptions(@ORDERED_OPTIONS); | |
| my @addop; | |
| if ($k1 == $pk1 & $presort eq undef) { | |
| push @addop, "-s=1"; | |
| } | |
| if ($of3 =~ /^:(.*)/) { | |
| my $nop=$1; | |
| $of3 = shift @ARGV; | |
| for (my $i=0;$i<@argv;++$i) { | |
| # remove :nop | |
| if ($argv[$i] eq ":$nop") { | |
| splice @argv, $i, 1; | |
| last; | |
| } | |
| } | |
| for (my $i=0;$i<@argv;++$i) { | |
| if ($argv[$i] eq $op) { | |
| splice @argv, $i, 1, $nop; | |
| last; | |
| } | |
| } | |
| } | |
| unshift @argv, @addop; | |
| my $nox=0; | |
| for (my $i=0;$i<@argv;++$i) { | |
| if ($argv[$i] eq $of3) { | |
| splice @argv, $i, 1, ($tmerg, $of3); | |
| last; | |
| } | |
| if ($argv[$i] eq "-x") { | |
| $nox=1; | |
| } | |
| } | |
| unshift @argv, "-x" unless $nox; | |
| warn "# + $0 @argv\n" if $debug; | |
| cleanup(); | |
| exec($0, @argv); | |
| } | |
| sub sigint { | |
| $SIG{INT} = undef; | |
| cleanup(); | |
| kill 2, $$; | |
| } | |
| sub sigterm { | |
| $SIG{TERM} = undef; | |
| cleanup(); | |
| kill 15, $$; | |
| } | |
| END { | |
| cleanup(); | |
| } | |
| sub cleanup { | |
| if (!$keep) { | |
| warn "# cleanup @tmps\n" if $debug; | |
| for (@tmps) { | |
| unlink $_; | |
| } | |
| } | |
| } | |
| sub preproc_file { | |
| my ($file, $key, $dohead, $dosort) = @_; | |
| die "Won't overwrite $file.sorted\n" if !$make && -e "$file.sorted" && !$overwrite && $dosort; | |
| my $head; | |
| my $of="cat '$file'"; | |
| # todo... use tee, fifo to handle stream input | |
| if ($dohead) { | |
| $of.="|tail -n +2"; | |
| if ($dosort) { | |
| $head=`head -n 1 '$file'`; | |
| } | |
| chomp $head; | |
| } | |
| if (!$make || (stat("$file.sorted"))[9] <= (stat($file))[9]) { | |
| $of.="|sort $sortop -k $key,$key" if ($dosort); | |
| if ($dosort) { | |
| warn("# + $of > '$file.sorted'\n") if $debug; | |
| system("$of > '$file.sorted'\n") && exit (1); | |
| push @tmps, "$file.sorted"; | |
| } | |
| } | |
| my $has_head=$dohead; | |
| if ($dosort) { | |
| $file="$file.sorted"; | |
| $has_head = 0; | |
| } | |
| return ($file, $head, $has_head); | |
| } | |
| sub usage { | |
| <<EOF | |
| Usage: xjoin [options] [:]<operator> <f1> <f2> [...*] | |
| Joins file 1 and file 2 by the first column, suitable | |
| for arbitratily large files (disk-based sort). | |
| Operator is one of: | |
| # Pasted ops, combines rows: | |
| in[ner] return rows in common | |
| le[ft] return rows in common, left joined | |
| ri[ght] return rows in common, right joined | |
| ou[ter] return all rows, outer joined | |
| # Exclusive (not pasted) ops, only return rows from 1 file: | |
| ex[clude] return only those rows with nothing in common (see -f) | |
| xl[eft] return left file rows that are not in right file | |
| xr[ight] return right file rows that are not in left file | |
| Common options: | |
| -1,-2=N per file, column number to join on (def 1) | |
| -k=N set the key column to N (for both files) | |
| -d STR column delimiter (def tab) | |
| -q STR quote char (def none) | |
| -h [N] files have headers (optionally, N is the file number) | |
| -u [N] files may contain duplicate entries, only output first match | |
| -s [N] files are already sorted, don't sort first | |
| -n numeric sort key columns | |
| -p prefix headers with filename/ | |
| -f prefix rows with the input file name (op:ex only) | |
| Debugging options: | |
| -D debug mode | |
| -K keep sorted files (named file.sorted) | |
| -o always overwrite temp files if present (see -k) | |
| -m use make-semantics with sorted files (see -k) | |
| Options -1,-2, -s, -r, -h are optionally file specific. They can be | |
| called with a number. 1 refers to the left file, 2 refers to the right, | |
| 3 (or a missing number) refers to both, -1 means "turn this option off". | |
| Multi file operation: | |
| More than 2 files will result in multiple pairwise calls to | |
| xjoin. The use of the file-specific options can be confusing. | |
| The base, common case... where all the option stay the same, will | |
| not change in future versions. | |
| Multi mode operation: | |
| It's possible to merge two with an inner join, then merge with a left | |
| join, by sticking a colon-prefixed operator in the file list. You | |
| can also change the options at that time. | |
| EXAMPLE1 (common operation): | |
| xjoin in file1 file2 file3 | |
| EXAMPLE2 (sort is assumed for 3rd file, but not the first 2): | |
| xjoin in file1 file2 -s file3 | |
| EXAMPLE3 (inner join with headers on column 1, then inner join the result using column 3) | |
| xjoin in -h test1.txt test2.txt -k 3 test3.txt | |
| EXAMPLE4 (inner join header on first 2, then outer join, no header on first file) | |
| xjoin in -h test1.txt test2.txt -h 2 :ou test3.txt | |
| TODO: Restructure the options using a custom parser that allows positional | |
| options.... perl's GetOpt isn't flexible enough to capture overriding on | |
| multiple file merges. | |
| EOF | |
| } | |