Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Braid: Added mirror 'tally' at 'f3eb780'

  • Loading branch information...
commit 8b80e9cf61c2712823038879727c9a3817369ae8 1 parent 26db734
David R. MacIver authored
8 .braids
View
@@ -0,0 +1,8 @@
+---
+tally:
+ squashed: true
+ url: git://github.com/DRMacIver/tally.git
+ branch: master
+ type: git
+ revision: f3eb780739904ae6609116c89f477f8b44efe30c
+ remote: braid/tally
6 tally/README
View
@@ -0,0 +1,6 @@
+A simple C program for string counting. It's totally trivial functionality-wise, but I needed a fast one (sort + uniq wasn't cutting it), so I implemented this.
+
+Reads lines in from stdin or a file. Outputs lines in the form
+count line
+
+Stores everything in memory, so relies on the virtual memory system to deal with paging off to disk. This results in it being substantially faster than the traditional sort | uniq -c approach.
80 tally/Rakefile
View
@@ -0,0 +1,80 @@
+require 'rake/clean'
+
+CLEAN.include('*.o', 'sample', 'reference')
+CLOBBER.include('tally', 'smalldata')
+
+# C sources and headers in this directory, plus one object file per source.
+SRC = FileList['*.c']
+HEADERS = FileList['*.h']
+OBJ = SRC.ext('o')
+
+# Bound on the number of words per generated test line; the DATA_LINE_LENGTH
+# environment variable overrides the default of 500.
+DATA_LINE_LENGTH = (ENV['DATA_LINE_LENGTH'] || 500).to_i
+
+# Compiler flags; setting PROFILE in the environment adds -pg.
+profile_flag = ENV['PROFILE'] ? ' -pg' : ''
+CC_OPTS = "-Wall#{profile_flag} -O3"
+
+rule '.o' => '.c' do |t|
+ sh "cc #{CC_OPTS} -c -o #{t.name} #{t.source}"
+end
+
+file "tally" => OBJ do
+ sh "cc #{CC_OPTS} -o tally #{OBJ}"
+end
+
+file "kitten" => ["kitten.o", "lines.o"] do
+ sh "cc #{CC_OPTS} -o kitten kitten.o lines.o"
+end
+
+task :default => "tally"
+
+file "kitten" => "kitten.o"
+file 'tally' => 'tally.o';
+file 'tally.o' => HEADERS + FileList['tally.c']
+file 'hash_tally.o' => 'hash.c'
+
+task :run => :compile do
+ sh "tally"
+end
+
+file "smalldata" do
+ text = ["foo", "bar", "baz", "bif", "bing"]
+
+ File.open("smalldata", "w") do |out|
+ 100000.times do |i|
+ out.puts((0..rand(DATA_LINE_LENGTH)).map{text[rand(text.length)]}.join(" "))
+ end
+ end
+end
+
+file "data" => "smalldata" do
+ sh "cp smalldata data"
+ 5.times do
+ sh "cat data data > data2 && mv data2 data"
+ end
+end
+
+task "time" => ["tally", "data"] do
+ start = Time.now
+ sh "./tally < data > /dev/null"
+ puts (Time.now - start)
+end
+
+file "reference" => "smalldata" do
+ sh "LC_ALL=C sort < smalldata | uniq -c | sed 's/^ \\+//' | sort > reference"
+end
+
+file "sample" => ["tally", "smalldata"] do
+ sh "./tally smalldata | sort > sample"
+end
+
+task "test" => ["reference", "sample"] do
+ m1, f1, m2, f2 = %x{md5sum reference sample}.split
+
+ puts "#{f1}: #{m1}"
+ puts "#{f2}: #{m2}"
+
+ if (m1 != m2)
+ STDERR.puts "md5s differ"
+ exit(-1)
+ else
+ puts "md5s the same"
+ end
+end
61 tally/hash.c
View
@@ -0,0 +1,61 @@
+// Paul Hsieh's hash function: http://www.azillionmonkeys.com/qed/hash.html
+#include <stdint.h>
+#include <string.h>
+#undef get16bits
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
+ || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
+#define get16bits(d) (*((const uint16_t *) (d)))
+#endif
+
+#if !defined (get16bits)
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+ +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+
+// Hashes the NUL-terminated string `data` to a 32-bit value.
+// Returns 0 for a NULL pointer or an empty string.
+uint32_t SuperFastHash (const char * data) {
+  uint32_t hash, tmp;
+  int len, rem;
+
+  // Reject NULL before strlen() dereferences the pointer.
+  if (data == NULL) return 0;
+
+  len = strlen(data);
+  if (len <= 0) return 0;
+
+  hash = len;
+  rem = len & 3;   // bytes left over after the 4-byte main loop
+  len >>= 2;       // number of 4-byte groups
+
+  /* Main loop */
+  for (;len > 0; len--) {
+    hash += get16bits (data);
+    tmp = (get16bits (data+2) << 11) ^ hash;
+    hash = (hash << 16) ^ tmp;
+    data += 2*sizeof (uint16_t);
+    hash += hash >> 11;
+  }
+
+  /* Handle end cases */
+  switch (rem) {
+    case 3: hash += get16bits (data);
+            hash ^= hash << 16;
+            hash ^= data[sizeof (uint16_t)] << 18;
+            hash += hash >> 11;
+            break;
+    case 2: hash += get16bits (data);
+            hash ^= hash << 11;
+            hash += hash >> 17;
+            break;
+    case 1: hash += *data;
+            hash ^= hash << 10;
+            hash += hash >> 1;
+  }
+
+  /* Force "avalanching" of final 127 bits */
+  hash ^= hash << 3;
+  hash += hash >> 5;
+  hash ^= hash << 4;
+  hash += hash >> 17;
+  hash ^= hash << 25;
+  hash += hash >> 6;
+
+  return hash;
+}
+
5 tally/hash.h
View
@@ -0,0 +1,5 @@
+#ifndef SUPER_FAST_HASH
+#define SUPER_FAST_HASH
+#include <stdint.h>
+// Hashes a NUL-terminated string to a 32-bit value (implemented in hash.c).
+uint32_t SuperFastHash (const char *data);
+#endif
135 tally/hash_counter.c
View
@@ -0,0 +1,135 @@
+#include <stdlib.h>
+#include <string.h>
+#include "hash.h"
+// hash_record, hash_counter_struct and hash_counter come from the public
+// header, so the layout used here cannot drift from the one callers see.
+#include "hash_counter.h"
+
+// Number of slots in a freshly created table. Lookups mask the hash with
+// (num_records - 1), so this must be a power of two.
+#define DEFAULT_HASH_SIZE 1024
+// Not referenced anywhere in this file.
+#define DEFAULT_STORAGE_SIZE 1024
+// The table is doubled once occupied_records reaches this fraction of
+// num_records.
+#define MAX_OCCUPANCY 0.75
+#define hash_of_string SuperFastHash
+
+// Allocates an empty counter with DEFAULT_HASH_SIZE zeroed slots.
+// Returns NULL if either allocation fails.
+hash_counter hash_counter_create(){
+  hash_counter it = malloc(sizeof(hash_counter_struct));
+  if(!it) return NULL;
+
+  // calloc zeroes every slot; a NULL key marks a slot as empty.
+  it->records = calloc(DEFAULT_HASH_SIZE, sizeof(hash_record));
+  if(!it->records){
+    free(it);
+    return NULL;
+  }
+
+  it->num_records = DEFAULT_HASH_SIZE;
+  it->occupied_records = 0;
+  return it;
+}
+
+// Frees every stored key, the slot array and the counter itself.
+// Passing NULL is a no-op, mirroring free().
+void hash_counter_destroy(hash_counter hc){
+  int i;
+  if(!hc) return;
+
+  for(i = 0; i < hc->num_records; i++){
+    // Empty slots hold a NULL key, which free() accepts.
+    free(hc->records[i].key);
+  }
+  free(hc->records);
+  free(hc);
+}
+
+// Locates the slot for `string`, whose hash is `hash`.
+// Returns the matching record when the key is present. Otherwise returns the
+// first empty slot on the probe sequence, with its hash field already set to
+// `hash` and its key still NULL; the caller decides whether to claim it.
+// The loop ends only at a match or an empty slot, so the table must never be
+// completely full.
+hash_record* find_record_with_hash(hash_counter it, char *string, uint32_t hash){
+  // num_records is a power of two, so masking picks a valid slot index.
+  uint32_t mask = (uint32_t)it->num_records - 1;
+  uint32_t index = hash & mask;
+
+  // Unsigned arithmetic keeps the probe recurrence well defined when it
+  // wraps; with signed int the multiplication could overflow, which is
+  // undefined behaviour in C. The low bits, and so the probe order, are the
+  // same either way.
+  uint32_t perturb = index;
+  uint32_t j = hash;
+
+  hash_record *target = it->records + index;
+  while(target->key){
+    // Compare the cheap stored hash first, the string only on a hash match.
+    if((target->hash == hash) && (0 == strcmp(target->key, string))){
+      return target;
+    }
+
+    j = 5 * j + 1 + perturb;
+    perturb >>= 5;
+    index = j & mask;
+    target = it->records + index;
+  }
+
+  // We make sure the target has the right hash here. We might not
+  // actually care about this, but if we don't it will just get
+  // overwritten later.
+  target->hash = hash;
+
+  return target;
+}
+
+// Hashes `string` and delegates the lookup to find_record_with_hash.
+hash_record* find_record(hash_counter it, char *string){
+  uint32_t hash = hash_of_string(string);
+  return find_record_with_hash(it, string, hash);
+}
+
+
+// Doubles the table once occupancy has reached MAX_OCCUPANCY and re-inserts
+// every stored key into the new slot array.
+// Returns 1 if the table was rebuilt (hash_record pointers obtained earlier
+// are then invalid) and 0 if nothing changed. If the larger array cannot be
+// allocated, the table is left untouched and 0 is returned.
+int grow_hash_counter(hash_counter it){
+  if(it->occupied_records < MAX_OCCUPANCY * it->num_records) return 0;
+
+  int old_count = it->num_records;
+  hash_record *old_records = it->records;
+
+  // calloc zeroes the new slots; a NULL key marks a slot as empty.
+  hash_record *new_records = calloc((size_t)old_count * 2, sizeof(hash_record));
+  if(!new_records) return 0;
+
+  it->records = new_records;
+  it->num_records = old_count * 2;
+
+  int i;
+  for(i = 0; i < old_count; i++){
+    hash_record *record = old_records + i;
+    if(record->key){
+      // Every occupied record already stores its hash, so the key does not
+      // have to be hashed again. The key pointer itself is moved, not copied.
+      hash_record *new_record = find_record_with_hash(it, record->key, record->hash);
+      new_record->key = record->key;
+      new_record->value = record->value;
+    }
+  }
+  free(old_records);
+  return 1;
+}
+
+// Returns the value held in the slot that find_record yields for `string`.
+int hash_counter_count(hash_counter it, char *string){
+  hash_record *record = find_record(it, string);
+  return record->value;
+}
+
+// Returns the number of occupied records, i.e. distinct keys inserted.
+int hash_counter_size(hash_counter it){
+  int occupied = it->occupied_records;
+  return occupied;
+}
+
+// Returns the record for `key`, inserting a new one when it is absent.
+// The table gets a chance to grow before the insert; if it did grow, the slot
+// is looked up again because the earlier pointer refers to the freed array.
+// A new record stores a strdup'd copy of `key`; its value is not set here.
+hash_record *find_or_create_record(hash_counter it, char *key){
+  hash_record *slot = find_record(it, key);
+  if(slot->key) return slot;
+
+  if(grow_hash_counter(it)) slot = find_record(it, key);
+
+  slot->key = strdup(key);
+  it->occupied_records++;
+  return slot;
+}
+
+// Stores `value` under `key`, creating the record if necessary.
+void hash_counter_set_value(hash_counter it, char *key, int value){
+  hash_record *record = find_or_create_record(it, key);
+  record->value = value;
+}
+
+// Returns the value held in the slot that find_record yields for `key`.
+int hash_counter_get_value(hash_counter it, char *key){
+  hash_record *record = find_record(it, key);
+  return record->value;
+}
+
+// Adds one to the value stored under `key`, creating the record if needed.
+void hash_counter_increment(hash_counter it, char *key){
+  hash_record *record = find_or_create_record(it, key);
+  record->value++;
+}
25 tally/hash_counter.h
View
@@ -0,0 +1,25 @@
+#ifndef HASH_COUNTER
+#define HASH_COUNTER
+#include <stdint.h>
+
+// One slot of the table. A NULL key marks the slot as empty.
+typedef struct {
+  uint32_t hash;
+  char *key;
+  int value;
+} hash_record;
+
+// The table itself: `records` has `num_records` slots, of which
+// `occupied_records` currently hold a key.
+typedef struct {
+  hash_record *records;
+  int num_records;
+  int occupied_records;
+} hash_counter_struct;
+
+typedef hash_counter_struct *hash_counter;
+
+extern hash_counter hash_counter_create();
+extern void hash_counter_destroy(hash_counter it);
+extern void hash_counter_increment(hash_counter it, char *key);
+extern void hash_counter_set_value(hash_counter it, char *key, int value);
+extern int hash_counter_get_value(hash_counter it, char *key);
+extern int hash_counter_count(hash_counter it, char *string);
+extern int hash_counter_size(hash_counter it);
+
+#endif
9 tally/kitten.c
View
@@ -0,0 +1,9 @@
+#include <stdio.h>
+#include "lines.h"
+
+// Copies stdin to stdout one line at a time; each output line is the input
+// line followed by a space and a newline.
+int main(){
+  for(;;){
+    if(!read_line(stdin)) break;
+    printf("%s \n", current_line);
+  }
+  return 0;
+}
49 tally/lines.c
View
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Capacity in bytes of the buffer behind current_line; doubled on demand.
+int current_line_length = 1024;
+// The most recently read line, NUL-terminated and without its newline.
+char *current_line = NULL;
+
+// Set once a final line without a trailing newline has been returned, so the
+// next call reports end of input.
+int hit_end = 0;
+
+// Reads the next line of `file` into current_line.
+// Returns 1 when a line was read and 0 at end of input. Running out of
+// memory is also reported as 0, so callers cannot tell the two apart; the
+// buffer that was already allocated stays valid in that case.
+int read_line(FILE *file){
+  if(!current_line){
+    current_line = malloc(current_line_length);
+    if (!current_line) return 0;
+  }
+
+  if(hit_end) return 0;
+
+  // Zeroing the whole buffer means the line is always NUL-terminated.
+  memset(current_line, '\0', current_line_length);
+
+  int i = 0;
+  while(1){
+    int c = fgetc(file);
+
+    if(c == EOF){
+      if(i == 0) return 0;
+      hit_end = 1;
+      return 1;
+    }
+
+    if(c == '\n') return 1;
+
+    // Keep one byte free for the terminator; double the buffer when full.
+    if(i >= current_line_length - 1){
+      // Assign through a temporary: when realloc fails it returns NULL and
+      // the old block is still owned by current_line.
+      char *bigger = realloc(current_line, (size_t)current_line_length * 2);
+      if(!bigger) return 0;
+      current_line = bigger;
+      memset(current_line + current_line_length, '\0', current_line_length);
+      current_line_length *= 2;
+    }
+
+    current_line[i++] = c;
+  }
+}
+
+// Releases the line buffer and clears the end-of-input flag, so read_line
+// can be used again afterwards (for example on a different FILE).
+extern void reset_lines(){
+  free(current_line);  // free(NULL) is a no-op
+  current_line = NULL;
+  hit_end = 0;
+}
6 tally/lines.h
View
@@ -0,0 +1,6 @@
+#ifndef SILLY_LINE_READING
+#define SILLY_LINE_READING
+// Needed for FILE, so this header compiles regardless of include order.
+#include <stdio.h>
+// Buffer holding the line most recently read by read_line.
+extern char *current_line;
+// Reads one line from `file` into current_line; returns 0 when none is left.
+extern int read_line(FILE *file);
+// Frees the buffer behind current_line.
+extern void reset_lines();
+#endif
37 tally/tally.c
View
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "lines.h"
+#include "hash_counter.h"
+
+// Counts how often each distinct line occurs in the file named by the single
+// argument, or on stdin when there is no argument, and prints "count line"
+// for every distinct line in hash-table slot order.
+// Exits with status 1 on a usage error or when the file cannot be opened.
+int main(int argc, char **args){
+  FILE *source;
+
+  if(argc > 2){
+    fprintf(stderr, "Usage: Either invoke argumentless to read from stdin or with a single argument to read from a file\n");
+    exit(1);
+  } else if (argc == 2){
+    source = fopen(args[1], "r");
+    if(!source){
+      // Report the failure instead of handing a NULL stream to read_line.
+      perror(args[1]);
+      exit(1);
+    }
+  } else {
+    source = stdin;
+  }
+
+  hash_counter hc = hash_counter_create();
+
+  while(read_line(source)){
+    hash_counter_increment(hc, current_line);
+  }
+
+  int i;
+  for(i = 0; i < (hc->num_records); i++){
+    hash_record *record = hc->records + i;
+    // Slots without a key are empty and are skipped.
+    if(record->key)
+      printf("%i %s\n", record->value, record->key);
+  }
+
+  // Clean up
+  hash_counter_destroy(hc);
+  reset_lines();
+  fclose(source);
+
+  return 0;
+}
Please sign in to comment.
Something went wrong with that request. Please try again.