Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

helps if I add the replacement

  • Loading branch information...
commit 051dc6e00a680ecda837663b61fdf8fa906b2d9a 1 parent 41cfa77
David R. MacIver authored
Showing with 112 additions and 0 deletions.
  1. +112 −0 pearsons.c
View
112 pearsons.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "lines.h"
+#include "hash_counter.h"
+
+/*
+ * Pearson correlation between two 0/1 indicators, computed from counts.
+ *
+ *   num_records        - total number of records observed
+ *   cooccurrence_count - records in which both x and y were counted
+ *   count_x, count_y   - records in which x (resp. y) was counted
+ *
+ * Each count is turned into a frequency by dividing by num_records. The
+ * covariance term is p_xy - p_x * p_y and each variance term is p - p * p
+ * (the variance of a 0/1 indicator with mean p).
+ *
+ * Returns covariance / sqrt(var_x * var_y). There is no guard for a zero
+ * denominator: if either variance term is zero, or num_records is zero, the
+ * result is not a finite correlation value.
+ */
+double pearsons(double num_records, double cooccurrence_count, double count_x, double count_y){
+    const double p_x = count_x / num_records;
+    const double p_y = count_y / num_records;
+
+    const double var_x = p_x - p_x * p_x;
+    const double var_y = p_y - p_y * p_y;
+
+    const double covariance = cooccurrence_count / num_records - p_x * p_y;
+    const double scale = sqrt(var_x * var_y);
+
+    return covariance / scale;
+}
+
+/* realloc() wrapper: prints a message and exits instead of returning NULL. */
+static void *resize_or_die(void *block, size_t size){
+    void *resized = realloc(block, size);
+    if(resized == NULL){
+        fprintf(stderr, "pearsons: out of memory\n");
+        exit(1);
+    }
+    return resized;
+}
+
+/*
+ * Reads records line by line from the file named by the single optional
+ * argument (or from stdin when no argument is given), counts every token and
+ * every ordered token pair per line, and finally prints one
+ * "key1 key2 correlation" line per stored pair using pearsons().
+ *
+ * read_line(), current_line and reset_lines() come from lines.h; the
+ * hash_counter API comes from hash_counter.h. Presumably read_line() refills
+ * current_line and returns non-zero while input remains, and
+ * hash_counter_increment() copies the key it is given (the key buffers here
+ * are reused on every iteration) - confirm against those headers.
+ *
+ * Exits with status 1 on a usage error, an unopenable input file, or an
+ * allocation failure; returns 0 otherwise.
+ */
+int main(int argc, char **args){
+    FILE *source;
+
+    if(argc > 2){
+        fprintf(stderr, "Usage: Either invoke argumentless to read from stdin or with a single argument to read from a file\n");
+        exit(1);
+    } else if (argc == 2){
+        source = fopen(args[1], "r");
+        if(source == NULL){
+            /* Fail loudly rather than handing a NULL stream to read_line(). */
+            perror(args[1]);
+            exit(1);
+        }
+    } else {
+        source = stdin;
+    }
+
+    hash_counter cooccurrence_counter = hash_counter_create();
+    hash_counter token_counter = hash_counter_create();
+
+    /* Growable array of pointers to the tokens of the current line. */
+    size_t max_token_count = 64;
+    char **tokens = resize_or_die(NULL, max_token_count * sizeof *tokens);
+
+    /* Scratch buffer in which the "token_i token_j" pair keys are built. */
+    size_t token_buffer_size = 0;
+    char *token_buffer = NULL;
+
+    int num_records = 0;
+
+    while(read_line(source)){
+        num_records++;
+
+        /*
+         * Split current_line in place: every space becomes a terminator and
+         * the first non-space character after a space starts a token.
+         * in_token starts at 1, so the text before the first space is never
+         * recorded as a token.
+         * NOTE(review): looks like the first field is deliberately skipped
+         * (e.g. a record id) - confirm against the expected input format.
+         */
+        size_t token_count = 0;
+        int in_token = 1;
+        for(char *c = current_line; *c != '\0'; c++){
+            if(*c == ' '){
+                *c = '\0';
+                in_token = 0;
+            } else if(!in_token){
+                in_token = 1;
+                if(token_count == max_token_count){
+                    max_token_count *= 2;
+                    tokens = resize_or_die(tokens, max_token_count * sizeof *tokens);
+                }
+                tokens[token_count++] = c;
+            }
+        }
+
+        /*
+         * A pair key is "<token> <token>" plus its terminator, so it can need
+         * up to 2 * longest + 2 bytes. Size the scratch buffer from the
+         * actual tokens so that even a pair of the longest token fits.
+         */
+        size_t longest_token = 0;
+        for(size_t i = 0; i < token_count; i++){
+            size_t len = strlen(tokens[i]);
+            if(len > longest_token){
+                longest_token = len;
+            }
+        }
+        size_t needed = 2 * longest_token + 2;
+        if(token_count > 0 && needed > token_buffer_size){
+            token_buffer = resize_or_die(token_buffer, needed);
+            token_buffer_size = needed;
+        }
+
+        /*
+         * Count each token, and each ordered pair (i, j), including i == j.
+         * NOTE(review): a token repeated on one line is counted once per
+         * occurrence, so its count can exceed num_records - confirm that
+         * input lines never repeat a token.
+         */
+        for(size_t i = 0; i < token_count; i++){
+            hash_counter_increment(token_counter, tokens[i]);
+
+            size_t len = strlen(tokens[i]);
+            memcpy(token_buffer, tokens[i], len);
+            token_buffer[len] = ' ';
+            char *next_token = token_buffer + len + 1;
+
+            for(size_t j = 0; j < token_count; j++){
+                strcpy(next_token, tokens[j]);
+                hash_counter_increment(cooccurrence_counter, token_buffer);
+            }
+        }
+    }
+
+    /* Walk every slot of the pair counter; slots with a null key are skipped. */
+    int i;
+    for(i = 0; i < (cooccurrence_counter->num_records); i++){
+        hash_record record = (cooccurrence_counter->records)[i];
+        if(record.key){
+            int cc = record.value;
+
+            /*
+             * Split the stored "key1 key2" string in place at its first
+             * space. Tokens never contain a space, so that space is the
+             * separator written when the key was built.
+             */
+            char *key1 = record.key;
+            char *key2 = key1;
+
+            while(*key2 != ' ') key2++;
+
+            *key2 = '\0';
+            key2++;
+
+            int key1_count = hash_counter_get_value(token_counter, key1);
+            int key2_count = hash_counter_get_value(token_counter, key2);
+
+            double p = pearsons(num_records, cc, key1_count, key2_count);
+
+            printf("%s %s %f\n", key1, key2, p);
+        }
+    }
+
+    // Clean up
+    hash_counter_destroy(cooccurrence_counter);
+    hash_counter_destroy(token_counter);
+    free(token_buffer);
+    free(tokens);
+    reset_lines();
+    fclose(source);
+
+    return 0;
+}
Please sign in to comment.
Something went wrong with that request. Please try again.