Permalink
Browse files

more work on category normalization, 'works' for a subset of pages

  • Loading branch information...
1 parent 9d7be3b commit edb319513dd346ebeef98f1bd064ee7af29baa4b @davidvetrano davidvetrano committed May 15, 2012
View
27 categories/build-category-hierarchy.py → categories/category_mapper.py
@@ -17,24 +17,45 @@
if relation == 'http://www.w3.org/2004/02/skos/core#broader' and firstCat != secondCat:
categoryMap[firstCat] = secondCat
- #print firstCat + ' < ' + secondCat
cat_file.close()
# Root categories at which the upward walk in categoryToTopCategory stops.
# Every entry must use the 'Category:' prefix (colon) so that it can match
# the category names stored in categoryMap.
rootCats = set([
    'Category:Agriculture',
    'Category:Arts',
    'Category:Belief',
    'Category:Business',
    'Category:Chronology',
    'Category:Culture',
    'Category:Education',
    'Category:Environment',
    'Category:Geography',
    'Category:Health',
    'Category:Humanities',
    'Category:Language',
    'Category:Law',
    'Category:Life',
    'Category:Mathematics',
    'Category:Nature',
    'Category:Politics',
    'Category:Science',
    'Category:Society',
    'Category:Technology',
])
+
def categoryToTopCategory(category):
visitedSet = set()
curr = category
prev = None
while curr in categoryMap:
- if curr == 'Category:Main_topic_classifications':
+ if curr in rootCats:
return prev
if curr in visitedSet:
return None
visitedSet.add(curr)
-
prev = curr
curr = categoryMap[curr]
View
22 categories/category_reducer.py
@@ -0,0 +1,22 @@
#!/usr/bin/python
"""Reducer step: merge sorted "article_id<TAB>category" lines.

Reads tab-separated lines from stdin and writes one line per article to
stdout in the form "article_id<TAB>cat1<TAB>cat2...".  Lines belonging to
the same article must be adjacent (the input is expected to be sorted).
"""
import sys
from itertools import groupby


def parse_rows(lines):
    """Yield (article_id, category) pairs from tab-separated lines.

    Lines with fewer than two tab-separated fields are skipped instead of
    raising IndexError.
    """
    for raw in lines:
        fields = raw.rstrip('\n').split('\t')
        if len(fields) >= 2:
            yield fields[0], fields[1]


def reduce_categories(lines):
    """Yield one "article_id<TAB>categories..." record per run of equal ids.

    Each record carries the id of the article its categories belong to, and
    the final article in the input is emitted as well.  Categories are
    de-duplicated and sorted so the output is deterministic.
    """
    for article_id, rows in groupby(parse_rows(lines), key=lambda row: row[0]):
        cats = sorted(set(cat for _, cat in rows))
        yield '\t'.join([article_id] + cats)


def main():
    """Stream stdin through reduce_categories and write records to stdout."""
    for record in reduce_categories(sys.stdin):
        # sys.stdout.write behaves the same under Python 2 and Python 3.
        sys.stdout.write(record + '\n')


if __name__ == '__main__':
    main()
View
5 categories/generate-normalized-page-categories.sh
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-echo "Generate categories for all pages"
-./build-category-hierarchy.py skos_categories_en.nt categories.sql > page_cats.map
-
View
5 categories/generate-page-cats.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# Build page_cats: run the category mapper, sort its output so that all rows
# of one article are adjacent, then merge them with the reducer.

# Abort on the first failing command, including failures inside the pipeline,
# so a broken mapper run does not leave a silently truncated page_cats file.
set -euo pipefail

echo "Generate categories for all pages"
# LC_ALL=C forces plain byte-order sorting; locale-aware collation may ignore
# tabs and punctuation, which could separate rows that share an article id.
./category_mapper.py skos_categories_en.nt categories.sql | LC_ALL=C sort | ./category_reducer.py > page_cats
+

0 comments on commit edb3195

Please sign in to comment.