Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

more work on category normalization, 'works' for a subset of pages

  • Loading branch information...
commit edb319513dd346ebeef98f1bd064ee7af29baa4b 1 parent 9d7be3b
David Vetrano davidvetrano authored
27 categories/build-category-hierarchy.py → categories/category_mapper.py
View
@@ -17,24 +17,45 @@
if relation == 'http://www.w3.org/2004/02/skos/core#broader' and firstCat != secondCat:
categoryMap[firstCat] = secondCat
- #print firstCat + ' < ' + secondCat
cat_file.close()
# Top-level topic categories. categoryToTopCategory() stops its upward walk
# through categoryMap as soon as the current node is one of these, so every
# entry must match the 'Category:<Name>' spelling used for categoryMap keys.
rootCats = set([
    'Category:Agriculture',
    'Category:Arts',
    'Category:Belief',
    'Category:Business',
    'Category:Chronology',
    'Category:Culture',
    'Category:Education',
    'Category:Environment',
    'Category:Geography',
    'Category:Health',
    'Category:Humanities',
    'Category:Language',
    'Category:Law',
    'Category:Life',
    'Category:Mathematics',
    'Category:Nature',
    'Category:Politics',
    'Category:Science',
    'Category:Society',
    # Was misspelled 'Category;Technology' (semicolon), which could never
    # match a real category name.
    'Category:Technology',
])
+
def categoryToTopCategory(category):
visitedSet = set()
curr = category
prev = None
while curr in categoryMap:
- if curr == 'Category:Main_topic_classifications':
+ if curr in rootCats:
return prev
if curr in visitedSet:
return None
visitedSet.add(curr)
-
prev = curr
curr = categoryMap[curr]
22 categories/category_reducer.py
View
@@ -0,0 +1,22 @@
#!/usr/bin/python
"""Streaming reducer for ``article_id<TAB>category`` lines.

Reads lines from stdin in which all lines for the same article id are
adjacent (e.g. the output of ``sort``), and writes one line per article to
stdout: the article id followed by its distinct categories, tab-separated.
"""
import sys


def reduce_lines(lines):
    """Collapse adjacent lines that share an article id.

    Args:
        lines: iterable of strings shaped ``article_id<TAB>category``; a
            trailing newline is stripped. Only the first two tab-separated
            fields are used. Lines with fewer than two fields are skipped.

    Yields:
        One string per run of equal article ids: the id followed by that
        run's distinct categories in sorted order, joined by tabs (no
        trailing newline).
    """
    current_id = None
    current_cats = set()
    for raw in lines:
        fields = raw.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Blank or malformed line: nothing to attribute to an article.
            continue
        article_id, category = fields[0], fields[1]
        if article_id != current_id:
            if current_id is not None:
                # Emit the group that just ended under ITS OWN id, not the
                # id of the line that triggered the flush.
                yield '\t'.join([current_id] + sorted(current_cats))
            current_id = article_id
            current_cats = set()
        current_cats.add(category)
    # The final group has no following line to trigger a flush.
    if current_id is not None:
        yield '\t'.join([current_id] + sorted(current_cats))


def main():
    """Reduce stdin to stdout, one output line per article."""
    for record in reduce_lines(sys.stdin):
        sys.stdout.write(record + '\n')


if __name__ == '__main__':
    main()
5 categories/generate-normalized-page-categories.sh
View
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-echo "Generate categories for all pages"
-./build-category-hierarchy.py skos_categories_en.nt categories.sql > page_cats.map
-
5 categories/generate-page-cats.sh
View
@@ -0,0 +1,5 @@
#!/bin/bash
# Build page_cats: map every page to its categories, then collapse the
# mapper's per-(page, category) lines into one line per page.

# Abort on any failure, including a failure inside the pipeline, so a broken
# mapper does not leave behind a truncated page_cats that looks valid.
set -euo pipefail

echo "Generate categories for all pages"
# category_reducer.py only compares each line's id with the previous line's,
# so lines for the same id must be adjacent. Plain byte-order sorting
# (LC_ALL=C) guarantees that; locale-aware collation may weight the tab
# separator differently and is not guaranteed to keep a key's lines together.
./category_mapper.py skos_categories_en.nt categories.sql \
    | LC_ALL=C sort \
    | ./category_reducer.py > page_cats
+
Please sign in to comment.
Something went wrong with that request. Please try again.