Skip to content
Permalink
Branch: master
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
118 lines (97 sloc) 3.27 KB
#-*- coding: utf-8 -*-
# Python 2.7
from lxml import etree
from sets import Set
import sys, time, re, os, frobenoid, Levenshtein
def wikipedia_main_namespace(title):
if title.startswith ("Wikipedia:"):
return False
if title.startswith ("Wikipedia talk:"):
return False
if title.startswith ("User:"):
return False
if title.startswith ("User talk:"):
return False
if title.startswith ("Category:"):
return False
if title.startswith ("Category talk:"):
return False
if title.startswith ("Talk:"):
return False
if title.startswith ("File:"):
return False
if title.startswith ("File talk:"):
return False
if title.startswith ("Template:"):
return False
if title.startswith ("Template talk:"):
return False
if title.startswith ("Portal:"):
return False
if title.startswith ("Special:"):
return False
if title.startswith (":"):
return False
return True
def process_file (fname, words_stat):
current_tags=[]
tmp="{http://www.mediawiki.org/xml/export-0.10/}"
namespaces = []
cur_title=""
context = etree.iterparse(fname, events=("start", "end", "start-ns", "end-ns"))
for event, elem in context:
if event=="start-ns":
namespaces.insert(0, elem)
if event=="end-ns":
namespaces.pop(0)
if event=="start":
current_tags.append (elem.tag)
if event=="end":
if current_tags[-1]!=elem.tag:
raise AssertionError
current_tags.pop()
if elem.tag==tmp+"title":
cur_title=elem.text
if elem.tag==tmp+"text":
if wikipedia_main_namespace(cur_title) and elem.text!=None: # FIXME
# this is text in main namespace
for x in re.split('\s+', elem.text):
l=unicode(x.lower())
if len(l)>5 and frobenoid.str_is_latin(l):
frobenoid.inc_value_in_dict(words_stat, l)
if elem.tag==tmp+"page":
pass
elem.clear()
words_stat={}
i=1
args=len(sys.argv)
while i<args:
fname=sys.argv[i]
sys.stderr.write ("parsing "+fname+"...\n")
process_file(fname, words_stat)
sys.stderr.write (fname+" parsed\n")
i=i+1
if i==1:
print "no files at input"
print "usage: enwiki* > typos"
exit(0)
words_stat_len=len(words_stat)
dictionary_word_threshold=words_stat_len/500
typo_threshold=dictionary_word_threshold/100 # 1%
print "words_stat_len=", words_stat_len
print "dictionary_word_threshold=", dictionary_word_threshold
print "typo_threshold=", typo_threshold
probably_correct_words=filter(lambda x: words_stat[x]>dictionary_word_threshold, words_stat)
print "len(probably_correct_words)=", len(probably_correct_words)
words_to_check_unsorted=filter(lambda x: words_stat[x]<typo_threshold, words_stat)
words_to_check=sorted(words_to_check_unsorted, key=lambda x: words_stat[x], reverse=False)
print "len(words_to_check)=", len(words_to_check)
for w in words_to_check:
suggestions=[]
for wd in probably_correct_words:
dist=Levenshtein.distance(w, wd)
if dist==1:
#if 1 <= dist <= 2:
suggestions.append(wd)
if len(suggestions)>0:
print "typo?",w,"("+str(words_stat[w])+") suggestions=", suggestions
You can’t perform that action at this time.