Permalink
Fetching contributors…
Cannot retrieve contributors at this time
197 lines (174 sloc) 6.27 KB
# This file is part of NewsFinder.
# https://github.com/joshuaeckroth/AINews
#
# Copyright (c) 2011 by the Association for the Advancement of
# Artificial Intelligence. This program and parts of it may be used and
# distributed without charge for non-commercial purposes as long as this
# notice is included.
"""
AINewsTools provides a set of helper functions to manage save/load files
and html/text processing.
"""
import os
import pickle
import re
import string
import sys

try:
    import ConfigParser  # Python 2 module name
except ImportError:
    import configparser as ConfigParser  # Python 3 renamed the module

from unidecode import unidecode
def convert_to_printable(text):
    """
    Helper function that passes the text through C{unidecode} and
    returns whatever C{unidecode} produces for it.
    @param text: the text to be converted
    @type text: C{string}
    """
    printable = unidecode(text)
    return printable
def savefile(filename, content):
    """
    Helper function to save content into file. The file is opened in
    "w" mode, so an existing file is overwritten. A failure to open the
    file is reported on stderr and nothing is written; an error raised
    while writing propagates to the caller.
    @param filename: target file's name
    @type filename: C{string}
    @param content: the content to be saved into the file
    @type content: C{string}
    """
    try:
        f = open(filename, "w")
    except IOError as e:
        # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
        sys.stderr.write("Fail to save file %s: %s\n" % (filename, e))
        return
    # The with-block closes the handle even when write() raises.
    with f:
        f.write(content)
def loadfile(filename):
    """
    Helper function to load content given the filename. Returns the
    file's lines as a list (line endings are kept, as with
    C{readlines}). If the file cannot be opened, the error is reported
    on stderr and an empty list is returned.
    @param filename: target file's name
    @type filename: C{string}
    """
    try:
        f = open(filename, "r")
    except IOError as e:
        # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
        sys.stderr.write("Fail to load file %s: %s\n" % (filename, e))
        return []
    # The with-block closes the handle even when readlines() raises.
    with f:
        return f.readlines()
def loadfile2(filename):
    """
    Helper function to load content given the filename. Returns the
    whole file as a single string. If the file cannot be opened, the
    error is reported on stderr and an empty string is returned.
    @param filename: target file's name
    @type filename: C{string}
    """
    try:
        f = open(filename, "r")
    except IOError as e:
        # "except ... as" is accepted by Python 2.6+ and Python 3 alike.
        sys.stderr.write("Fail to load file %s: %s\n" % (filename, e))
        return ""
    # The with-block closes the handle even when read() raises.
    with f:
        return f.read()
def unescape(url):
    r"""
    Strip the escaping backslashes from a url. The url retrieved from
    the MySQL database carries an extra backslash ('\') in front of
    punctuation; every backslash followed by a character (other than a
    newline) is replaced by that character alone.
    @param url: the url string to be unescaped
    @type url: C{string}
    """
    escaped_char = re.compile(r'\\(.)')
    return escaped_char.sub(r'\1', url)
def loadconfig(filename, config=None):
    """
    Helper function to read in the content from config.ini and
    return a dictionary with keys of the form
    <section>.<option> (both lowercased) and the stripped option values.
    The dictionary passed in as C{config} is copied, not modified;
    entries read from the file are added to the copy and override any
    existing entry with the same key.
    @param filename: target file's name
    @type filename: C{string}
    @param config: extra configuration content (optional)
    @type config: C{dict}
    """
    # None instead of a shared {} default; always work on a private copy.
    config = {} if config is None else config.copy()
    cp = ConfigParser.ConfigParser()
    cp.read(filename)
    for sec in cp.sections():
        # str methods replace the deprecated string.lower/string.strip
        # functions, which no longer exist in Python 3.
        name = sec.lower()
        for opt in cp.options(sec):
            config[name + "." + opt.lower()] = cp.get(sec, opt).strip()
    return config
def getwords(raw):
    """
    Helper function to extract bag of words from the raw text. The
    text is split on runs of non-word characters and every resulting
    word is lowercased; empty input yields an empty list.
    @param raw: target raw text
    @type raw: C{string}
    """
    if not raw:
        return []
    # Split on \W+ rather than \W*: a pattern that can match the empty
    # string makes re.split cut between every character on Python 3.7+.
    return [s.lower() for s in re.split(r'\W+', raw) if s != '']
"""Truncation beautifier function
This simple function attempts to intelligently truncate a given string.
__author__ = 'Kelvin Wong <www.kelvinwong.ca>'
__date__ = '2007-06-22'
__version__ = '0.10'
__license__ = 'Python http://www.python.org/psf/license/'
"""
def trunc(s,min_pos=0,max_pos=75,ellipsis=True):
    """Shorten a string that is longer than max_pos (default 75 characters)

    Example, using a line from Orwell's 1984 and a limit of 70:

    0---------1---------2---------3---------4---------5---------6---------7---->
    When we are omnipotent we shall have no more need of science. There will be

    A hard cut would give
    When we are omnipotent we shall have no more need of science. There wi...
    cutting at the last space would give
    When we are omnipotent we shall have no more need of science. There...
    and cutting at the last period gives the preferred result
    When we are omnipotent we shall have no more need of science...

    The cut position is chosen in this order of priority:

    1. If the string is not longer than max_pos, the whole string is kept
    2. Otherwise cut at the right-most period found at an index in
       [min_pos, max_pos)
    3. Otherwise cut at the right-most space found in that same range
    4. Otherwise cut hard at max_pos

    The character at the cut position itself is dropped. In every case,
    including case 1, the ellipsis is appended unless ellipsis is False.

    Parameters:
    s = string to be truncated
    min_pos = lowest index at which a period or space may be used as
    the cut position (default 0)
    max_pos = upper bound for the cut position (default 75)
    ellipsis = append '...' to the returned string when true
    (default is True)

    Returns:
    Truncated String

    Throws:
    ValueError exception if min_pos > max_pos, indicating improper
    configuration

    Usage:
    short_string = trunc(some_long_string)
    or
    shorter_string = trunc(some_long_string,max_pos=15,ellipsis=False)
    """
    # Reject an inverted search window up front
    if min_pos > max_pos:
        raise ValueError('Minimum position cannot be greater than maximum position')

    # Change the ellipsis characters here if you want a true ellipsis
    tail = '...' if ellipsis else ''

    # Case 1: nothing to cut
    if len(s) <= max_pos:
        return s + tail

    # Case 2: right-most period inside the window (rfind gives -1 on a miss)
    cut = s.rfind('.', min_pos, max_pos)
    if cut == -1:
        # Case 3: right-most space inside the window
        cut = s.rfind(' ', min_pos, max_pos)
    if cut == -1:
        # Case 4: hard truncation
        cut = max_pos
    return s[:cut] + tail