# This file is part of NewsFinder.
# Copyright (c) 2011 by the Association for the Advancement of
# Artificial Intelligence. This program and parts of it may be used and
# distributed without charge for non-commercial purposes as long as this
# notice is included.
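"""
Filter near-duplicate news stories: every article from a configurable
window of recent days is compared pairwise by tf-idf cosine similarity;
pairs above the threshold are grouped into duplicate sets, and all but
the best-ranked member of each set is marked as not to be published.
"""
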
from datetime import date, timedelta
from AINewsCorpus import AINewsCorpus
from AINewsConfig import config

def add_to_duplicates(duplicates, urlid1, urlid2):
    found = False
    for dupset in duplicates:
        if urlid1 in dupset or urlid2 in dupset:
            dupset.add(urlid1)
            dupset.add(urlid2)
            found = True
    if not found:
        dupset = set()
        dupset.add(urlid1)
        dupset.add(urlid2)
        duplicates.append(dupset)
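
# Example of the merging behavior (hypothetical urlids): starting from
# duplicates = [], add_to_duplicates(duplicates, 1, 2) produces
# [set([1, 2])]; a later add_to_duplicates(duplicates, 2, 3) finds the
# existing set and grows it in place to set([1, 2, 3]).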

class AINewsDuplicates:
    def __init__(self):
        self.corpus = AINewsCorpus()

    def filter_duplicates(self, articles):
        date_start = date.today() - timedelta(days = int(config['duplicates.days_back']))
        date_end = date.today()
        cutoff = float(config['duplicates.threshold'])
        all_articles = self.corpus.get_articles_daterange(date_start, date_end)
        duplicates = []
        similarities = {}
        urlids = sorted(all_articles.keys())
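        # Pairwise pass over every article in the window: O(n^2) in the
        # number of candidate urlids, but a pair is only scored when the
        # second article is among those up for publication this week.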
        for i in range(0, len(urlids) - 1):
            for j in range(i+1, len(urlids)):
                # only compare to articles that might be published this week
                if urlids[j] not in articles: continue
                tfidf1 = all_articles[urlids[i]]['tfidf']
                tfidf2 = all_articles[urlids[j]]['tfidf']
                similarity = self.corpus.cos_sim(tfidf1, tfidf2)
                if similarity >= cutoff:
                    # if article i has not been published, group the pair
                    # into a duplicate set and remember the similarity
                    if not all_articles[urlids[i]]['published']:
                        add_to_duplicates(duplicates, urlids[i], urlids[j])
                        similarities[(urlids[i], urlids[j])] = similarity
                        similarities[(urlids[j], urlids[i])] = similarity
                    else:
                        # if article i has already been published,
                        # then just don't publish article j
                        articles[urlids[j]]['duplicates'] = \
                            [(urlids[i], all_articles[urlids[i]]['title'], similarity)]
                        if articles[urlids[j]]['publish']:
                            articles[urlids[j]]['publish'] = False
                            articles[urlids[j]]['transcript'].append(
                                ("Rejected because duplicate (sim=%.3f, " +
                                 "cutoff=%.3f) of already published article %s") % \
                                (similarity, cutoff, str(urlids[i])))
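
        # cos_sim above is assumed to compute the usual cosine between two
        # sparse tf-idf vectors: the dot product over shared terms divided
        # by the product of the norms. E.g. for {'ai': 0.6, 'news': 0.8}
        # and {'ai': 0.6, 'robot': 0.8} (both unit norm) it is 0.36.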
        for dupset in duplicates:
            for urlid in dupset:
                if urlid in articles:
                    # list every other set member as this article's duplicates
                    dupset2 = dupset.copy()
                    dupset2.remove(urlid)
                    articles[urlid]['duplicates'] = \
                        map(lambda u: (u, articles[u]['title'], similarities[(u, urlid)]),
                            filter(lambda u: u in articles and (u, urlid) in similarities, dupset2))
            # rank the publishable members of the set; the best one is kept
            sorted_dups = sorted(filter(lambda u: u in articles and articles[u]['publish'], dupset),
                                 cmp=lambda x, y: self.corpus.compare_articles(articles[x], articles[y]),
                                 reverse=True)
            if len(sorted_dups) > 1:
                # first in sorted set is chosen; rest are dumped
                articles[sorted_dups[0]]['transcript'].append("Preferred over duplicates")
                for urlid in sorted_dups[1:]:
                    if articles[urlid]['publish']:
                        articles[urlid]['publish'] = False
                        articles[urlid]['transcript'].append(("Rejected because duplicate " +
                            "%s was chosen instead") % sorted_dups[0])