# Deduplication.py
# Merges several scraped one-liner corpora and removes near-duplicate
# sentences using a Jaccard-similarity threshold over stopword-filtered
# bag-of-words token sets. (Python 2 script: uses cPickle and print statements.)
# Imports
import cPickle as pickle
import write_functions as w  # local helper module for writing results
import nltk
import nltk.corpus
import string
from collections import OrderedDict
import sys
# Force utf-8 as the default encoding, a common Python 2 workaround for
# scraped text that contains non-ASCII characters
reload(sys)
sys.setdefaultencoding('utf8')
# Import the sentences that need processing from the scraped pickle files.
# Each pickle is assumed to hold a flat list of sentences (one joke per entry).
unprocessed_sentences = []
pickle_files = ["onelinefun.pickle",
                "goodriddlesnow.pickle",
                "funnyshortjokes.pickle",
                "laughfactory.pickle",
                "humorous_oneliners_undoubled.pickle"]
for filename in pickle_files:
    with open(filename, 'rb') as f:  # open in binary mode for pickle
        sentences = pickle.load(f)
    print "File successfully imported. The file contains %d sentences." % len(sentences)
    unprocessed_sentences.extend(sentences)
# One source is a plain text file with one one-liner per line
with open('Danoah_oneliners_60.txt', 'r') as file_object:
    textfile_oneliners = file_object.readlines()
unprocessed_sentences += textfile_oneliners
print "%d lines were imported and are ready for deduplication!" % len(unprocessed_sentences)
# Get the default English stopwords and extend them with punctuation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)  # adds each punctuation character as a separate entry
stopwords.append('')  # tokens that reduce to an empty string are also discarded
def is_ci_token_stopword_set_match(tokenset_a, tokenset_b, threshold=0.9):
    """Check if a and b are matches, using the Jaccard coefficient."""
    set_a = set(tokenset_a)
    set_b = set(tokenset_b)
    union_size = len(set_a.union(set_b))
    if union_size == 0:
        # Two empty token sets carry no signal; treat them as non-matching
        # (this replaces the original epsilon-in-the-denominator hack).
        return False
    ratio = len(set_a.intersection(set_b)) / float(union_size)
    return ratio >= threshold
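# Worked example (illustrative values, not part of the pipeline): for
# tokenset_a = ['cat', 'sat', 'mat'] and tokenset_b = ['cat', 'sat', 'hat'],
# the intersection {'cat', 'sat'} has 2 items and the union has 4, so the
# Jaccard coefficient is 2/4 = 0.5; that is below the 0.9 threshold, and
# the two sentences are kept as distinct.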
def generate_bow(sentence):
    """Return the lowercased, punctuation-stripped, non-stopword tokens of a sentence."""
    tokenset = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(sentence)
                if token.lower().strip(string.punctuation) not in stopwords]
    return tokenset
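# Example (illustrative, assuming NLTK's default word tokenizer):
# generate_bow("The cat sat on the mat!") yields ['cat', 'sat', 'mat'] --
# 'the' and 'on' are stopwords, and '!' strips down to the empty string,
# which is filtered out as well.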
# Declare variables
remove_items = []
keep_sentences = []
tokensets = []
# Generate a bag-of-words representation for every sentence
for i in unprocessed_sentences:
    tokensets.append(generate_bow(i))
print "Finished creating bow representations."
# Compare each sentence s for similarity with the sentences at index > s;
# every later near-duplicate is flagged for removal.
for i in range(len(unprocessed_sentences)):
    if i not in remove_items:
        for item in range(i + 1, len(unprocessed_sentences)):
            if is_ci_token_stopword_set_match(tokensets[i], tokensets[item]):
                remove_items.append(item)
    if i % 50 == 0:  # Used to track progress of the deduplication
        print "Processed %d sentences" % i
# Drop duplicate indices while preserving their order
remove_items = list(OrderedDict.fromkeys(remove_items))
print "Removed items:"
print remove_items
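# Illustrative trace (hypothetical data): if the corpus held
# ["Time flies like an arrow.", "Time flies like an arrow!", "I like cheese."],
# the first two sentences produce identical token sets (punctuation is
# stripped), giving a Jaccard coefficient of 1.0 >= 0.9; index 1 is added
# to remove_items on the i = 0 pass, and only indices 0 and 2 survive.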
# Keep every sentence whose index was not flagged for removal
for item in range(len(unprocessed_sentences)):
    if item not in remove_items:
        keep_sentences.append(unprocessed_sentences[item])
print "Number of kept sentences: %d" % len(keep_sentences)
# Write the deduplicated data away via the local write_functions helper
w.write_to_pickle("humorous_jokes", keep_sentences)
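# write_functions is not included in this file. A minimal sketch of a
# compatible helper, assuming write_to_pickle(name, data) simply pickles
# the list under a filename derived from its first argument (hypothetical
# implementation, shown for reference only):
#
#   import cPickle as pickle
#
#   def write_to_pickle(name, data):
#       with open(name + ".pickle", "wb") as f:
#           pickle.dump(data, f)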