/
opinionTokenizer.py
72 lines (65 loc) · 2.55 KB
/
opinionTokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from nltk.tokenize import sent_tokenize
import unicodecsv as csv
def tokenize_into_opinion_units(text, separator=' but '):
    """Split *text* into a flat list of opinion units.

    The text is first cut into sentences with NLTK's ``sent_tokenize``;
    every sentence is then split on the literal, case-sensitive
    *separator* (``' but '`` by default).

    Args:
        text: the string to split.
        separator: substring on which each sentence is further divided.

    Returns:
        A list of strings in their original order. The separator itself is
        not part of any returned unit.
    """
    # The loop names avoid shadowing the builtin ``str``.
    return [
        unit
        for sentence in sent_tokenize(text)
        for unit in sentence.split(separator)
    ]
def positive_and_negative_to_full():
    """Merge positive.csv and negative.csv into positiveandnegative.csv.

    Every row read from positive.csv gets the label 'positive' appended as
    its last cell, and every row from negative.csv gets 'negative'.  The
    first row of negative.csv is dropped, and the first two cells of the
    first row of positive.csv are overwritten with 'review_content' and
    'sentiment' so that row acts as the header of the merged file.

    Original author's note: the resulting file has each unit tagged with
    its booking.com sentiment and is the data that was tagged with
    Mechanical Turk.

    Raises:
        IndexError: if either input file contains no rows.
    """
    # unicodecsv decodes bytes itself, so the inputs are opened in binary
    # mode; the ``with`` blocks make sure both handles are closed.
    with open('positive.csv', 'rb') as fpos:
        positive_units = [row + ['positive'] for row in csv.reader(fpos)]
    with open('negative.csv', 'rb') as fneg:
        negative_units = [row + ['negative'] for row in csv.reader(fneg)]

    # Only one header row may survive: drop the one from negative.csv ...
    del negative_units[0]
    # ... and relabel the one from positive.csv.
    header = positive_units[0]
    header[0] = 'review_content'
    header[1] = 'sentiment'

    with open('positiveandnegative.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerows(positive_units + negative_units)
def _write_opinion_units(rows, column, out_path):
    """Tokenize one column of *rows* and write the units to *out_path*.

    Each cell ``row[column]`` is passed to ``tokenize_into_opinion_units``;
    every resulting unit becomes its own single-cell row in the output CSV.
    """
    # writerows expects a sequence of rows, so wrap each unit in a list.
    units = [
        [unit]
        for row in rows
        for unit in tokenize_into_opinion_units(row[column])
    ]
    with open(out_path, 'wb') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerows(units)


# This opens the scraped review data and writes two files from it:
#   positive.csv, containing positive opinion units
#   negative.csv, containing negative opinion units
if __name__ == "__main__":
    # TODO take the file name as argument
    # unicodecsv decodes bytes itself, so the input is opened in binary
    # mode; the rows are read once and the handle is closed right away.
    with open('itemsBooking.csv', 'rb') as f:
        rows = list(csv.reader(f))

    # Column 1 is treated as the positive content, column 4 as the negative
    # content. tokenize_into_opinion_units already splits on ' but ', so no
    # second split is needed here.
    _write_opinion_units(rows, 1, 'positive.csv')
    _write_opinion_units(rows, 4, 'negative.csv')