/
fixer.py
102 lines (72 loc) · 2.6 KB
/
fixer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Read in the (new!) broken AES corpus and repair the lines that were mangled
# by stray line breaks. The repaired text is written back out further down.
import re

# Read the whole corpus in one go; the context manager closes the handle as
# soon as the read is done instead of leaving that to garbage collection.
with open('Brown_ToCode4.csv') as corpus_file:
    AEStext = corpus_file.read()

# Problem 1: a quoted field (so Excel shows it fine) that contains a line
# break followed by a tab. Happens on long lines for whatever reason.
# Raw strings hand the escapes to the regex engine; the patterns match the
# same text as before.
matcher = re.compile(r'(?:\n|\r|\r\n?)\t', re.MULTILINE)
# Kept only so the hits can be inspected interactively.
n = matcher.findall(AEStext)
m = matcher.search(AEStext)
# Just get rid of those break-plus-tab sequences.
AEStext = matcher.sub('', AEStext)

# Problem 2: quotation marks continue to cause trouble. Look for the lines
# that start with @, either alone on their line or petering out in a ".
#### Alone on a line.
# Note that [^,]+ also matches line breaks, so one hit runs from the @ to the
# last line break before the next comma.
matcher2 = re.compile(r'(?:\n|\r|\r\n?)@[^,]+(?:\n|\r|\r\n?)', re.MULTILINE)
n = matcher2.findall(AEStext)
# Drop the hit, leaving behind a single line break in its place.
AEStext = matcher2.sub('\n', AEStext)

#### Inside a quoted field: an opening ", some text, a line break, a line
#### starting with @, and everything up to the closing ".
matcher3 = re.compile(r'"[^"]+(?:\n|\r|\r\n?)@[^"]+"', re.MULTILINE)
n = matcher3.findall(AEStext)
# Replacement callback for the quoted "@" breakage: throw the match away
# except for the text that sat on its first line, before the badness.
def myrepl3(matchobj):
    """Return the first line of the matched text, minus leading '"' characters.

    matchobj is the match object handed over by re.sub; only group(0), the
    whole match, is consulted.
    """
    first_line = matchobj.group(0).splitlines()[0]
    # These breaks seem to happen in the middle of lines, so what came before
    # the break is worth keeping. lstrip removes every leading quotation mark.
    return first_line.lstrip('"')
# Collapse each quoted "@" breakage down to the text on its first line.
AEStext = re.sub(matcher3, myrepl3, AEStext)

##########################
# Check that it's all fixed: every line after line 0 should start with a
# number equal to its own index. Line 0 is skipped (presumably the header
# row -- confirm against the data).
AESlines = AEStext.splitlines()
broken = 0  # index of the first line that fails the check (0 = none found)
fixed = 0   # index of the last line that passed the check
for i, line in enumerate(AESlines[1:], 1):
    numfind = re.match('[0-9]+', line)
    # re.match returns None when the line has no leading digits. Such a line
    # is exactly what this check is looking for, so report it as broken
    # rather than crashing on numfind.group().
    stringnum = numfind.group(0) if numfind else None
    if stringnum != str(i):
        broken = i
        break
    fixed = i

##########################
# Woohoo! Write it out. The context manager flushes and closes the file
# before anything later tries to read it back.
with open('Brownfixed.csv', 'w') as fixed_file:
    fixed_file.write(AEStext)
# Open a fixed file again and check for misaligned columns: column 12 of
# every data row must hold 'TRUE' or 'FALSE'.
import csv

broken = 0       # first column of the most recent misaligned row
fixed = 0        # first column of the most recent well-formed row
brokenlist = []  # first column of every misaligned row, in file order

# NOTE(review): this reads 'Brownfixed82.csv', not the 'Brownfixed.csv' name
# used when writing the repaired text -- confirm which file is meant.
# 'rb' is kept from the original; under Python 3 csv.reader needs a text-mode
# file opened with newline='' instead.
with open('Brownfixed82.csv', 'rb') as fixed_file:
    csvcheck = csv.reader(fixed_file, delimiter=',', quotechar='"')
    for row in csvcheck:
        # Header row: note that it was seen and move on.
        if row and row[0] == 'Line.Number':
            print('got here')
            continue
        # The length guard matters: a row with too few columns is precisely
        # the kind of misalignment being hunted, so it must be recorded below
        # instead of raising IndexError on row[12].
        if len(row) > 12 and row[12] in ('TRUE', 'FALSE'):
            fixed = row[0]
            continue
        # Blank rows have no first column; label them by reader line number.
        if row:
            broken = row[0]
        else:
            broken = '<blank row at line %d>' % csvcheck.line_num
        print(broken)
        brokenlist.append(broken)