-
Notifications
You must be signed in to change notification settings - Fork 0
/
interviewParseProto.py
80 lines (62 loc) · 2.6 KB
/
interviewParseProto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from __future__ import unicode_literals

import io
import re

# Set up spaCy
import spacy
from spacy.en import English
# Instantiate spaCy's English class once at module load. In the visible code
# it is only referenced by the commented-out token dump at the end of the file.
parser = English()
# Test Data
# Sample multi-sentence text. Adjacent string literals are concatenated with
# nothing in between, so every fragment that ends a sentence carries its own
# trailing space; without it the sentences fuse ("flying.The knack ...").
multiSentence = (
    "There is an art, it says, or rather, a knack to flying. "
    "The knack lies in learning how to throw yourself at the ground and miss. "
    "In the beginning the Universe was created. This has made a lot of people "
    "very angry and been widely regarded as a bad move."
)
fname = "Aidan Blant, Midway City, CA 080417.001.txt"
# Decode at the I/O boundary: io.open returns unicode text on both Python 2
# and Python 3, whereas the builtin open() yields byte strings on Python 2.
# NOTE(review): assumes the transcript is UTF-8 encoded - TODO confirm.
with io.open(fname, encoding="utf-8") as f:
    # One entry per line of the file, with surrounding whitespace removed.
    content = [line.strip() for line in f]
# print("len(content): ",len(content));
## PROCESS HEADER FOR NAME, LOCATION, INTERVIEWID
# First line should be the name, city, and interview id, comma separated
# (presumably "First Last, City, ST id" like the file name - confirm).
# Strip every field so the space that follows each comma does not leak into
# the values; otherwise city would come out with a leading space.
header = [field.strip() for field in content[0].split(",")]
# NOTE: `fname` is rebound here from the file name to the subject's first
# name; the speaker matching further down relies on that.
# Split on the first run of whitespace only, so a middle name ends up in
# `lname` instead of breaking the two-way unpack.
fname, lname = header[0].split(None, 1)
city = header[1]
# Third field holds the state abbreviation followed by the interview id.
state, idnum = header[2].split()
print("First: {} | Last: {} | City: {} | State {} | id#: {}".format(fname, lname, city, state, idnum))
## SEPARATE DIALOGUE SPOKEN BY INTERVIEWER AND INTERVIEW SUBJECT
# Paula's lines are tagged with her full name or her initial; the subject's
# lines are tagged with the first name (or its initial) taken from the header.
# Unfortunately some lines carry NO tag at all; those are kept separately.
# TODO: Find method that covers majority of cases with high accuracy
paulaLine = []
intervieweeLine = []
unknownLine = []
interviewerTags = ("P:", "Paula:")
subjectTags = (fname[0] + ":", fname + ":")
# Skip the header (first entry) and classify every remaining line by the
# text that precedes its first space.
for line in content[1:]:
    speakerTag = line.partition(' ')[0]
    if not speakerTag:
        # Blank line: nothing to classify.
        continue
    # The interviewer check runs first, so a subject whose initial is "P"
    # would have "P:" lines attributed to Paula.
    if speakerTag in interviewerTags:
        paulaLine.append(line)
    elif speakerTag in subjectTags:
        intervieweeLine.append(line)
    else:
        unknownLine.append(line)
#Having issues with unicode to string conversion
# for j in range (len(intervieweeLine)):
# map(unicode,intervieweeLine[j])
# print (j," ",intervieweeLine[j])
# parsedData = parser(intervieweeLine[j])
# for i, token in enumerate(parsedData):
# print("original:", token.orth, token.orth_)
# print("lowercased:", token.lower, token.lower_)
# print("lemma:", token.lemma, token.lemma_)
# print("shape:", token.shape, token.shape_)
# print("prefix:", token.prefix, token.prefix_)
# print("suffix:", token.suffix, token.suffix_)
# print("log probability:", token.prob)
# print("Brown cluster id:", token.cluster)
# print("----------------------------------------")
# if i > 1:
# break