/
CLANtoR.R
174 lines (146 loc) · 5.07 KB
/
CLANtoR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
###############################################################
## Melissa's R converter for CHILDES-CLAN files
##
## This is an R function that takes a single CLAN-formatted
## corpus file (*.cha) and returns a dataframe. Takes <30s
## per file, around 10 min for the Eve(Brown) Corpus.
## Could be faster :)
##
## The dataframe has a row for each sentence in the corpus, with the
## following columns:
##
## From the file header & child participant annotation:
## FileName
## Participants (A string with the full list)
## Language
## Corpus
## Age (child's)
## Gender (ditto)
## Date
## File.Situation
##
## From the line itself
## Speaker
## Verbatim sentence
## Tiers: any that we find!
##
## Also includes a gloss calculated from the utterance line, which
## gets rid of clarification sequences ("a bobby [= a thin bobbypin]" -> "a bobby")
## replaces 1-word glosses ("dunno [: don't know]" -> "don't know"), and
## cleans up various CHILDES-internal markup. Ideally, this yields a gloss
## with the same number of words as the mor line.
##
## This gloss is designed for presenting sentences to adult readers, though
## the form given may still want some processing (deleting sentence-final
## space, replacing internal "." with ",", continuation "+...") - these are left
## as-is here for ease of alignment against the mor and other tiers.
## Watch out for embedded commas, double quotes, and single quotes
## when converting to/from CSV
###############################################################
library(plyr)
read.CLAN.file <- function(f) {
  # Read one CLAN-formatted (*.cha) transcript and return a data frame with
  # one row per utterance. Columns: file-level metadata (Filename,
  # Participants, Date, File.Situation, Language, Corpus, Age, Gender),
  # per-utterance fields from get_utt_info (Speaker, Verbatim, Gloss, any
  # %-tiers found), plus Line.No and Utt.Number.
  tmp <- readLines(f)
  print(f)  # progress indicator when processing a whole corpus
  # Utterances all start with "*", so splitting the whole text on "*" gives
  # one chunk per utterance (plus a header chunk, dropped below).
  alltext <- paste(tmp, collapse = "\n")
  utts <- unlist(strsplit(alltext, "[*]"))
  utts <- utts[-1]
  # lapply, not sapply: sapply can simplify a list of one-row data frames
  # into a matrix of lists when column counts coincide, breaking rbind.fill.
  tierlist <- lapply(utts, get_utt_info)
  data <- rbind.fill(tierlist)
  # --- File-level metadata, repeated on every row ---
  data$Filename <- f
  # Extract the value (2nd tab-separated token) of the first header line
  # matching `pattern`; NA when the header is absent.
  header_field <- function(pattern) {
    p <- grep(pattern, tmp, fixed = TRUE)
    if (length(p) == 0) {
      return(NA_character_)
    }
    unlist(strsplit(tmp[p[1]], "\t"))[2]
  }
  data$Participants <- header_field("@Participants")
  data$Date <- header_field("@Date")
  data$File.Situation <- header_field("@Situation")
  # The target child appears twice: on the @Participants line and on an @ID
  # line; the @ID line (2nd match) carries |-separated fields.
  # NOTE(review): assumes the @ID line is present and is the second match.
  p <- grep("Target_Child", tmp, fixed = TRUE)
  chiline <- tmp[p[2]]
  chidata <- unlist(strsplit(chiline, "[|]"))
  data$Language <- substr(chidata[1], 6, 9)
  data$Corpus <- chidata[2]
  data$Age <- chidata[4]
  data$Gender <- chidata[5]
  # Record the source line number of each utterance ("*") line.
  # (FIX: the original `data$Line.No` was a no-op expression.)
  star.lines <- grep("^\\*", tmp)
  data$Line.No <- if (length(star.lines) == nrow(data)) star.lines else NA
  # --- Drop processing artifacts from building the per-utterance rows ---
  data$t.as.matrix.fields.. <- NULL
  # Placeholder columns are named X1, X2, ...; drop exactly those.
  # (FIX: replaces a fragile regex/min/max loop that errored when no
  # numbered columns existed.)
  data <- data[, !grepl("^X[0-9]+$", names(data)), drop = FALSE]
  # Preserve row names as an explicit utterance number
  data$Utt.Number <- row.names(data)
  data
} #End read.CLAN.file
get_utt_info <- function(u){
  # Parse one utterance chunk (speaker line plus any %-tiers) into a one-row
  # data frame with Speaker, Verbatim, one column per tier, and a cleaned
  # human-readable Gloss.
  fields <- unlist(strsplit(u, "[%]"))
  # One-row frame holding the raw fields; the caller strips the resulting
  # placeholder columns afterwards.
  myrow <- data.frame(t(as.matrix(fields)))
  # The speaker code is the 3-letter prefix ("MOT", "CHI", ...); the text
  # follows "XXX:\t" (5 chars) and ends before the trailing newline.
  myrow$Speaker <- substr(fields[1], 1, 3)
  myrow$Verbatim <- substr(fields[1], 6, nchar(fields[1]) - 1)
  # Attach each tier as its own column, named by its 3-letter code
  if (length(fields) > 1){
    for (j in 2:length(fields)){
      tier <- data.frame(substr(fields[j], 6, nchar(fields[j]) - 1))
      names(tier) <- c(substr(fields[j], 1, 3))
      myrow <- cbind(myrow, tier)
    }
  }
  # Build an adult-readable gloss from the verbatim line
  myrow$Gloss <- NA
  # Tokenize on spaces, tabs, and newlines; guarantee at least one token
  words <- unlist(strsplit(myrow$Verbatim, "[ \t\n]"))
  if (length(words) == 0){
    words <- c("")
  }
  # First, replace gloss sequences: "dunno [: don't know]" -> "don't know"
  w <- 1
  wmax <- length(words) + 1
  while (w < wmax){
    if ((words[w] == "[:") || (words[w] == "[=?")){
      closebracket <- grep("]", words[w:length(words)], fixed = TRUE)[1] + (w - 1)
      # FIX: guard against an unterminated bracket (grep -> NA would make
      # the subscripted assignments below error out)
      if (!is.na(closebracket)){
        words[w - 1] <- ""  # w == 1 makes this index 0: a silent no-op, as before
        words[w] <- ""
        words[closebracket] <- substr(words[closebracket], 1,
                                      nchar(words[closebracket]) - 1)
      }
    }
    w <- w + 1
  }
  # Next, drop clarification sequences: "a bobby [= a thin bobbypin]" -> "a bobby"
  w <- 1
  while (w < wmax){
    if (substr(words[w], 1, 1) == "["){
      closebracket <- grep("]", words[w:length(words)], fixed = TRUE)[1] + (w - 1)
      # FIX: same unterminated-bracket guard; also removed the unused `goo`
      if (!is.na(closebracket)){
        for (v in w:closebracket){
          words[v] <- ""
        }
      }
    }
    w <- w + 1
  }
  # Strip CHILDES-internal markup characters (gsub is vectorized over words;
  # the original mapply was redundant)
  words <- gsub("[()<>&@:]", "", words)
  # Remove sentence-internal periods, preserving the final token.
  # (FIX: the original used 1:(length(words)-1), which is c(1, 0) for a
  # one-word utterance and stripped the period from the only word.)
  if (length(words) > 1){
    internal <- seq_len(length(words) - 1)
    words[internal] <- gsub("[.]", "", words[internal])
  }
  myrow$Gloss <- paste(words, collapse = " ")
  myrow$Gloss <- gsub(" +", " ", myrow$Gloss)
  myrow
} #END get_utt_info