-
Notifications
You must be signed in to change notification settings - Fork 1
/
audioNormalizer.py
127 lines (122 loc) · 4.92 KB
/
audioNormalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
import csv
import json
'''
This takes a CSV with one row per recording, that could have
an arbitrary number of columns for speakers and related film titles,
parses those related entities out, and creates a JSON object
with a dict per entity type. Each type will become a table in the
MySQL database.
'''
output = {
# the basic record for the db will be a 'recording'
# with these attributes:
'recordings' : {
# 'pk_recordingID' : {
# 'speakers' : [],
# 'filmTitles' : [],
# 'pictionID' : '',
# 'filename' : '',
# 'permissions' : '',
# 'date' : '',
# 'day' : '',
# 'month' : '',
# 'year' : '',
# 'location' : '',
# 'pfaSeries' : '',
# 'eventTitle' : '',
# 'eventNotes' : '',
# 'eventRecordingNotes' : ''
# 'tapeNumber' : '',
# 'tapeSide' : '',
# 'digitaStatus' : '',
# 'digitizer' : '',
# 'fileSize' : '',
# }
},
# each speaker is just a string of the complete personal or corporate name
'speakers' : {
# 'pk_speakerID' : {
# 'speakerValue' : '',
# 'fk_usedBy' : []
# }
},
'filmTitles' : {
# 'pk_filmID' : {
# 'filmTitleValue' : '',
# 'fk_usedBy' : []
# }
}
}
# these two counters start the speaker & film title ID sequences
speakersCounter = 1
filmTitlesCounter = 1
# start dicts of unique speakers and film titles to reference
# when building a recording record
existingSpeakers = {}
existingFilmTitles = {}
with open('data_files/combined-audio-data-cleaned.csv','r') as f:
reader = csv.DictReader(f)
for row in reader:
# the event ID is sequential from the combined CSV
recordingID = row['event ID']
output['recordings'][recordingID] = {}
# these lists will include the ID of each speaker and film title
# referenced in the recording
output['recordings'][recordingID]['speakers'] = []
output['recordings'][recordingID]['filmTitles'] = []
output['recordings'][recordingID]['pictionID'] = row['piction ID']
output['recordings'][recordingID]['filename'] = row['filename']
output['recordings'][recordingID]['permissions'] = row['permissions']
output['recordings'][recordingID]['permissionsLegacy'] = row['permissionsLegacy']
output['recordings'][recordingID]['permissionsNote'] = row['permissions note']
output['recordings'][recordingID]['date'] = row['iso 8601']
output['recordings'][recordingID]['day'] = row['day']
output['recordings'][recordingID]['month'] = row['month']
output['recordings'][recordingID]['year'] = row['year']
output['recordings'][recordingID]['location'] = row['recording location']
output['recordings'][recordingID]['pfaSeries'] = row['pfa series']
output['recordings'][recordingID]['eventTitle'] = row['event title']
output['recordings'][recordingID]['eventNotes'] = row['event notes']
output['recordings'][recordingID]['eventRecordingNotes'] = row['technical/recording notes']
output['recordings'][recordingID]['tapeNumber'] = row['tape number']
output['recordings'][recordingID]['tapeSide'] = row['tape side']
output['recordings'][recordingID]['digitalStatus'] = row['Born-digital status']
output['recordings'][recordingID]['digitizer'] = row['digitizer']
output['recordings'][recordingID]['fileSize'] = row['file size']
for key in row.keys():
if not row[key] == '':
theName = row[key]
if key.startswith('speakers'):
# for each SPEAKER column, add the value to the recording
# and also increment the speaker counter
if not theName in existingSpeakers:
output['speakers'][speakersCounter] = {}
output['speakers'][speakersCounter]['fk_usedBy'] = []
output['speakers'][speakersCounter]['speakerValue'] = theName
output['speakers'][speakersCounter]['fk_usedBy'].append(row['event ID'])
existingSpeakers[theName] = speakersCounter
output['recordings'][recordingID]['speakers'].append(str(speakersCounter))
speakersCounter += 1
else:
# or if the speaker already exists, just add the
theSpeakerID = existingSpeakers[theName]
output['recordings'][recordingID]['speakers'].append(str(theSpeakerID))
output['speakers'][theSpeakerID]['fk_usedBy'].append(row['event ID'])
elif key.startswith('film'):
# do the same for films as for speakers
if not theName in existingFilmTitles:
output['filmTitles'][filmTitlesCounter] = {}
output['filmTitles'][filmTitlesCounter]['fk_usedBy'] = []
output['filmTitles'][filmTitlesCounter]['filmTitleValue'] = theName
output['filmTitles'][filmTitlesCounter]['fk_usedBy'].append(row['event ID'])
existingFilmTitles[theName] = filmTitlesCounter
output['recordings'][recordingID]['filmTitles'].append(str(filmTitlesCounter))
filmTitlesCounter += 1
else:
theFilmID = existingFilmTitles[theName]
output['recordings'][recordingID]['filmTitles'].append(str(theFilmID))
output['filmTitles'][theFilmID]['fk_usedBy'].append(row['event ID'])
# print(output)
with open('data_files/combined-audio-data-cleaned.json','w+') as f:
json.dump(output, f)