-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_videos_file.py
84 lines (75 loc) · 3.35 KB
/
filter_videos_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import argparse
import sys
def echo_msg(msg, fh):
    """Record a message in a log handle and echo it to stdout.

    Args:
        msg: The message to emit; it is formatted as text.
        fh: Writable text handle that receives the message plus a newline.
    """
    log_entry = '{}\n'.format(msg)
    fh.write(log_entry)
    print(msg)
def parse_line(line, header=None):
    """Split one tab-separated line into its cells.

    Args:
        line: Raw text of a single row; a trailing newline is allowed.
        header: Optional list of column names used to label the cells.

    Returns:
        With a non-empty ``header``: a dict mapping each column name to the
        whitespace-stripped cell in the same position. Cells beyond the end
        of the header are ignored, and columns the row does not reach
        (including trailing empty cells) are simply absent from the dict.

        Without a header: a list holding the first whitespace-delimited word
        of every cell, with ``''`` standing in for an empty cell.
    """
    # Only trim the right-hand side of the line. Stripping the left side too
    # would swallow leading tabs, so a row whose first cell is empty would
    # have every remaining cell shifted one column to the left.
    cells = [cell.strip() for cell in line.rstrip().split('\t')]
    if header:
        return dict(zip(header, cells))
    # An empty cell has no first word; keep its position with '' instead of
    # raising IndexError.
    return [cell.split()[0] if cell else '' for cell in cells]
def cast_str_as_val(string, rtype=str):
    """Convert a text value with ``rtype``, mapping empty input to None.

    Args:
        string: The value to convert; any falsy value yields None.
        rtype: Callable applied to ``string`` (defaults to ``str``).

    Returns:
        ``rtype(string)``, or None when ``string`` is falsy.

    Raises:
        Exception: Whatever the conversion raises, re-raised after the
            offending value has been printed to stdout.
    """
    try:
        if not string:
            return None
        return rtype(string)
    except Exception as exc:
        # Show the value that could not be converted before propagating.
        print(string)
        raise exc
# Lines that open a section of the input file.
_SECTION_MARKERS = ('[CHANNELS]', '[VIDEOS]')


def _write_line(ofh, line):
    """Write ``line`` to ``ofh``, terminating it with a newline if it has none."""
    # The last line of the input may lack a newline; without this the next
    # section's marker would be glued onto it in the output.
    ofh.write(line if line.endswith('\n') else f'{line}\n')


def _section_rows(ifh, ofh, marker):
    """Yield ``(raw_line, row_dict)`` for every data row of one section.

    Rewinds ``ifh`` and scans the whole file. Each occurrence of ``marker``
    and the header row that follows it are copied to ``ofh`` as a side
    effect; data rows are only yielded, so the caller decides whether to
    keep them. Blank lines are ignored everywhere.
    """
    ifh.seek(0)
    in_section = False
    fields = None  # column names of the current section; None until its header is seen
    for line in ifh:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped in _SECTION_MARKERS:
            in_section = stripped == marker
            if in_section:
                _write_line(ofh, line)
                fields = None  # the next non-blank line is this section's header
            continue
        if not in_section:
            continue
        if fields is None:
            _write_line(ofh, line)
            fields = parse_line(line)
            continue
        yield line, parse_line(line, fields)


def main(args):
    """Copy the input file to the output file, dropping excluded rows.

    The input holds a ``[CHANNELS]`` section and a ``[VIDEOS]`` section, each
    made of a marker line, a tab-separated header row and data rows. A
    channel row is dropped when its ``include`` column is ``n`` or it has no
    ``channel_id``. A video row is dropped when its ``include`` column is
    ``n``, it has no ``video_id``, or it belongs to a dropped channel. The
    channels section is always written before the videos section. Progress
    messages are printed and appended to ``./filter_videos.log``.

    Args:
        args: Namespace with ``infile`` and ``outfile`` paths and an optional
            boolean ``channels``; when true, only the channels section is
            copied. ``outfile`` is truncated on open, so it must not be the
            same file as ``infile``.
    """
    logfile = './filter_videos.log'
    # Tolerate namespaces built without the --channels option.
    channels_only = getattr(args, 'channels', False)
    with open(logfile, 'a+', encoding='utf-8') as logh, \
            open(args.infile, 'r', encoding='utf-8') as ifh, \
            open(args.outfile, 'w+', encoding='utf-8') as ofh:
        echo_msg('loading input file', logh)
        filtered_videos = 0
        filtered_channels = 0
        skipped_channels = set()

        # First pass: channels. Remember dropped ids so their videos go too.
        for line, row in _section_rows(ifh, ofh, '[CHANNELS]'):
            if row.get('include') == 'n' or not row.get('channel_id'):
                echo_msg(f'skipping channel {row.get("channel_id")} ({row.get("title")})', logh)
                skipped_channels.add(row.get('channel_id'))
                filtered_channels += 1
            else:
                _write_line(ofh, line)

        # Second pass: videos, unless only channel rows were requested.
        if not channels_only:
            for line, row in _section_rows(ifh, ofh, '[VIDEOS]'):
                # The placeholder keeps a video without a channel id from
                # matching the None/'' entry that an id-less channel row
                # leaves in skipped_channels.
                channel_key = row.get('channel_id') or 'UNSET_CHANNEL_ID'
                if (row.get('include') == 'n'
                        or not row.get('video_id')
                        or channel_key in skipped_channels):
                    echo_msg(f'skipping video {row.get("video_id")} ({row.get("title")})', logh)
                    filtered_videos += 1
                else:
                    _write_line(ofh, line)

        echo_msg(f'filtered {filtered_videos} videos and {filtered_channels} channels', logh)
if __name__ == '__main__':
    # Command-line entry point: build the argument parser and run main().
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='input videos/channels tsv')
    parser.add_argument('outfile', help='output filtered tsv')
    parser.add_argument('--channels', action='store_true', help='only copy channel rows')
    if len(sys.argv) == 1:
        # Called with no arguments at all: print the full help text and stop.
        parser.print_help(sys.stderr)
        # sys.exit() rather than the site-provided exit(), which is meant for
        # the interactive shell and is missing when the site module is not
        # loaded; the exit status is unchanged.
        sys.exit()
    args = parser.parse_args()
    main(args)