/
01 convert freebase.py
92 lines (65 loc) · 2.46 KB
/
01 convert freebase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Converts the movies stored in "00 download freebase.json" to
the MovieWar JSON format. Removes movies with an empty name, an empty
or unparsable release date, or a release year outside 1900-2000.
"""
import json
import sys
from operator import itemgetter
# Information and error messages:
def outln(line):
    """Print 'line' on standard output and flush the stream right away.

    Text encoding and newline translation are left to the stdout stream.
    """
    print(line, file=sys.stdout, flush=True)
def errln(line):
    """Report 'line' on standard error, prefixed with this script's name.

    The stream is flushed right away; text encoding and newline
    translation are left to the stderr stream.
    """
    prefix = '01 convert freebase.py: error:'
    print(prefix, line, file=sys.stderr, flush=True)
# Entry point:
def main():
    """Convert the downloaded Freebase movie list to the trivia format.

    Reads '00 download freebase.json' (UTF-8, one JSON object per line),
    keeps the movies that have a non-empty name and a release year between
    1900 and 2000 (inclusive), and writes them to '01 convert freebase.json'
    as one {"name": ..., "year": ...} object per line, sorted by name.
    Every rejected movie is reported through outln(). Blank input lines
    are ignored.

    Raises:
        OSError: if the input cannot be read or the output cannot be written.
        json.JSONDecodeError: if a non-blank input line is not valid JSON.
    """
    output_movies = []

    # Iterate the file instead of read().splitlines(): the 'with' block
    # guarantees the handle is closed, and file iteration only splits on
    # real newlines. str.splitlines() also breaks on characters such as
    # U+2028/U+2029, which may legally appear unescaped inside a JSON string
    # and would cut a record in half.
    with open('00 download freebase.json', 'r', encoding='utf-8') as source:
        for index, line in enumerate(source):
            # a blank line (e.g. an extra newline at the end of the file)
            # carries no movie; json.loads would raise on it
            if not line.strip():
                continue

            movie = json.loads(line)

            # validate name:
            name = (movie.get('name') or '').strip()
            if name == '':
                outln('Empty name, skipping movie: {}'.format(index))
                continue

            # validate date:
            date = (movie.get('initial_release_date') or '').strip()
            if date == '':
                outln('Empty date, skipping movie: {}'.format(name))
                continue

            # 'initial_release_date' is in the following format: (YYYY-MM-DD)
            # for the trivia we need the year (and sometimes only the year
            # is present)
            year = date[:4]

            # validate the year; only the conversion itself can raise
            try:
                year_as_number = int(year)
            except ValueError:
                outln('Invalid date: {}, for movie: {}'.format(date, name))
                continue

            # all the movies in the 9200 movie set are supposed to be
            # between 1900 and 2000:
            if not (1900 <= year_as_number <= 2000):
                outln('Invalid year range: {} for movie: {}, skipping...'.format(year, name))
                continue

            # create a new JSON value, using the trivia format
            # (the year is stored as the original 4-character string):
            output_movies.append({'name': name, 'year': year})

    # sort the movies by name:
    output_movies.sort(key=itemgetter('name'))

    # save; binary mode keeps the line terminator a plain '\n' on every
    # platform:
    with open('01 convert freebase.json', 'wb') as descriptor:
        for movie in output_movies:
            descriptor.write(json.dumps(movie).encode('utf-8') + b'\n')
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C from the user: stop quietly, without a traceback.
        pass