Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update requirements, scriptsavant, and get_scripts to allow them to run #3

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 21 additions & 25 deletions get_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
from unidecode import unidecode
from tqdm.std import tqdm
from fuzzywuzzy import fuzz

import imdb
from PyMovieDb import IMDB

import config

ia = imdb.IMDb()
imdb = IMDB()

f = open('sources.json', 'r')
data = json.load(f)
Expand Down Expand Up @@ -161,24 +160,23 @@ def get_tmdb_from_id(id):

def get_imdb(name):
    """Look up *name* on IMDb via PyMovieDb and return basic metadata.

    Returns a dict with keys ``title``, ``release_date``, ``id`` and
    ``overview`` on success, or ``{}`` when the movie is not found or the
    response lacks a publication date.  All exceptions are swallowed and
    reported to stdout (best-effort lookup).
    """
    try:
        # get_by_name returns a JSON *string*; the library's not-found
        # sentinel (imdb.NA) is itself a JSON string, so the comparison
        # must happen BEFORE json.loads — comparing the parsed dict to
        # the NA string can never be true.
        raw = imdb.get_by_name(name)
        if raw == imdb.NA:
            return {}
        movie = json.loads(raw)

        if 'datePublished' not in movie:
            print("datePublished missing in response")
            return {}
        release_date = movie['datePublished']

        # IMDb title URLs end in ".../title/ttXXXXXXX/", so the
        # second-to-last path segment is the ttXXXXXXX id.
        movie_id = movie["url"].split("/")[-2]
        return {
            "title": unidecode(movie['name']),
            "release_date": release_date,
            "id": movie_id,
            "overview": movie["description"] if "description" in movie else "",
        }
    except Exception as err:
        print(err)
        return {}
Expand All @@ -194,6 +192,7 @@ def get_imdb(name):

unique = []
origin = {}
names_with_bad_files = []
for source in metadata:
DIR = join("scripts", "unprocessed", source)
files = [join(DIR, f) for f in listdir(DIR) if isfile(
Expand All @@ -211,13 +210,13 @@ def get_imdb(name):
name = roman_to_int(name)
name = unidecode(name)
unique.append(name)
if name not in origin:
origin[name] = {"files": []}
curr_script = metadata[source][script]
curr_file = join("scripts", "unprocessed", source,
curr_script["file_name"] + ".txt")

if curr_file in files:
if name not in origin:
origin[name] = {"files": []}
origin[name]["files"].append({
"name": unidecode(script),
"source": source,
Expand All @@ -226,9 +225,6 @@ def get_imdb(name):
"size": getsize(curr_file)
})

else:
origin.pop(name)

final = sorted(list(set(unique)))
print(len(final))

Expand Down Expand Up @@ -294,7 +290,7 @@ def get_imdb(name):
for script in tqdm(origin):
if "imdb" in origin[script] and "tmdb" not in origin[script]:
# print(origin[script]["files"][0]["name"])
imdb_id = "tt" + origin[script]["imdb"]["id"]
imdb_id = origin[script]["imdb"]["id"]
movie_data = get_tmdb_from_id(imdb_id)
if movie_data:
origin[script]["tmdb"] = movie_data
Expand All @@ -318,7 +314,7 @@ def get_imdb(name):
file_name = extra_clean(origin[script]["files"][0]["name"])

if imdb_name != tmdb_name and average_ratio(file_name, tmdb_name) < 85 and average_ratio(file_name, imdb_name) > 85:
imdb_id = "tt" + origin[script]["imdb"]["id"]
imdb_id = origin[script]["imdb"]["id"]
movie_data = get_tmdb_from_id(imdb_id)
if movie_data:
origin[script]["tmdb"] = movie_data
Expand Down
53 changes: 30 additions & 23 deletions get_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,33 @@

DIR = os.path.join("scripts", "temp")

if not os.path.exists(DIR):
os.makedirs(DIR)

f = open('sources.json', 'r')
data = json.load(f)
processes = []
starttime = time.time()

for source in data:
included = data[source]
if included == "true":
# print("Fetching scripts from %s" % (source))
# sources.get_scripts(source=source)
# print()
p = multiprocessing.Process(target=sources.get_scripts, args=(source,))
processes.append(p)
p.start()

for process in processes:
process.join()

print()
print('Time taken = {} seconds'.format(time.time() - starttime))
def get_scripts():
    """Spawn one worker process per enabled source and wait for all to finish.

    Reads ``sources.json`` (a mapping of source name -> "true"/"false"
    strings), starts ``sources.get_scripts(source)`` in a separate process
    for each enabled source, joins them all, then prints the elapsed time.
    """
    if not os.path.exists(DIR):
        os.makedirs(DIR)

    # "with" guarantees the file handle is closed even on error
    # (the original leaked the open handle).
    with open('sources.json', 'r') as f:
        data = json.load(f)

    processes = []
    starttime = time.time()

    # No-op except in frozen (e.g. PyInstaller) Windows executables,
    # where it is required before spawning processes; harmless otherwise.
    multiprocessing.freeze_support()

    for source in data:
        # Sources are flagged with the *string* "true", not a JSON boolean.
        if data[source] == "true":
            p = multiprocessing.Process(
                target=sources.get_scripts, args=(source,))
            processes.append(p)
            p.start()

    for process in processes:
        process.join()
    print()
    print('Time taken = {} seconds'.format(time.time() - starttime))


if __name__ == '__main__':
    get_scripts()

5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,8 @@ tqdm==4.61.1
fuzzywuzzy==0.18.0
Unidecode==1.2.0
textract==1.6.3
beautifulsoup4==4.9.3
beautifulsoup4==4.8.0
IMDbPY==2021.4.18
numpy==1.24.3
python-Levenshtein==0.21.0
PyMovieDb==0.0.8
15 changes: 5 additions & 10 deletions sources/scriptsavant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@


def get_scriptsavant():
ALL_URL_1 = "https://thescriptsavant.com/free-movie-screenplays-am/"
ALL_URL_2 = "https://thescriptsavant.com/free-movie-screenplays-nz/"
ALL_URL = "https://thescriptsavant.com/movies.html"
BASE_URL = "http://www.thescriptsavant.com/"
SOURCE = "scriptsavant"
DIR, TEMP_DIR, META_DIR = create_script_dirs(SOURCE)
Expand All @@ -17,15 +16,11 @@ def get_scriptsavant():
os.path.join(DIR, f)) and os.path.getsize(os.path.join(DIR, f)) > 3000]

metadata = {}
soup_1 = get_soup(ALL_URL_1)
soup_2 = get_soup(ALL_URL_2)

movielist = soup_1.find_all('tbody')[0].find_all('a')
movielist_2 = soup_2.find_all('div', class_='fusion-text')[0].find_all('a')
movielist += movielist_2
soup = get_soup(ALL_URL)
movielist = soup.find_all('a')

for movie in tqdm(movielist, desc=SOURCE):
name = movie.text.replace("script", "").strip()
name = movie.text.replace("script", "").replace("Script", "").strip()
file_name = format_filename(name)
script_url = movie.get('href')

Expand All @@ -42,7 +37,7 @@ def get_scriptsavant():
continue

try:
text = get_pdf_text(script_url, os.path.join(SOURCE, file_name))
text = get_pdf_text(os.path.join(BASE_URL, script_url), os.path.join(SOURCE, file_name))

except Exception as err:
print(script_url)
Expand Down