Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update requirements, scriptsavant, and get_scripts to allow them to run #3

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 21 additions & 25 deletions get_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
from unidecode import unidecode
from tqdm.std import tqdm
from fuzzywuzzy import fuzz

import imdb
from PyMovieDb import IMDB

import config

ia = imdb.IMDb()
imdb = IMDB()

f = open('sources.json', 'r')
data = json.load(f)
Expand Down Expand Up @@ -161,24 +160,23 @@ def get_tmdb_from_id(id):

def get_imdb(name):
    """Look up *name* on IMDb via PyMovieDb and return basic metadata.

    Returns a dict with keys ``title``, ``release_date``, ``id`` and
    ``overview`` on success, or ``{}`` when the movie is not found or the
    response lacks a publication date.  All exceptions are swallowed and
    reported to stdout (best-effort lookup).
    """
    try:
        # get_by_name returns a JSON *string*; the library's not-found
        # sentinel (imdb.NA) is itself a JSON string, so the comparison
        # must happen BEFORE json.loads — comparing the parsed dict to
        # the NA string can never be true.
        raw = imdb.get_by_name(name)
        if raw == imdb.NA:
            return {}
        movie = json.loads(raw)

        if 'datePublished' not in movie:
            print("datePublished missing in response")
            return {}
        release_date = movie['datePublished']

        # IMDb title URLs end in ".../title/ttXXXXXXX/", so the
        # second-to-last path segment is the ttXXXXXXX id.
        movie_id = movie["url"].split("/")[-2]
        return {
            "title": unidecode(movie['name']),
            "release_date": release_date,
            "id": movie_id,
            "overview": movie["description"] if "description" in movie else "",
        }
    except Exception as err:
        print(err)
        return {}
Expand All @@ -194,6 +192,7 @@ def get_imdb(name):

unique = []
origin = {}
names_with_bad_files = []
for source in metadata:
DIR = join("scripts", "unprocessed", source)
files = [join(DIR, f) for f in listdir(DIR) if isfile(
Expand All @@ -211,13 +210,13 @@ def get_imdb(name):
name = roman_to_int(name)
name = unidecode(name)
unique.append(name)
if name not in origin:
origin[name] = {"files": []}
curr_script = metadata[source][script]
curr_file = join("scripts", "unprocessed", source,
curr_script["file_name"] + ".txt")

if curr_file in files:
if name not in origin:
origin[name] = {"files": []}
origin[name]["files"].append({
"name": unidecode(script),
"source": source,
Expand All @@ -226,9 +225,6 @@ def get_imdb(name):
"size": getsize(curr_file)
})

else:
origin.pop(name)

final = sorted(list(set(unique)))
print(len(final))

Expand Down Expand Up @@ -294,7 +290,7 @@ def get_imdb(name):
for script in tqdm(origin):
if "imdb" in origin[script] and "tmdb" not in origin[script]:
# print(origin[script]["files"][0]["name"])
imdb_id = "tt" + origin[script]["imdb"]["id"]
imdb_id = origin[script]["imdb"]["id"]
movie_data = get_tmdb_from_id(imdb_id)
if movie_data:
origin[script]["tmdb"] = movie_data
Expand All @@ -318,7 +314,7 @@ def get_imdb(name):
file_name = extra_clean(origin[script]["files"][0]["name"])

if imdb_name != tmdb_name and average_ratio(file_name, tmdb_name) < 85 and average_ratio(file_name, imdb_name) > 85:
imdb_id = "tt" + origin[script]["imdb"]["id"]
imdb_id = origin[script]["imdb"]["id"]
movie_data = get_tmdb_from_id(imdb_id)
if movie_data:
origin[script]["tmdb"] = movie_data
Expand Down
53 changes: 30 additions & 23 deletions get_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,33 @@

DIR = os.path.join("scripts", "temp")

if not os.path.exists(DIR):
os.makedirs(DIR)

f = open('sources.json', 'r')
data = json.load(f)
processes = []
starttime = time.time()

for source in data:
included = data[source]
if included == "true":
# print("Fetching scripts from %s" % (source))
# sources.get_scripts(source=source)
# print()
p = multiprocessing.Process(target=sources.get_scripts, args=(source,))
processes.append(p)
p.start()

for process in processes:
process.join()

print()
print('Time taken = {} seconds'.format(time.time() - starttime))
def get_scripts():
    """Spawn one worker process per enabled source and wait for all to finish.

    Reads ``sources.json`` (a mapping of source name -> "true"/"false"
    strings), starts ``sources.get_scripts(source)`` in a separate process
    for each enabled source, joins them all, then prints the elapsed time.
    """
    if not os.path.exists(DIR):
        os.makedirs(DIR)

    # "with" guarantees the file handle is closed even on error
    # (the original leaked the open handle).
    with open('sources.json', 'r') as f:
        data = json.load(f)

    processes = []
    starttime = time.time()

    # No-op except in frozen (e.g. PyInstaller) Windows executables,
    # where it is required before spawning processes; harmless otherwise.
    multiprocessing.freeze_support()

    for source in data:
        # Sources are flagged with the *string* "true", not a JSON boolean.
        if data[source] == "true":
            p = multiprocessing.Process(
                target=sources.get_scripts, args=(source,))
            processes.append(p)
            p.start()

    for process in processes:
        process.join()
    print()
    print('Time taken = {} seconds'.format(time.time() - starttime))


if __name__ == '__main__':
    get_scripts()

5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,8 @@ tqdm==4.61.1
fuzzywuzzy==0.18.0
Unidecode==1.2.0
textract==1.6.3
beautifulsoup4==4.9.3
beautifulsoup4==4.8.0
IMDbPY==2021.4.18
numpy==1.24.3
python-Levenshtein==0.21.0
PyMovieDb==0.0.8
15 changes: 5 additions & 10 deletions sources/scriptsavant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@


def get_scriptsavant():
ALL_URL_1 = "https://thescriptsavant.com/free-movie-screenplays-am/"
ALL_URL_2 = "https://thescriptsavant.com/free-movie-screenplays-nz/"
ALL_URL = "https://thescriptsavant.com/movies.html"
BASE_URL = "http://www.thescriptsavant.com/"
SOURCE = "scriptsavant"
DIR, TEMP_DIR, META_DIR = create_script_dirs(SOURCE)
Expand All @@ -17,15 +16,11 @@ def get_scriptsavant():
os.path.join(DIR, f)) and os.path.getsize(os.path.join(DIR, f)) > 3000]

metadata = {}
soup_1 = get_soup(ALL_URL_1)
soup_2 = get_soup(ALL_URL_2)

movielist = soup_1.find_all('tbody')[0].find_all('a')
movielist_2 = soup_2.find_all('div', class_='fusion-text')[0].find_all('a')
movielist += movielist_2
soup = get_soup(ALL_URL)
movielist = soup.find_all('a')

for movie in tqdm(movielist, desc=SOURCE):
name = movie.text.replace("script", "").strip()
name = movie.text.replace("script", "").replace("Script", "").strip()
file_name = format_filename(name)
script_url = movie.get('href')

Expand All @@ -42,7 +37,7 @@ def get_scriptsavant():
continue

try:
text = get_pdf_text(script_url, os.path.join(SOURCE, file_name))
text = get_pdf_text(os.path.join(BASE_URL, script_url), os.path.join(SOURCE, file_name))

except Exception as err:
print(script_url)
Expand Down