In [None]:
import pandas as pd

# Read the base movie table used by the lookups further down, then preview
# its first rows as the cell output.
BASE_MOVIES_PATH = 'data/base_movies.csv'
movies = pd.read_csv(BASE_MOVIES_PATH)

movies.head()

In [None]:
import requests
import os

# NOTE: despite the IMDB_ prefix, this holds the key for the OMDb API. It is
# read from the OMDB_API_KEY environment variable so the secret is never
# stored in the notebook itself.
IMDB_API_KEY = os.getenv("OMDB_API_KEY")

if not IMDB_API_KEY:
    # os.getenv returns None when the variable is unset; requests made without
    # a key cannot succeed, so make the misconfiguration visible right away.
    print("WARNING: OMDB_API_KEY is not set - OMDb requests will be rejected.")

def get_imdb_data_by_title_year(title: str, year: int | None):
    """Look up one movie in the OMDb API by title and, optionally, release year.

    Args:
        title: Movie title, sent as the ``t`` query parameter.
        year: Release year, sent as the ``y`` query parameter. The value is
            converted with ``int()`` so that floats such as ``1995.0`` are
            sent as ``1995``. ``None``, NaN and anything else that cannot be
            converted are omitted, making the lookup title-only.

    Returns:
        The decoded JSON payload (a dict) when the API answers with
        ``"Response": "True"``; otherwise ``None``. ``None`` is also returned,
        after printing a message, when the request fails or the response body
        is not valid JSON.
    """
    # int() raises ValueError for NaN, OverflowError for infinities and
    # TypeError for non-numeric objects; all of those mean "no usable year".
    try:
        year_param = int(year) if year is not None else None
    except (TypeError, ValueError, OverflowError):
        year_param = None

    params = {
        "t": title,
        "apikey": IMDB_API_KEY,
    }
    if year_param is not None:
        params["y"] = year_param

    try:
        # HTTPS keeps the API key out of clear-text traffic.
        response = requests.get("https://www.omdbapi.com/", params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers network/HTTP-layer failures; ValueError
        # covers a body that cannot be decoded as JSON.
        print(f"Request error for '{title}' ({year}): {e}")
        return None

    # Guard against a non-object JSON payload before calling .get on it.
    if isinstance(data, dict) and data.get("Response") == "True":
        return data
    return None


In [None]:
# Smoke-test the lookup helper on the row labelled 0 of the movie table and
# show the returned payload as the cell output.
first_title = movies.loc[0, 'title']
first_year = movies.loc[0, 'year']
data = get_imdb_data_by_title_year(first_title, first_year)
data

In [None]:
# Number of rows in the movie table loaded from data/base_movies.csv.
len(movies)

In [None]:
import time
import os
import pandas as pd

# Titles for which an earlier run got no result are cached on disk and skipped,
# so re-running this cell does not spend requests on them again.
NOT_ACCEPTED_FILE = "data/error_imdb.csv"

if os.path.exists(NOT_ACCEPTED_FILE):
    na_df = pd.read_csv(NOT_ACCEPTED_FILE)
    not_accepted_set = set(na_df["raw_title"].tolist())
    print(f"Loaded {len(not_accepted_set)} previously rejected movies")
else:
    not_accepted_set = set()
    print("No error_imdb.csv found — skipping")

# Collected records are persisted here; an existing file lets the cell resume
# where a previous run stopped.
OUTPUT_FILE = "../stage3_normalize/data/imdb_data.csv"
IMDB_COLUMNS = [
    "movie_id", "rated", "director", "actors",
    "plot", "country", "imdb_rating", "imdb_id",
]

if os.path.exists(OUTPUT_FILE):
    imdb_df = pd.read_csv(OUTPUT_FILE)
    processed_ids = set(imdb_df["movie_id"].tolist())
    print(f"Loaded existing imdb_df with {len(imdb_df)} records")
else:
    imdb_df = pd.DataFrame(columns=IMDB_COLUMNS)
    processed_ids = set()
    print("Started fresh: no imdb_data.csv found")

# Make sure both checkpoint files can be written before any request is spent;
# to_csv does not create missing parent directories.
for _checkpoint_path in (NOT_ACCEPTED_FILE, OUTPUT_FILE):
    _checkpoint_dir = os.path.dirname(_checkpoint_path)
    if _checkpoint_dir:
        os.makedirs(_checkpoint_dir, exist_ok=True)

previous_imdb_df = imdb_df    # records that were already on disk
rows = []                     # records fetched during this run
REQUEST_LIMIT = 950           # request budget for one run of this cell
REQUEST_DELAY_SECONDS = 0.2   # pause after every request, hit or miss
requests_done = 0

for _, row in movies.iterrows():
    movie_id = row["movie_id"]

    if movie_id in processed_ids:
        continue

    title = row["title"]
    year_val = row["year"]

    # Key used in the rejection cache; keep the format stable across runs.
    raw_title = f"{title} ({year_val})"

    if raw_title in not_accepted_set:
        print(f"SKIP (in not accepted): {raw_title}")
        continue

    if requests_done >= REQUEST_LIMIT:
        print("Daily API limit reached, stopping.")
        break

    data = get_imdb_data_by_title_year(title, year_val)
    requests_done += 1
    # Throttle after every request so that misses are paced like hits.
    time.sleep(REQUEST_DELAY_SECONDS)

    if data is None:
        print(f"Not found: {raw_title}")

        # NOTE(review): the helper returns None for request errors as well as
        # for genuine misses, so transient failures end up in this cache too.
        not_accepted_set.add(raw_title)

        # Checkpoint the rejection cache immediately so an interrupted run
        # keeps what it has learned.
        pd.DataFrame(sorted(not_accepted_set), columns=["raw_title"]) \
            .to_csv(NOT_ACCEPTED_FILE, index=False)

        continue

    imdb_row = {
        "movie_id": movie_id,
        "rated": data.get("Rated"),
        "director": data.get("Director"),
        "actors": data.get("Actors"),
        "plot": data.get("Plot"),
        "country": data.get("Country"),
        "imdb_rating": data.get("imdbRating"),
        "imdb_id": data.get("imdbID"),
    }

    rows.append(imdb_row)
    processed_ids.add(movie_id)

    # Rebuild the frame from the on-disk records plus everything fetched in
    # this run. When nothing was on disk the new rows are used directly, which
    # avoids concatenating with an empty frame.
    new_imdb_df = pd.DataFrame(rows, columns=IMDB_COLUMNS)
    if previous_imdb_df.empty:
        imdb_df = new_imdb_df
    else:
        imdb_df = pd.concat([previous_imdb_df, new_imdb_df], ignore_index=True)

    # Checkpoint after every hit so an interrupted run loses nothing.
    imdb_df.to_csv(OUTPUT_FILE, index=False)

    print(f"Saved movie {movie_id}   ({requests_done}/{REQUEST_LIMIT})")


print("Done! Total IMDb records now:", len(imdb_df))


In [None]:
# Number of records in imdb_df after the enrichment loop.
len(imdb_df)

In [None]:
# Display the collected records for a visual check.
imdb_df

In [None]:
# Row count of the movie table, for comparison with len(imdb_df).
len(movies)

In [None]:
# Export every movie whose movie_id has no record in imdb_df, keeping only the
# identifying columns.
has_imdb_record = movies["movie_id"].isin(imdb_df["movie_id"])
export_columns = ["movie_id", "title", "year"]
not_accepted_movies = movies.loc[~has_imdb_record, export_columns]
not_accepted_movies.to_csv("not_accepted_movies.csv", index=False)

Ручная правка названий

In [None]:
# Number of records currently held in imdb_df.
len(imdb_df)

In [None]:
# Display the current contents of imdb_df.
imdb_df