In [1]:
import pandas as pd

# Load the base movie table; the preview below shows movie_id, title and year.
# NOTE(review): the rendered preview also lists an "Unnamed: 0" column. If that
# is a real column (an index saved into the CSV) rather than an export artifact,
# consider reading with index_col=0 — confirm against the file.
movies = pd.read_csv("data/base_movies.csv")

# Preview the first five rows.
movies.head(5)

Unnamed: 0,movie_id,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [5]:
import os

import requests

IMDB_API_KEY = "3b24f5c9"

def get_imdb_data_by_title_year(title: str, year: int | None):

    params = {
        "t": title,
        "apikey": IMDB_API_KEY,
        "y": year
    }

    try:
        response = requests.get("http://www.omdbapi.com/", params=params, timeout=10)
        data = response.json()
    except Exception as e:
        print(f"Request error for '{title}' ({year}): {e}")
        return None

    if data.get("Response") == "True":
        return data
    else:
        return None


In [21]:
# Smoke-test the lookup helper on the movie stored under index label 0.
data = get_imdb_data_by_title_year(
    title=movies.loc[0, "title"],
    year=movies.loc[0, "year"],
)
data

{'Title': 'Toy Story',
 'Year': '1995',
 'Rated': 'G',
 'Released': '22 Nov 1995',
 'Runtime': '81 min',
 'Genre': 'Animation, Adventure, Comedy',
 'Director': 'John Lasseter',
 'Writer': 'Joss Whedon, Andrew Stanton, Joel Cohen',
 'Actors': 'Tom Hanks, Tim Allen, Don Rickles',
 'Plot': "A cowboy doll is profoundly jealous when a new spaceman action figure supplants him as the top toy in a boy's bedroom. When circumstances separate them from their owner, the duo have to put aside their differences to return to him.",
 'Language': 'English',
 'Country': 'United States',
 'Awards': 'Nominated for 3 Oscars. 29 wins & 24 nominations total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BZTA3OWVjOWItNjE1NS00NzZiLWE1MjgtZDZhMWI1ZTlkNzYwXkEyXkFqcGc@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.3/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '100%'},
  {'Source': 'Metacritic', 'Value': '96/100'}],
 'Metascore': '96',
 'imdbRating': '8.3',
 'imdbVotes': '

In [5]:
# Number of movies in the base table, i.e. how many lookups are needed in total.
movies.shape[0]

3883

In [6]:
import time
import os
import pandas as pd

NOT_ACCEPTED_FILE = "data/error_imdb.csv"

# Titles ("<title> (<year>)") that an earlier run could not resolve. They are
# loaded from disk, when the file exists, so the fetch loop can skip them.
not_accepted_set = set()
if not os.path.exists(NOT_ACCEPTED_FILE):
    print("No error_imdb.csv found — skipping")
else:
    na_df = pd.read_csv(NOT_ACCEPTED_FILE)
    not_accepted_set.update(na_df["raw_title"].tolist())
    print(f"Loaded {len(not_accepted_set)} previously rejected movies")

OUTPUT_FILE = "../stage3_normalize/data/imdb_data.csv"

# Resume from the previously saved results when they exist; otherwise start
# with an empty table that has the expected columns.
if not os.path.exists(OUTPUT_FILE):
    imdb_df = pd.DataFrame(
        columns=[
            "movie_id",
            "rated",
            "director",
            "actors",
            "plot",
            "country",
            "imdb_rating",
            "imdb_id",
        ]
    )
    processed_ids = set()
    print("Started fresh: no imdb_data.csv found")
else:
    imdb_df = pd.read_csv(OUTPUT_FILE)
    # IDs already present in the saved file are not requested again.
    processed_ids = set(imdb_df["movie_id"].to_list())
    print(f"Loaded existing imdb_df with {len(imdb_df)} records")

# Fetch IMDb data for every movie that is neither saved nor rejected yet.
# The loop is resumable: both result files are rewritten after each change, so
# an interrupted run loses no completed request.
rows = []                # records fetched during this run
REQUEST_LIMIT = 950      # maximum API calls per run
REQUEST_DELAY_SEC = 0.2  # pause after every API call
requests_done = 0

# Make sure the target folders exist before any request is spent, so a missing
# directory cannot fail a write after the API call has already been made.
for _target in (NOT_ACCEPTED_FILE, OUTPUT_FILE):
    os.makedirs(os.path.dirname(_target) or ".", exist_ok=True)

for _, row in movies.iterrows():
    movie_id = row["movie_id"]

    if movie_id in processed_ids:
        continue

    title = row["title"]
    year_val = row["year"]

    # Key used in the rejected-titles file.
    raw_title = f"{title} ({year_val})"

    if raw_title in not_accepted_set:
        print(f"SKIP (in not accepted): {raw_title}")
        continue

    if requests_done >= REQUEST_LIMIT:
        print("Daily API limit reached, stopping.")
        break

    data = get_imdb_data_by_title_year(title, year_val)
    requests_done += 1

    # Throttle after every call, whether it was a hit or a miss. Sleeping only
    # on the success path would let a run of misses hit the API back-to-back.
    time.sleep(REQUEST_DELAY_SEC)

    if data is None:
        # NOTE(review): the lookup helper returns None for request failures as
        # well as for genuine misses, so a transient error is also recorded as
        # rejected here and will be skipped by later runs.
        print(f"Not found: {raw_title}")

        not_accepted_set.add(raw_title)

        pd.DataFrame(sorted(not_accepted_set), columns=["raw_title"]) \
            .to_csv(NOT_ACCEPTED_FILE, index=False)

        continue

    imdb_row = {
        "movie_id": movie_id,
        "rated": data.get("Rated"),
        "director": data.get("Director"),
        "actors": data.get("Actors"),
        "plot": data.get("Plot"),
        "country": data.get("Country"),
        "imdb_rating": data.get("imdbRating"),
        "imdb_id": data.get("imdbID"),
    }

    rows.append(imdb_row)
    processed_ids.add(movie_id)

    # Appending row by row is deliberate: the table is checkpointed to disk
    # after each success. On a fresh start the empty placeholder frame is
    # replaced rather than concatenated, which avoids pandas' deprecated
    # handling of empty frames in concat.
    new_record = pd.DataFrame([imdb_row])
    if imdb_df.empty:
        imdb_df = new_record
    else:
        imdb_df = pd.concat([imdb_df, new_record], ignore_index=True)

    imdb_df.to_csv(OUTPUT_FILE, index=False)

    print(f"Saved movie {movie_id}   ({requests_done}/{REQUEST_LIMIT})")


print("Done! Total IMDb records now:", len(imdb_df))


Loaded 1835 previously rejected movies
Loaded existing imdb_df with 2048 records
SKIP (in not accepted): American President, The (1995)
SKIP (in not accepted): City of Lost Children, The (1995)
SKIP (in not accepted): Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
SKIP (in not accepted): Twelve Monkeys (1995)
SKIP (in not accepted): Seven (Se7en) (1995)
SKIP (in not accepted): Usual Suspects, The (1995)
SKIP (in not accepted): Big Green, The (1995)
SKIP (in not accepted): Postino, Il (The Postman) (1994)
SKIP (in not accepted): Confessional, The (Le Confessionnal) (1995)
SKIP (in not accepted): Indian in the Cupboard, The (1995)
SKIP (in not accepted): French Twist (Gazon maudit) (1995)
SKIP (in not accepted): Misérables, Les (1995)
SKIP (in not accepted): Crossing Guard, The (1995)
SKIP (in not accepted): Juror, The (1996)
SKIP (in not accepted): White Balloon, The (Badkonake Sefid ) (1995)
SKIP (in not accepted): Antonia's Line (Antonia) (1995)
SKIP (in not accepted): Journey o

In [7]:
# Number of IMDb records collected so far.
imdb_df.shape[0]

2048

In [8]:
# Display the collected IMDb table (the rendered output is truncated in the middle).
imdb_df

Unnamed: 0,movie_id,rated,director,actors,plot,country,imdb_rating,imdb_id
0,1,G,John Lasseter,"Tom Hanks, Tim Allen, Don Rickles",A cowboy doll is profoundly jealous when a new...,United States,8.3,tt0114709
1,2,PG,Joe Johnston,"Robin Williams, Kirsten Dunst, Bonnie Hunt",A magic board game summons jungle perils and a...,United States,7.1,tt0113497
2,3,PG-13,Howard Deutch,"Walter Matthau, Jack Lemmon, Ann-Margret",Just as John and Max resolve to save their bel...,United States,6.7,tt0113228
3,4,R,Forest Whitaker,"Whitney Houston, Angela Bassett, Loretta Devine","Based on Terry McMillan's novel, this film fol...",United States,6.0,tt0114885
4,5,PG,Charles Shyer,"Steve Martin, Diane Keaton, Martin Short",George Banks must deal not only with his daugh...,United States,6.1,tt0113041
...,...,...,...,...,...,...,...,...
2043,3947,R,Mike Hodges,"Michael Caine, Ian Hendry, Britt Ekland",When his brother dies under mysterious circums...,United Kingdom,7.3,tt0067128
2044,3948,PG-13,Jay Roach,"Ben Stiller, Robert De Niro, Teri Polo",Chicago male nurse and chronic under-achiever ...,United States,7.0,tt0212338
2045,3949,NC-17,Darren Aronofsky,"Ellen Burstyn, Jared Leto, Jennifer Connelly",The drug-induced utopias of four Coney Island ...,United States,8.3,tt0180093
2046,3950,R,Joel Schumacher,"Colin Farrell, Matthew Davis, Clifton Collins Jr.",A group of recruits go through Advanced Infant...,"Germany, United States",6.9,tt0170691


In [9]:
# Size of the base table again, for comparison with the IMDb record count above.
movies.shape[0]

3883

In [10]:
# Export the movies that still have no IMDb record (their movie_id is absent
# from imdb_df), keeping only the identifying columns.
has_imdb_record = movies["movie_id"].isin(imdb_df["movie_id"])
not_accepted_movies = movies.loc[~has_imdb_record, ["movie_id", "title", "year"]]
not_accepted_movies.to_csv("not_accepted_movies.csv", index=False)

Ручная правка названий

In [22]:
# Record count after the manual title corrections.
# NOTE(review): the value shown here (3461) differs from the 2048 displayed
# earlier, yet no visible cell in between modifies imdb_df. Presumably the fetch
# cell was re-run after the titles were fixed — confirm, and make that step
# explicit so the notebook reproduces under Restart & Run All.
imdb_df.shape[0]

3461

In [23]:
# Display the IMDb table again; it now holds more records than the earlier preview.
imdb_df

Unnamed: 0,movie_id,rated,director,actors,plot,country,imdb_rating,imdb_id
0,1,G,John Lasseter,"Tom Hanks, Tim Allen, Don Rickles",A cowboy doll is profoundly jealous when a new...,United States,8.3,tt0114709
1,2,PG,Joe Johnston,"Robin Williams, Kirsten Dunst, Bonnie Hunt",A magic board game summons jungle perils and a...,United States,7.1,tt0113497
2,3,PG-13,Howard Deutch,"Walter Matthau, Jack Lemmon, Ann-Margret",Just as John and Max resolve to save their bel...,United States,6.7,tt0113228
3,4,R,Forest Whitaker,"Whitney Houston, Angela Bassett, Loretta Devine","Based on Terry McMillan's novel, this film fol...",United States,6.0,tt0114885
4,5,PG,Charles Shyer,"Steve Martin, Diane Keaton, Martin Short",George Banks must deal not only with his daugh...,United States,6.1,tt0113041
...,...,...,...,...,...,...,...,...
3456,3931,Approved,Ray Kellogg,"Don Sullivan, Fred Graham, Lisa Simone",A giant lizard terrorizes a rural Texas commun...,United States,3.7,tt0052846
3457,3932,Approved,James Whale,"Claude Rains, Gloria Stuart, William Harrigan","A scientist finds a way of becoming invisible,...",United States,7.6,tt0024184
3458,3933,Approved,Ray Kellogg,"James Best, Ingrid Goude, Ken Curtis","On an isolated island, a small group of people...",United States,4.1,tt0052969
3459,3938,R,Amy Holden Jones,"Michele Michaels, Robin Stille, Michael Villella",A female high school student's slumber party t...,United States,5.6,tt0084695
