In [210]:
import datetime
import difflib
import logging
import os
import time
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
load_dotenv()

# The API key is read from the environment after load_dotenv() has run.
OMDB_API_KEY = os.getenv("OMDB_API_KEY")
# HTTPS so that the key, which is sent as a query parameter, is not transmitted in clear text.
OMDB_URL = "https://www.omdbapi.com/"

logging.basicConfig(
    filename="movie_ratings.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

if not OMDB_API_KEY:
    # os.getenv returns None when the variable is missing; record it instead of failing silently later.
    logging.warning("OMDB_API_KEY is not set; OMDB requests will not be authenticated")

tqdm.pandas()  # Enables the progress_apply calls used further down (progress bar on pandas apply)

In [212]:
def check_authentication(timeout=10):
    """
        Checks whether OMDB answers a request that carries only the configured API key.

        - timeout: seconds to wait for OMDB before giving up
        - Returns True when the HTTP status is 200; False for any other status or when
          the request itself fails (connection error, timeout, ...)
    """
    params = {"apikey": OMDB_API_KEY}

    try:
        # Without a timeout an unresponsive server would block the kernel indefinitely.
        response = requests.get(OMDB_URL, params=params, timeout=timeout)
    except requests.RequestException as error:
        logging.error(f"OMDB - authentication check failed: {error}")
        return False

    return response.status_code == 200

###

# Smoke test: prints True when OMDB answered the keyed request with HTTP 200.
print(check_authentication())

True


In [253]:
def get_data(data_type, original_title, release_year=None, threshold=0.6, timeout=10):
    """
        Calls the OMDB API to retrieve information about a movie/show. If the title received does not match
        the title of the returned movie/show closely enough, the result is not considered the same movie/show.

        - data_type: 'movie' or 'series'
        - original_title: title to look up (sent as the 't' parameter)
        - release_year: optional year filter (sent as the 'y' parameter); None or NaN means no filter
        - threshold: minimum difflib similarity ratio (0-1) between the lower-cased titles
        - timeout: seconds to wait for OMDB before giving up

        Returns the decoded JSON dict on an accepted match, otherwise None (invalid title, request
        failure, non-200 status, non-JSON body, no result, or similarity below the threshold).
    """
    # A missing title (e.g. NaN coming from a DataFrame) cannot be matched; skip it before spending a request.
    if not isinstance(original_title, str) or not original_title.strip():
        logging.warning(f"OMDB - {data_type} - Skipped lookup, invalid title: {original_title!r}")
        return None

    params = {
        "apikey": OMDB_API_KEY,
        "type": data_type,
        "t": original_title,
        "r": "json",
    }
    # Only filter by year when one was actually supplied; a NaN would otherwise be sent literally.
    if release_year is not None and not pd.isna(release_year):
        params["y"] = release_year

    try:
        response = requests.get(OMDB_URL, params=params, timeout=timeout)
    except requests.RequestException as error:
        logging.error(f"OMDB - {data_type} - {original_title} - Request failed: {error}")
        return None

    if response.status_code != 200:
        logging.warning(f"OMDB - {data_type} - {original_title} - Error: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError:
        logging.warning(f"OMDB - {data_type} - {original_title} - Error: response body is not valid JSON")
        return None

    if data.get("Response") != "True":
        logging.info(f"No match found for: '{data_type}' - '{original_title}' - '{release_year}'")
        return None

    # Fall back to an empty title so a payload without "Title" is rejected by similarity instead of crashing.
    data_title = (data.get("Title") or "").lower()
    input_title = original_title.lower()

    similarity = difflib.SequenceMatcher(None, input_title, data_title).ratio()

    if similarity < threshold:
        logging.info(f"Low similarity: '{input_title}' vs '{data_title}' ({similarity:.2f})")
        return None

    logging.info(f"Match acepted: '{input_title}' vs '{data_title}' ({similarity:.2f})")
    return data
    
def get_rating(data_type, title, release_year=None):
    """
        Returns the 'imdbRating' field of the OMDB result for a movie/show.

        - data_type, title, release_year: forwarded unchanged to get_data
        - Returns None when get_data finds no acceptable match or the field is absent
    """
    result = get_data(data_type, title, release_year)

    # Identity check is the idiomatic (PEP 8) way to test for None.
    if result is None:
        return None

    # NOTE(review): the value is returned exactly as OMDB sent it; presumably a string - confirm before numeric use.
    return result.get('imdbRating')
        
###

# Manual spot checks: two rating lookups typed as "movie" and one raw payload typed as "series".
display(
    get_rating("movie", "Guardians of the Galaxy Vol. 2"),
    get_rating("movie", "Breaking Bad"),
    get_data("series", "Breaking"),
)


None

None

None

In [None]:
BASE_DIR = Path.cwd().parent.parent
RAW_DIR = BASE_DIR / "data" / "1_raw"

# Placeholder written into missing dates so that every row can be parsed.
# NOTE(review): such rows end up with year 1111, which the enrichment cells pass on
# to OMDB as the year filter - confirm that this is intended.
MISSING_DATE_PLACEHOLDER = '1111-11-11'

# OMDB has a limit of 1000 API request per day for Free accounts,
# so only this many rows of each dataset are enriched.
ROWS_PER_DATASET = 500

# Join with pathlib so the separators are right on every OS (hard-coded "\\" only works on Windows).
movies_absolute_path = str(RAW_DIR / "TMDB" / "TMDB_top_rated_movies.csv")
shows_absolute_path = str(RAW_DIR / "TMDB" / "TMDB_top_rated_shows.csv")


def _year_from_date(date_text):
    """Returns the year of a 'YYYY-MM-DD' string; raises ValueError on any other format."""
    return datetime.datetime.strptime(date_text, "%Y-%m-%d").year


TMDB_movies = pd.read_csv(movies_absolute_path, sep=';', index_col=0)
TMDB_shows = pd.read_csv(shows_absolute_path, sep=';', index_col=0)

# Fill NAs on date (rebinding instead of inplace=True)
TMDB_movies = TMDB_movies.fillna({"release_date": MISSING_DATE_PLACEHOLDER})
TMDB_shows = TMDB_shows.fillna({"first_air_date": MISSING_DATE_PLACEHOLDER})

# Extract year from the date
TMDB_movies["year"] = TMDB_movies["release_date"].map(_year_from_date)
TMDB_shows["year"] = TMDB_shows["first_air_date"].map(_year_from_date)

# .copy() gives independent frames, so adding the rating column later does not
# write into a slice of the source frames (avoids SettingWithCopyWarning).
movies_enrichment = TMDB_movies[["id", "title_EN", "year"]].head(ROWS_PER_DATASET).copy()
shows_enrichment = TMDB_shows[["id", "title_EN", "year"]].head(ROWS_PER_DATASET).copy()


In [None]:
# Look up the IMDb rating of every selected movie (get_rating is called once per row).
movies_enrichment['imdb_rating'] = movies_enrichment.progress_apply(
    lambda row: get_rating("movie", row["title_EN"], row["year"]),
    axis=1,
)

In [243]:
# Look up the IMDb rating of every selected show (get_rating is called once per row).
shows_enrichment['imdb_rating'] = shows_enrichment.progress_apply(
    lambda row: get_rating("series", row["title_EN"], row["year"]),
    axis=1,
)

100%|██████████| 1000/1000 [01:00<00:00, 16.65it/s]


In [None]:

# Runs both enrichment passes in one cell, waiting a minute between them.
# NOTE(review): this repeats the lookups of the two single-pass cells above; running
# all three cells sends every request twice and counts double against the daily
# OMDB quota - keep either this cell or the two above.
movies_enrichment['imdb_rating'] = movies_enrichment.progress_apply(
    lambda row: get_rating("movie", row["title_EN"], row["year"]),
    axis=1,
)

time.sleep(60)  # pause before starting the second batch of requests

shows_enrichment['imdb_rating'] = shows_enrichment.progress_apply(
    lambda row: get_rating("series", row["title_EN"], row["year"]),
    axis=1,
)

100%|██████████| 1000/1000 [01:00<00:00, 16.59it/s]
100%|██████████| 1000/1000 [04:05<00:00,  4.08it/s]


In [244]:
# Inspect the enriched shows table (id, title_EN, year and the fetched imdb_rating).
shows_enrichment

Unnamed: 0,id,title_EN,year,imdb_rating
0,1396,Breaking Bad,2008,
1,219246,When Life Gives You Tangerines,2025,
2,209867,Frieren: Beyond Journey's End,2023,
3,94605,Arcane,2021,
4,131378,Adventure Time: Fionna & Cake,2023,
...,...,...,...,...
995,1660,I Dream of Jeannie,1965,
996,78204,Sword Art Online Alternative: Gun Gale Online,2018,
997,54671,Penny Dreadful,2014,
998,35279,Austin & Ally,2011,


In [54]:
# List the columns available in the raw movies table.
# NOTE(review): the saved output below shows no 'title_EN' column although the
# enrichment cells select it - the output looks stale; re-run on a fresh kernel to confirm.
display(TMDB_movies.columns)

Index(['adult', 'backdrop_path', 'genre_ids', 'id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'title', 'video', 'vote_average', 'vote_count',
       'origin', 'type'],
      dtype='object')