<a href="https://colab.research.google.com/github/DenisDorokhov1/Course_paper/blob/main/Adding_more_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
import time
import os

# TMDB v3 API key. The TMDB_API_KEY environment variable takes precedence so
# the credential does not have to live in the notebook; the literal is kept
# only as a backward-compatible fallback for existing runs.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY") or "54d65fae01d4e4ceb8ab3f026107fd39"
# Root URL of the TMDB v3 REST API; endpoint paths are appended to it.
BASE_URL = "https://api.themoviedb.org/3"

def get_movie_details_by_id(movie_id, api_key=TMDB_API_KEY):
    """
    Fetch a movie's budget and revenue from TMDB by its movie ID.

    Args:
        movie_id: TMDB movie ID; it is converted with ``int()`` to build the
            URL. A missing value (as detected by ``pd.isna``) short-circuits
            without issuing a request.
        api_key: Key sent as the ``api_key`` query parameter.

    Returns:
        A dictionary ``{"budget": ..., "revenue": ...}`` taken from the JSON
        response (0 for a field the response does not contain), or ``None``
        when the ID is missing or the request fails.
    """
    if pd.isna(movie_id):
        return None

    endpoint = f"{BASE_URL}/movie/{int(movie_id)}"
    params = {
        "api_key": api_key,
        "language": "en-US",
    }

    # Only the network round-trip lives in the try body, so the handler covers
    # exactly the failures it is meant to report.
    try:
        response = requests.get(endpoint, params=params, timeout=10)
        response.raise_for_status()
        details = response.json()
    except requests.exceptions.RequestException as e:
        # f-string so the offending ID and the actual error are interpolated.
        print(f"{movie_id}: {e}")
        return None

    return {
        "budget": details.get("budget", 0),
        "revenue": details.get("revenue", 0),
    }


if __name__ == "__main__":
    # Script entry point: read the movie CSV, fill in budget/revenue from TMDB
    # for rows that do not have both yet, and write the enriched table out.
    csv_file_path = "TMDB_top_rated_movies.csv"

    df = pd.read_csv(csv_file_path)
    print("Исходные колонки:", df.columns.tolist())

    # Create the target columns when absent so the per-row check below can
    # always read them.
    for column in ('budget', 'revenue'):
        if column not in df.columns:
            df[column] = pd.NA

    # One entry per row, in row order. Plain lists are assigned back as whole
    # columns after the loop, which also works when the CSV has no rows.
    budgets = []
    revenues = []

    for _, row in df.iterrows():
        movie_id = row['id']
        movie_title = row.get('title', 'N/A')

        # Rows that already carry non-zero values for both fields are kept
        # unchanged and cost no API call.
        if pd.notna(row['budget']) and row['budget'] != 0 and \
                pd.notna(row['revenue']) and row['revenue'] != 0:
            print(f"{movie_title} (ID: {movie_id})")
            budgets.append(row['budget'])
            revenues.append(row['revenue'])
            continue

        details = get_movie_details_by_id(movie_id)

        if details:
            budgets.append(details['budget'])
            revenues.append(details['revenue'])
            print(f"budget={details['budget']}, revenue={details['revenue']}")
        else:
            # 0 is the value the check above treats as "not filled yet", so a
            # later run will retry this row.
            budgets.append(0)
            revenues.append(0)
            print(f"No detail for'{movie_title}' (ID: {movie_id})")

        # Short pause after each lookup (presumably to stay within the API's
        # rate limits).
        time.sleep(0.1)

    df['budget'] = budgets
    df['revenue'] = revenues

    # Preview only the columns that exist: 'title' is optional in the loop
    # above, so it must not be required here either.
    preview_columns = [
        column for column in ('title', 'id', 'budget', 'revenue')
        if column in df.columns
    ]
    print(df[preview_columns].head())

    output_csv_file_path = "TMDB_top_rated_movies_with_financials_by_id.csv"
    df.to_csv(output_csv_file_path, index=False)

In [None]:
import os
import time

import pandas as pd
import requests

# TMDB v3 API key. The TMDB_API_KEY environment variable takes precedence so
# the credential does not have to live in the notebook; the literal is kept
# only as a backward-compatible fallback for existing runs.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY") or "54d65fae01d4e4ceb8ab3f026107fd39"
# Root URL of the TMDB v3 REST API; endpoint paths are appended to it.
BASE_URL = "https://api.themoviedb.org/3"


def get_movie_details(movie_id):
    """Fetch runtime, genres, release year and original language by TMDB ID.

    Args:
        movie_id: TMDB movie ID; it is converted with ``int()`` to build the
            request URL.

    Returns:
        A dictionary that always has the keys ``"runtime"``, ``"genres"``,
        ``"release_year"`` and ``"original_language"``. On success ``genres``
        is the genre names joined with ``", "`` and ``release_year`` is the
        first four characters of the response's ``release_date`` (``""`` when
        there is no date). Every value is ``None`` when the ID is missing or
        the lookup fails.
    """
    empty_result = {
        "runtime": None,
        "genres": None,
        "release_year": None,
        "original_language": None,
    }

    # A missing ID cannot be turned into a URL; report it and skip the request.
    if pd.isna(movie_id):
        print(f"Error for the film with ID={movie_id}: missing movie ID")
        return empty_result

    try:
        # int() is inside the try so a malformed ID is reported like any other
        # per-movie failure instead of aborting the caller's loop.
        url = f"{BASE_URL}/movie/{int(movie_id)}"
        params = {"api_key": TMDB_API_KEY, "language": "en-US"}

        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # `or` fallbacks cover fields that are present but null, which
        # dict.get's default alone does not.
        genres = [genre['name'] for genre in data.get("genres") or []]
        return {
            "runtime": data.get("runtime", None),
            "genres": ", ".join(genres),
            "release_year": (data.get("release_date") or "")[:4],
            "original_language": data.get("original_language", None),
        }
    except Exception as e:
        # Deliberately broad: this is a best-effort batch lookup, and one bad
        # record should not stop the remaining movies from being processed.
        print(f"Error for the film with ID={movie_id}: {e}")
        return empty_result


if __name__ == "__main__":
    # Script entry point: read the input CSV, look up extra TMDB fields for
    # every row by its 'id', and write the extended table to a new CSV.
    input_path = "dataset_with_inverted_emotions.csv"
    output_path = "movies_with_more_tmdb_data.csv"

    df = pd.read_csv(input_path)

    # Start every new column empty. 'director_name' is created for the output
    # schema, but nothing in this script fills it, so it stays empty.
    for column in ("runtime", "genres", "release_year",
                   "original_language", "director_name"):
        df[column] = pd.NA

    for index, row in df.iterrows():
        movie_id = row['id']
        print(f"\n🔍 Обрабатывается {index + 1}/{len(df)}: ID={movie_id}")

        details = get_movie_details(movie_id)

        df.at[index, "runtime"] = details["runtime"]
        df.at[index, "genres"] = details["genres"]
        df.at[index, "release_year"] = details["release_year"]
        # .get keeps the cell empty if the lookup result does not carry the
        # key at all.
        df.at[index, "original_language"] = details.get("original_language", pd.NA)

        # Short pause after each lookup (presumably to stay within the API's
        # rate limits).
        time.sleep(0.15)

    df.to_csv(output_path, index=False)
