# Libs and reading data

In [14]:
import pandas as pd
movies = pd.read_csv("movies-database/ml-25m/movies.csv")
links = pd.read_csv("movies-database/ml-25m/links.csv")

# Exploring movies.csv

In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
len(movies)

62423

In [4]:
movies.nunique()

movieId    62423
title      62325
genres      1639
dtype: int64

In [15]:
# Show rows where both title and genres are duplicated
movies[movies.duplicated(subset=['title', 'genres'], keep=False)].sort_values(by='title')


Unnamed: 0,movieId,title,genres
20079,104035,Beneath (2013),Horror
22611,115777,Beneath (2013),Horror
55360,191775,Berlin Calling (2008),Comedy|Drama
13084,66511,Berlin Calling (2008),Comedy|Drama
20106,104155,Clear History (2013),Comedy
25081,122940,Clear History (2013),Comedy
54103,188823,Delirium (2018),Horror|Thriller
56748,194845,Delirium (2018),Horror|Thriller
50726,181655,Detour (2017),Thriller
44694,168774,Detour (2017),Thriller


In [6]:
links.nunique()

movieId    62423
imdbId     62423
tmdbId     62281
dtype: int64

In [16]:
movies_with_links = pd.merge(movies, links, on='movieId', how='left')

In [17]:
movies_with_links.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [18]:
movies_with_links['tmdbId'] = movies_with_links['tmdbId'].astype('Int64')
movies_with_links.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357
4,5,Father of the Bride Part II (1995),Comedy,113041,11862


In [21]:
movies_with_links[movies_with_links.duplicated(subset=['imdbId'], keep=False)].sort_values(by='tmdbId')


Unnamed: 0,movieId,title,genres,imdbId,tmdbId


In [20]:
movies_with_links[movies_with_links.duplicated(subset=['title', 'genres'], keep=False)].sort_values(by='title')

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
20079,104035,Beneath (2013),Horror,2325518,191619.0
22611,115777,Beneath (2013),Horror,2991296,257874.0
55360,191775,Berlin Calling (2008),Comedy|Drama,211946,30508.0
13084,66511,Berlin Calling (2008),Comedy|Drama,1213019,30508.0
20106,104155,Clear History (2013),Comedy,2380408,
25081,122940,Clear History (2013),Comedy,2279864,133790.0
54103,188823,Delirium (2018),Horror|Thriller,2069797,340601.0
56748,194845,Delirium (2018),Horror|Thriller,3131050,401732.0
50726,181655,Detour (2017),Thriller,5845946,464858.0
44694,168774,Detour (2017),Thriller,4372390,356500.0


In [31]:
movies_with_links['imdb_id_str'] = movies_with_links['imdbId'].apply(lambda x: f"tt{int(x):07d}")
from dotenv import load_dotenv
import os
import requests
load_dotenv()  

api_key = os.getenv("API_KEY")
auth_token = os.getenv("AUTH_TOKEN")

In [None]:


headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {auth_token}"
}

def get_tmdb_id(imdb_id_str):
    url = f"https://api.themoviedb.org/3/find/{imdb_id_str}?external_source=imdb_id"
    print(f"Processing IMDb ID: {imdb_id_str} ...", end=' ')
    try:
        response = requests.get(url, headers=headers)
        data = response.json()
        if data['movie_results']:
            tmdb_id = data['movie_results'][0]['id']
            print(f"✔ Found TMDB ID: {tmdb_id}")
            return tmdb_id
        else:
            print("✘ No movie results found.")
    except Exception as e:
        print(f"⚠️ Error: {e}")
    return None


# Apply to every row
movies_with_links['tmdbId'] = movies_with_links['imdb_id_str'].apply(get_tmdb_id)

# Ensure tmdbId column has nullable integer type
movies_with_links['tmdbId'] = movies_with_links['tmdbId'].astype('Int64')

# Show first few rows
print(movies_with_links.head())


Processing IMDb ID: tt0114709 ... ✔ Found TMDB ID: 862
Processing IMDb ID: tt0113497 ... ✔ Found TMDB ID: 8844
Processing IMDb ID: tt0113228 ... ✔ Found TMDB ID: 15602
Processing IMDb ID: tt0114885 ... ✔ Found TMDB ID: 31357
Processing IMDb ID: tt0113041 ... ✔ Found TMDB ID: 11862
Processing IMDb ID: tt0113277 ... ✔ Found TMDB ID: 949
Processing IMDb ID: tt0114319 ... ✔ Found TMDB ID: 11860
Processing IMDb ID: tt0112302 ... ✔ Found TMDB ID: 45325
Processing IMDb ID: tt0114576 ... ✔ Found TMDB ID: 9091
Processing IMDb ID: tt0113189 ... ✔ Found TMDB ID: 710
Processing IMDb ID: tt0112346 ... ✔ Found TMDB ID: 9087
Processing IMDb ID: tt0112896 ... ✔ Found TMDB ID: 12110
Processing IMDb ID: tt0112453 ... ✔ Found TMDB ID: 21032
Processing IMDb ID: tt0113987 ... ✔ Found TMDB ID: 10858
Processing IMDb ID: tt0112760 ... ✔ Found TMDB ID: 1408
Processing IMDb ID: tt0112641 ... ✔ Found TMDB ID: 524
Processing IMDb ID: tt0114388 ... ✔ Found TMDB ID: 4584
Processing IMDb ID: tt0113101 ... ✔ Found TM

In [41]:
movies_with_links['tmdbId_original']=links['tmdbId'].astype('Int64')
movies_with_links.head()


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,imdbId_str,imdb_id_str,tmdbId_original
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862,114709,tt0114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844,113497,tt0113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602,113228,tt0113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357,114885,tt0114885,31357
4,5,Father of the Bride Part II (1995),Comedy,113041,11862,113041,tt0113041,11862


In [42]:
movies_with_links[movies_with_links['tmdbId'] != movies_with_links['tmdbId_original']]

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,imdbId_str,imdb_id_str,tmdbId_original
708,723,Two Friends (1986),Drama,92123,123067,92123,tt0092123,131232
845,864,"Wife, The (1995)",Comedy|Drama,114936,83720,114936,tt0114936,132641
874,895,Venice/Venice (1992),Drama,105729,293794,105729,tt0105729,79782
1087,1115,Sleepover (1995),Drama,114472,277102,114472,tt0114472,141210
1480,1533,"Promise, The (La promesse) (1996)",Drama,117398,24183,117398,tt0117398,105045
...,...,...,...,...,...,...,...,...
61549,206198,The Night Bulletin,Horror|Thriller,8997180,621737,8997180,tt8997180,574905
61629,206409,Royal Shakespeare Company: The Tempest,(no genres listed),6848114,435588,6848114,tt6848114,611478
61732,206757,Twice,(no genres listed),1931497,495811,1931497,tt1931497,577625
61924,207281,Classic Albums: Elvis Presley - Elvis Presley ...,Documentary,923636,790073,923636,tt0923636,92246


In [45]:
movies_with_links[movies_with_links['tmdbId'].isna()][['imdb_id_str', 'tmdbId','tmdbId_original',"title"]].sort_values(by='tmdbId')



Unnamed: 0,imdb_id_str,tmdbId,tmdbId_original,title
596,tt0115978,,538286,Criminals (1996)
705,tt0118114,,503475,Wallace & Gromit: The Best of Aardman Animatio...
715,tt0125877,,,Low Life (1994)
754,tt0038426,,,Costa Brava (1946)
1080,tt0102336,,,Loser (1991)
...,...,...,...,...
61826,tt11043546,,630656,The Death and Return of Superman (2019)
61853,tt5028698,,360086,Walt Disney (2015)
61934,tt3680476,,210590,Pingu's The Thing (2012)
61951,tt10987544,,631997,The Forest of Love (2019)


In [2]:
import pandas as pd

movies_with_links = pd.read_csv("movies_with_links.csv")

movies_with_links.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,imdbId_str,imdb_id_str
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,114709,tt0114709
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,113497,tt0113497
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,113228,tt0113228
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,114885,tt0114885
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,113041,tt0113041


In [3]:
movies_with_links[movies_with_links['tmdbId'].isna()].sort_values(by='tmdbId')

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,imdbId_str,imdb_id_str
596,604,Criminals (1996),Documentary,115978,,115978,tt0115978
705,720,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,118114,,118114,tt0118114
715,730,Low Life (1994),Drama,125877,,125877,tt0125877
754,770,Costa Brava (1946),Drama,38426,,38426,tt0038426
1080,1107,Loser (1991),Comedy,102336,,102336,tt0102336
...,...,...,...,...,...,...,...
61826,207027,The Death and Return of Superman (2019),Action|Adventure|Animation,11043546,,11043546,tt11043546
61853,207093,Walt Disney (2015),Documentary,5028698,,5028698,tt5028698
61934,207307,Pingu's The Thing (2012),Animation|Comedy|Horror|Sci-Fi,3680476,,3680476,tt3680476
61951,207343,The Forest of Love (2019),Drama|Horror|Mystery|Thriller,10987544,,10987544,tt10987544


In [None]:
movies = pd.read_csv("movies-database/ml-25m/movies.csv")
links = pd.read_csv("movies-database/ml-25m/links.csv")



movies = pd.merge(movies, links, on='movieId', how='left')
movies_with_links[movies_with_links['tmdbId'].isna()].sort_values(by='tmdbId')

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0
...,...,...,...,...,...
62418,209157,We (2018),Drama,6671244,499546.0
62419,209159,Window of the Soul (2001),Documentary,297986,63407.0
62420,209163,Bad Poems (2018),Comedy|Drama,6755366,553036.0
62421,209169,A Girl Thing (2001),(no genres listed),249603,162892.0


In [12]:
# Assume 'links' is the original DataFrame with complete tmdbId info

# Filter movies_with_links where tmdbId is missing
missing_in_mwl = movies_with_links[movies_with_links['tmdbId'].isna()]

# Filter links where tmdbId is not missing
non_missing_in_links = links[links['tmdbId'].notna()]

# Merge on movieId to find those with missing in one but present in the other
restored_tmdb = pd.merge(missing_in_mwl[['movieId','title']], non_missing_in_links, on='movieId')

restored_tmdb


Unnamed: 0,movieId,title,imdbId,tmdbId
0,604,Criminals (1996),115978,538286.0
1,720,Wallace & Gromit: The Best of Aardman Animatio...,118114,503475.0
2,2510,Just the Ticket (1999),134948,26425.0
3,2999,Man of the Century (1999),154827,98480.0
4,4241,Pokémon 3: The Movie (2001),266860,10991.0
...,...,...,...,...
623,207027,The Death and Return of Superman (2019),11043546,630656.0
624,207093,Walt Disney (2015),5028698,360086.0
625,207307,Pingu's The Thing (2012),3680476,210590.0
626,207343,The Forest of Love (2019),10987544,631997.0


In [13]:
merged = movies_with_links.merge(
    links[['movieId', 'tmdbId']], 
    on='movieId', 
    how='left', 
    suffixes=('', '_correct')
)
merged['tmdbId'] = merged['tmdbId'].fillna(merged['tmdbId_correct'])
merged.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,imdbId_str,imdb_id_str,tmdbId_correct
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,114709,tt0114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,113497,tt0113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,113228,tt0113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,114885,tt0114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,113041,tt0113041,11862.0


In [4]:
from dotenv import load_dotenv
import os
import requests
load_dotenv()  

api_key = os.getenv("API_KEY")
auth_token = os.getenv("AUTH_TOKEN")

In [None]:
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {auth_token}"
}

def get_tmdb_id(imdb_id_str):
    url = f"https://api.themoviedb.org/3/find/{imdb_id_str}?external_source=imdb_id"
    print(f"Processing IMDb ID: {imdb_id_str} ...", end=' ')
    try:
        response = requests.get(url, headers=headers)
        data = response.json()
        if data['movie_results']:
            tmdb_id = data['movie_results'][0]['id']
            print(f"✔ Found TMDB ID: {tmdb_id}")
            return tmdb_id
        else:
            print("✘ No movie results found.")
    except Exception as e:
        print(f"⚠️ Error: {e}")
    return None


# Apply to every row
movies_with_links['tmdbId'] = movies_with_links['imdb_id_str'].apply(get_tmdb_id)

# Ensure tmdbId column has nullable integer type
movies_with_links['tmdbId'] = movies_with_links['tmdbId'].astype('Int64')

# Show first few rows
print(movies_with_links.head())
