# From the IMDb dataset into the MovieLens 25M dataset

Load the 25M context CSV file (MovieLens movies).

In [2]:
import csv
import json

import pandas as pd

In [3]:
# Relative path from this notebook to the project root; prepended to every path below
file_depth = '../..'
# Project-relative folder from which the IMDb TSV files are read
imdb_dataset = '/dataset/imdb/imdb_original/'

# Parse the JSON configuration of the 25M pipeline into `config`
config_path = file_depth + '/config/data_25m_config.json'
with open(config_path) as config_file:
    config = json.loads(config_file.read())

In [33]:
# Path of the MovieLens movies CSV comes from the config (relative to the project root)
original_csv_movies_data = config['original_csv_movies_data']
movies_csv_path = file_depth + original_csv_movies_data

movielens_movies = pd.read_csv(movies_csv_path, encoding="UTF-8")
movielens_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


Create movieYear column from title column

In [35]:
# In the titles shown above the release year is the trailing "(YYYY)" group.
# Take the LAST parenthesised 4-digit group rather than the first, so an
# earlier "(dddd)" inside the title or an alternate title is not mistaken for
# the year. Titles without any such group get NaN.
movielens_movies['movieYear'] = (
    movielens_movies['title'].str.findall(r'\((\d{4})\)').str[-1]
)
movielens_movies

Unnamed: 0,movieId,title,genres,movieYear
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...
62418,209157,We (2018),Drama,2018
62419,209159,Window of the Soul (2001),Documentary,2001
62420,209163,Bad Poems (2018),Comedy|Drama,2018
62421,209169,A Girl Thing (2001),(no genres listed),2001


In [36]:
# Path of the MovieLens links CSV comes from the config (relative to the project root)
original_csv_links_data = config['original_csv_links_data']
links_csv_path = file_depth + original_csv_links_data

# Read the external ids as strings so leading zeros (e.g. 0114709) are preserved
links_dtypes = {'imdbId': str, 'tmdbId': str}
movielens_links = pd.read_csv(links_csv_path, encoding="UTF-8", dtype=links_dtypes)
movielens_links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,0114709,862
1,2,0113497,8844
2,3,0113228,15602
3,4,0114885,31357
4,5,0113041,11862
...,...,...,...
62418,209157,6671244,499546
62419,209159,0297986,63407
62420,209163,6755366,553036
62421,209169,0249603,162892


Merge them on movieId to get the corresponding imdbId.

In [37]:
# Left join: every MovieLens movie is kept, with imdbId attached where a link exists
imdb_id_lookup = movielens_links[['movieId', 'imdbId']]
movielens_movies = movielens_movies.merge(imdb_id_lookup, how='left', on='movieId')
movielens_movies

Unnamed: 0,movieId,title,genres,movieYear,imdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,0114709
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,0113497
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,0113228
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,0114885
4,5,Father of the Bride Part II (1995),Comedy,1995,0113041
...,...,...,...,...,...
62418,209157,We (2018),Drama,2018,6671244
62419,209159,Window of the Soul (2001),Documentary,2001,0297986
62420,209163,Bad Poems (2018),Comedy|Drama,2018,6755366
62421,209169,A Girl Thing (2001),(no genres listed),2001,0249603


Prefix imdbId with 'tt' to match the IMDb identifier format (tconst).

In [38]:
# Add the 'tt' prefix only where it is needed:
#  - missing ids (possible after the left join) stay NaN instead of becoming 'ttnan'
#  - ids that already start with 'tt' are left alone, so re-running this cell
#    does not produce 'tttt...'
imdb_ids = movielens_movies['imdbId']
needs_prefix = imdb_ids.notna() & ~imdb_ids.astype(str).str.startswith('tt')
movielens_movies.loc[needs_prefix, 'imdbId'] = 'tt' + imdb_ids[needs_prefix].astype(str)
movielens_movies

Unnamed: 0,movieId,title,genres,movieYear,imdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,tt0114709
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,tt0113497
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,tt0113228
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,tt0114885
4,5,Father of the Bride Part II (1995),Comedy,1995,tt0113041
...,...,...,...,...,...
62418,209157,We (2018),Drama,2018,tt6671244
62419,209159,Window of the Soul (2001),Documentary,2001,tt0297986
62420,209163,Bad Poems (2018),Comedy|Drama,2018,tt6755366
62421,209169,A Girl Thing (2001),(no genres listed),2001,tt0249603


In [39]:
# IMDb TSV files are plain tab-separated text without quoting. With pandas'
# default quote handling, a title that starts with a double quote opens a
# "quoted field" and can swallow the following rows, so quoting is disabled.
# The listed columns are read as strings because they contain the '\N'
# placeholder alongside numeric values.
imdb_basics = pd.read_csv(
    file_depth + imdb_dataset + 'title.basics.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    dtype={'isAdult': str, 'startYear': str, 'runtimeMinutes': str, 'endYear': str},
)
imdb_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10570353,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10570354,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10570355,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10570356,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


Merge the movies data from the MovieLens 25M dataset with the IMDb movie/series dataset on imdbId, then remove the columns tconst, originalTitle and endYear.

In [40]:
# Inner join on the IMDb id: only movies present in both datasets survive.
# Both inputs have a `genres` column, so pandas suffixes them with _x / _y;
# they are renamed to say which dataset each one came from.
merged_movies = (
    movielens_movies
    .merge(imdb_basics, how='inner', left_on='imdbId', right_on='tconst')
    .drop(columns=['tconst', 'originalTitle', 'endYear'])
    .rename(columns={'genres_x': 'genresMovieLens', 'genres_y': 'genresImdb'})
)
merged_movies

Unnamed: 0,movieId,title,genresMovieLens,movieYear,imdbId,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genresImdb
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,tt0114709,movie,Toy Story,0,1995,81,"Adventure,Animation,Comedy"
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,tt0113497,movie,Jumanji,0,1995,104,"Adventure,Comedy,Family"
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,tt0113228,movie,Grumpier Old Men,0,1995,101,"Comedy,Romance"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,tt0114885,movie,Waiting to Exhale,0,1995,124,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II (1995),Comedy,1995,tt0113041,movie,Father of the Bride Part II,0,1995,106,"Comedy,Family,Romance"
...,...,...,...,...,...,...,...,...,...,...,...
62307,209157,We (2018),Drama,2018,tt6671244,movie,Wij,0,2018,100,Drama
62308,209159,Window of the Soul (2001),Documentary,2001,tt0297986,movie,Window of the Soul,0,2001,73,Documentary
62309,209163,Bad Poems (2018),Comedy|Drama,2018,tt6755366,movie,Bad Poems,0,2018,97,"Comedy,Drama"
62310,209169,A Girl Thing (2001),(no genres listed),2001,tt0249603,tvMovie,A Girl Thing,0,2001,237,"Comedy,Drama,Romance"


In [41]:
# Directors and writers per title (comma-separated nconst ids, '\N' when absent)
crew_tsv_path = file_depth + imdb_dataset + 'title.crew.tsv'
imdb_crew = pd.read_csv(crew_tsv_path, sep='\t')
imdb_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N
...,...,...,...
10570353,tt9916848,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10570354,tt9916850,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
10570355,tt9916852,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10570356,tt9916856,nm10538645,nm6951431


Merge the movies data with the directors and writers information from the IMDb dataset on imdbId, then remove the tconst column.

In [42]:
# Inner join with the crew table on the IMDb id; the duplicate key column
# `tconst` is dropped afterwards since `imdbId` carries the same value.
merged_movies_with_crew = (
    merged_movies
    .merge(imdb_crew, how='inner', left_on='imdbId', right_on='tconst')
    .drop(columns=['tconst'])
)
merged_movies_with_crew

Unnamed: 0,movieId,title,genresMovieLens,movieYear,imdbId,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genresImdb,directors,writers
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,tt0114709,movie,Toy Story,0,1995,81,"Adventure,Animation,Comedy",nm0005124,"nm0005124,nm0230032,nm0004056,nm0710020,nm0923..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,tt0113497,movie,Jumanji,0,1995,104,"Adventure,Comedy,Family",nm0002653,"nm0378144,nm0852430,nm0833164,nm0885575"
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,tt0113228,movie,Grumpier Old Men,0,1995,101,"Comedy,Romance",nm0222043,nm0425756
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,tt0114885,movie,Waiting to Exhale,0,1995,124,"Comedy,Drama,Romance",nm0001845,"nm0573334,nm0060103"
4,5,Father of the Bride Part II (1995),Comedy,1995,tt0113041,movie,Father of the Bride Part II,0,1995,106,"Comedy,Family,Romance",nm0796124,"nm0352443,nm0329304,nm0583600,nm0796124"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62307,209157,We (2018),Drama,2018,tt6671244,movie,Wij,0,2018,100,Drama,nm1415482,"nm1415482,nm1818234"
62308,209159,Window of the Soul (2001),Documentary,2001,tt0297986,movie,Window of the Soul,0,2001,73,Documentary,"nm0142504,nm1065588","nm0142504,nm1065588"
62309,209163,Bad Poems (2018),Comedy|Drama,2018,tt6755366,movie,Bad Poems,0,2018,97,"Comedy,Drama",nm2520391,nm2520391
62310,209169,A Girl Thing (2001),(no genres listed),2001,tt0249603,tvMovie,A Girl Thing,0,2001,237,"Comedy,Drama,Romance",nm0003022,nm0003022


--------------------

Load name.basics and keep only rows where primaryProfession contains 'actor' or 'actress'

In [25]:
# IMDb TSV files are plain tab-separated text without quoting; disable pandas'
# quote handling so a name starting with a double quote cannot swallow the
# rows that follow it.
imdb_names = pd.read_csv(
    file_depth + imdb_dataset + 'name.basics.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
)
# Keep people whose primaryProfession mentions 'actor' or 'actress';
# rows with a missing profession are dropped (na=False).
imdb_names = imdb_names[imdb_names['primaryProfession'].str.contains('actor|actress', na=False)]
imdb_names

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0027125,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0037382,tt0075213,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0049189,tt0054452,tt0056404,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0078723,tt0072562,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0069467,tt0083922,tt0050986,tt0050976"
...,...,...,...,...,...,...
13275882,nm9993698,Sebi John,\N,\N,actor,tt8736744
13275883,nm9993699,Dani Jacob,\N,\N,actor,tt8736744
13275884,nm9993700,Sexy Angel,\N,\N,actress,tt7523066
13275885,nm9993701,Sanjai Kuriakose,\N,\N,actor,tt8736744


Keep only the rows where at least one imdbId listed in knownForTitles appears in the list of imdbId values from the 25M context CSV.

In [29]:
# Build the lookup once. Set membership is O(1), whereas testing against the
# `.values` array rescans every movie id for every title of every person.
movielens_imdb_ids = set(merged_movies_with_crew['imdbId'])


def has_matching_imdb_id(known_for_titles):
    """Return True if at least one id in `known_for_titles` is a known movie id.

    Parameters
    ----------
    known_for_titles : object
        Value of the `knownForTitles` column: a comma-separated string of
        IMDb title ids. It is passed through `str()` first, so a missing
        value becomes the string 'nan', which matches nothing.

    Returns
    -------
    bool
        True when any of the listed ids is present in
        `merged_movies_with_crew['imdbId']`.
    """
    titles_list = str(known_for_titles).split(',')
    return any(title in movielens_imdb_ids for title in titles_list)


filtered_imdb_names_for_actor = imdb_names[imdb_names['knownForTitles'].apply(has_matching_imdb_id)]

In [49]:
# Display the people kept by the knownForTitles filter
filtered_imdb_names_for_actor

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0027125,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0037382,tt0075213,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0049189,tt0054452,tt0056404,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0078723,tt0072562,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0069467,tt0083922,tt0050986,tt0050976"
...,...,...,...,...,...,...
13275673,nm9993462,Joseph Isaac Rowan,\N,\N,actor,"tt5162668,tt6527426"
13275674,nm9993463,Matthew Tidmore,\N,\N,actor,tt6527426
13275675,nm9993464,Babu Ravi,\N,\N,actor,tt6527426
13275760,nm9993558,Ádám Szabó,\N,\N,"stunts,actor","tt5180504,tt29538571,tt4978420,tt7984766"


In [30]:
# Persist the filtered names under the output folder configured as `new_context_path`
new_context_path = config['new_context_path']
filtered_names_csv = file_depth + new_context_path + 'filtered_imdb_names_for_actor_any_match.csv'
filtered_imdb_names_for_actor.to_csv(filtered_names_csv, index=False)

For every movie, get the list of (mostly) actors and actresses from the name.basics IMDb dataset.

In [46]:
# Build, for every IMDb title id, the comma-separated list of people (nconst)
# who list that title in `knownForTitles`, then attach it to the movies.
#
# This replaces a row-by-row loop that rebuilt a boolean mask over all movies
# for every (person, title) pair. Its "already written" check compared the
# nconst with the whole cell string, so it never detected duplicates.
known_for = (
    filtered_imdb_names_for_actor[['nconst', 'knownForTitles']]
    .astype(str)
    # one row per (person, title) pair
    .assign(imdbId=lambda names: names['knownForTitles'].str.split(','))
    .explode('imdbId')
    # a person is listed at most once per title
    .drop_duplicates(subset=['nconst', 'imdbId'])
)

# sort=False is enough: rows inside each group keep the order of the names
# table, so ids are joined in the same order the loop would have appended them.
actors_by_title = known_for.groupby('imdbId', sort=False)['nconst'].agg(','.join)

# Movies nobody is "known for" get an empty string, as before.
merged_movies_with_crew['actor'] = (
    merged_movies_with_crew['imdbId'].map(actors_by_title).fillna('')
)


In [None]:
# Movies without any actor hold an empty string, not NaN, so `isna()` would
# always report 0. Count empty strings instead (NaN is treated as empty too).
empty_actor_rows = merged_movies_with_crew['actor'].fillna('').eq('').sum()
print('Number of rows with empty actor column : ', empty_actor_rows)
merged_movies_with_crew

Number of rows with empty actor column :  0


Unnamed: 0,movieId,title,genresMovieLens,movieYear,imdbId,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genresImdb,directors,writers,actor
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,tt0114709,movie,Toy Story,0,1995,81,"Adventure,Animation,Comedy",nm0005124,"nm0005124,nm0230032,nm0004056,nm0710020,nm0923...","nm0000741,nm0001652,nm0001815,nm0029460,nm0039..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,tt0113497,movie,Jumanji,0,1995,104,"Adventure,Comedy,Family",nm0002653,"nm0378144,nm0852430,nm0833164,nm0885575","nm0001372,nm0001564,nm0002123,nm0003742,nm0051..."
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,tt0113228,movie,Grumpier Old Men,0,1995,101,"Comedy,Romance",nm0222043,nm0425756,"nm0025908,nm0132587,nm0153170,nm0160648,nm0263..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,tt0114885,movie,Waiting to Exhale,0,1995,124,"Comedy,Drama,Romance",nm0001845,"nm0573334,nm0060103","nm0001365,nm0005375,nm0119698,nm0171213,nm0176..."
4,5,Father of the Bride Part II (1995),Comedy,1995,tt0113041,movie,Father of the Bride Part II,0,1995,106,"Comedy,Family,Romance",nm0796124,"nm0352443,nm0329304,nm0583600,nm0796124","nm0003028,nm0015233,nm0030926,nm0042963,nm0056..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62307,209157,We (2018),Drama,2018,tt6671244,movie,Wij,0,2018,100,Drama,nm1415482,"nm1415482,nm1818234","nm10312234,nm11039405,nm1963018,nm3760035,nm42..."
62308,209159,Window of the Soul (2001),Documentary,2001,tt0297986,movie,Window of the Soul,0,2001,73,Documentary,"nm0142504,nm1065588","nm0142504,nm1065588",nm1546270
62309,209163,Bad Poems (2018),Comedy|Drama,2018,tt6755366,movie,Bad Poems,0,2018,97,"Comedy,Drama",nm2520391,nm2520391,"nm0468514,nm0598406,nm10676258,nm1387171,nm158..."
62310,209169,A Girl Thing (2001),(no genres listed),2001,tt0249603,tvMovie,A Girl Thing,0,2001,237,"Comedy,Drama,Romance",nm0003022,nm0003022,"nm0121700,nm0453858,nm0841512"


In [None]:
# Write the enriched movie table to the output folder configured as `new_context_path`
new_context_path = config['new_context_path']
merged_context_csv = file_depth + new_context_path + 'merged_movies_with_context.csv'
merged_movies_with_crew.to_csv(merged_context_csv, index=False)