In [1]:
import pandas as pd
from ast import literal_eval

In [2]:
movies_id = pd.read_csv("../data/movie_ids.csv")

In [3]:
movies_id.shape

(916746, 6)

In [4]:
len(set(movies_id["id"].to_list()))

916746

In [87]:
data = pd.read_csv("../data/async_movie_db_300k_700k.csv")

In [88]:
data.shape

(401602, 28)

In [89]:
na_indices = data[data["id"].isna() | data["title"].isna()].index

In [90]:
na_indices[0:5], len(na_indices)

(Index([1973, 1974, 2724, 2725, 2726], dtype='int64'), 71893)

In [91]:
clean_data = data.drop(axis=1, index=na_indices)

In [92]:
clean_data = clean_data.drop_duplicates(subset=["id", "overview"])

In [93]:
clean_data.shape, clean_data.columns

((329708, 28),
 Index(['Unnamed: 0', 'adult', 'backdrop_path', 'belongs_to_collection',
        'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language',
        'original_title', 'overview', 'popularity', 'poster_path',
        'production_companies', 'production_countries', 'release_date',
        'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
        'video', 'vote_average', 'vote_count', 'keywords', 'credits'],
       dtype='object'))

In [94]:
main_data = clean_data[["id", "title", "overview", "credits", "keywords", "genres"]]

In [95]:
main_data.head()

Unnamed: 0,id,title,overview,credits,keywords,genres
0,442113.0,I'm Okay,An anti-romantic comedy where we watch a coupl...,"{'cast': [], 'crew': [{'adult': False, 'gender...",{'keywords': []},[]
2,442114.0,Smoke,,"{'cast': [], 'crew': [{'adult': False, 'gender...",{'keywords': []},[]
3,442115.0,What's Happening at Local 70,Striking workers in one Chicago unemployment c...,"{'cast': [], 'crew': [{'adult': False, 'gender...",{'keywords': []},"[{'id': 99, 'name': 'Documentary'}]"
4,442116.0,Big Boss,Conflicts between power-hungry triad gangsters...,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",{'keywords': []},"[{'id': 28, 'name': 'Action'}]"
5,442117.0,Pulse,A gay disabled teenage boy changes into the bo...,"{'cast': [{'adult': False, 'gender': 2, 'id': ...","{'keywords': [{'id': 158718, 'name': 'lgbt'}, ...","[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name..."


In [96]:
main_data["keywords"].fillna('{"keywords":[]}', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  main_data["keywords"].fillna('{"keywords":[]}', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data["keywords"].fillna('{"keywords":[]}', inplace=True)


In [97]:
main_data["credits"].fillna('{"cast":[], "crew":[]}', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  main_data["credits"].fillna('{"cast":[], "crew":[]}', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data["credits"].fillna('{"cast":[], "crew":[]}', inplace=True)


In [98]:
main_data["genres"].fillna('[]', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  main_data["genres"].fillna('[]', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data["genres"].fillna('[]', inplace=True)


In [99]:
columns_to_change = ["keywords", "credits", "genres"]
for c in columns_to_change:
    print(c)
    main_data[c] = main_data[c].apply(literal_eval)

keywords


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data[c] = main_data[c].apply(literal_eval)


credits
genres


In [100]:
main_data.head()

Unnamed: 0,id,title,overview,credits,keywords,genres
0,442113.0,I'm Okay,An anti-romantic comedy where we watch a coupl...,"{'cast': [], 'crew': [{'adult': False, 'gender...",{'keywords': []},[]
2,442114.0,Smoke,,"{'cast': [], 'crew': [{'adult': False, 'gender...",{'keywords': []},[]
3,442115.0,What's Happening at Local 70,Striking workers in one Chicago unemployment c...,"{'cast': [], 'crew': [{'adult': False, 'gender...",{'keywords': []},"[{'id': 99, 'name': 'Documentary'}]"
4,442116.0,Big Boss,Conflicts between power-hungry triad gangsters...,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",{'keywords': []},"[{'id': 28, 'name': 'Action'}]"
5,442117.0,Pulse,A gay disabled teenage boy changes into the bo...,"{'cast': [{'adult': False, 'gender': 2, 'id': ...","{'keywords': [{'id': 158718, 'name': 'lgbt'}, ...","[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name..."


In [71]:
def extract_keywords(k):
    
    if "keywords" not in k or len(k["keywords"]) == 0:
        return None

    return [x["name"] for x in k["keywords"]]

In [72]:
def extract_genres(g):
    if not g:
        return None
    return [x["name"] for x in g] 

In [45]:
def extract_directors(c):
    if "Director" not in c:
        return None
    directors = sorted([(x["popularity"], x["name"]) for x in c["crew"] if x["job"] == "Director"], reverse=True)
    return [d[1] for d in directors] if directors else None
    

In [75]:
def extract_cast(c):
    if "cast" not in c:
        return None
    cast= sorted([(x["popularity"], x["name"]) for x in c["cast"] if "name" in x], reverse=True)
    return [ca[1] for ca in cast] if cast else None

In [101]:
main_data["keywords"] = main_data["keywords"].apply(extract_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data["keywords"] = main_data["keywords"].apply(extract_keywords)


In [104]:
main_data["cast"] = main_data["credits"].apply(extract_cast)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data["cast"] = main_data["credits"].apply(extract_cast)


In [105]:
main_data["director"] = main_data["credits"].apply(extract_directors)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data["director"] = main_data["credits"].apply(extract_directors)


In [106]:
main_data["genres"] = main_data["genres"].apply(extract_genres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data["genres"] = main_data["genres"].apply(extract_genres)


In [108]:
main_data = main_data.drop("credits", axis=1)

In [111]:
main_data.head(10)

Unnamed: 0,id,title,overview,keywords,genres,cast,director
0,442113.0,I'm Okay,An anti-romantic comedy where we watch a coupl...,,,,
2,442114.0,Smoke,,,,,
3,442115.0,What's Happening at Local 70,Striking workers in one Chicago unemployment c...,,[Documentary],,
4,442116.0,Big Boss,Conflicts between power-hungry triad gangsters...,,[Action],"[Ku Feng, Chan Sing, Danny Lee Sau-Yin, Chen H...",
5,442117.0,Pulse,A gay disabled teenage boy changes into the bo...,"[lgbt, gay theme]","[Drama, Fantasy, Science Fiction]","[Josephine Langford, Caroline Brazier, Lee Jan...",
6,442119.0,Paradise for Two,A chorus girl is mistaken for a millionaire's ...,,[Comedy],"[Roland Culver, Finlay Currie, Martita Hunt, G...",
7,442121.0,Sowbhagyam,"Sowbhagyam is a 1993 Indian Malayalam film, di...",,,"[Jagathy Sreekumar, Jagadish, Rajan P Dev, Kut...",
8,442122.0,Noa Noa,The Survivors of the Bounty,"[mutiny, beach]",[Drama],"[Ines Pellegrini, Paolo Malco, Isabelle De Val...",
9,442123.0,Vai col liscio,Federico Altarini is a Dancer of Southern Ital...,[dancing],[Comedy],"[Valeria Fabrizi, Janet Ågren, Maurizio Arena,...",
10,442124.0,Yana+Yanko,Yana Titova is a very successful businesswoman...,"[orphanage, editor-in-chief, fashion magazine,...","[Family, Comedy]","[Tinatin Dalakishvili, Artur Smolyaninov, Maks...",


In [112]:
final_df = pd.concat([main_data, clean_data["popularity"]], axis=1)

In [113]:
final_df.shape

(329708, 8)

In [114]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 329708 entries, 0 to 401601
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          329708 non-null  float64
 1   title       329708 non-null  object 
 2   overview    264435 non-null  object 
 3   keywords    67679 non-null   object 
 4   genres      222029 non-null  object 
 5   cast        205257 non-null  object 
 6   director    0 non-null       object 
 7   popularity  329708 non-null  object 
dtypes: float64(1), object(7)
memory usage: 22.6+ MB


In [115]:
final_df.to_csv("../data/clean_movies_300_700K.csv")