In [1]:
#Imports
import ast
import os

import pandas as pd

In [2]:
#Global Variables
# Raw input: folder and the credits dump read by this notebook
DATA_PATH = '../data/'
CREDITS_FILE = f'{DATA_PATH}credits.csv'

# Cleaned output: folder and one CSV target per extracted table
CLEAN_DATA_PATH = '../clean_data/'
ACTORS_FILE = f'{CLEAN_DATA_PATH}actors.csv'
CREW_FILE = f'{CLEAN_DATA_PATH}crew.csv'
DIRECTORS_FILE = f'{CLEAN_DATA_PATH}directors.csv'
PEOPLE_FILE = f'{CLEAN_DATA_PATH}people.csv'

# EXTRACTING ALL ACTORS AND CREW FROM CREDITS FILE

In [3]:
#Load the raw credits CSV into a DataFrame
credits_df = pd.read_csv(filepath_or_buffer=CREDITS_FILE)

In [4]:
#Summary (columns, non-null counts, dtypes) of the credits frame as loaded, before cleaning
credits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [5]:
#Cleaning df: discard rows containing any missing value, then keep a single row per movie id
credits_df = credits_df.dropna().drop_duplicates(subset=['id'])

In [6]:
#Summary of the credits frame after dropping null rows and duplicate movie ids
credits_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45432 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45432 non-null  object
 1   crew    45432 non-null  object
 2   id      45432 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [7]:
#Extracting actors and crew from credits file

def _parse_credit_list(raw, kind, movie_id):
    """Turn one stringified `cast`/`crew` cell into a list of entries.

    Parameters
    ----------
    raw : object
        Cell value from the credits frame; only non-blank strings are parsed.
    kind : str
        Label used in the log messages ('actor' or 'crew').
    movie_id : object
        Id of the movie the cell belongs to (used in log messages only).

    Returns
    -------
    list or tuple
        The parsed entries, or an empty list when the cell is blank, cannot
        be parsed, or does not evaluate to a list/tuple.
    """
    if not (isinstance(raw, str) and raw.strip()):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed input; report
        # the row instead of dropping it silently.
        print(f"Skipping unparsable {kind} data for movie_id {movie_id}")
        return []
    if not isinstance(parsed, (list, tuple)):
        print(f"Skipping invalid {kind} data for movie_id {movie_id}")
        return []
    return parsed


def _has_identity(person_id, person_name):
    """Return True when an entry has a truthy id and a non-blank string name."""
    # The isinstance check guards against a missing/None 'name', which would
    # otherwise raise AttributeError on .strip().
    return bool(person_id) and isinstance(person_name, str) and bool(person_name.strip())


actors_rows = []
crew_rows = []
directors_rows = [] #Directors are stored separately from other crew members, since they are relevant to movies
people_row = []     #Every (person_id, person_name) seen, de-duplicated below

# Iterate the three columns directly; this avoids building a Series per row as iterrows does.
for movie_id, actors, crew in zip(credits_df['id'], credits_df['cast'], credits_df['crew']):

    #Extracting actors
    for actor in _parse_credit_list(actors, 'actor', movie_id):
        if not isinstance(actor, dict):
            print(f"Skipping invalid actor data for movie_id {movie_id}")
            continue
        actor_id = actor.get('id')
        actor_name = actor.get('name')
        character = actor.get('character')
        if _has_identity(actor_id, actor_name):
            people_row.append({'person_id': actor_id, 'person_name': actor_name})
            actors_rows.append({'movie_id': movie_id, 'person_id': actor_id, 'person_name': actor_name, 'character': character})

    #Extracting crew
    for person in _parse_credit_list(crew, 'crew', movie_id):
        if not isinstance(person, dict):
            print(f"Skipping invalid crew data for movie_id {movie_id}")
            continue
        person_id = person.get('id')
        person_name = person.get('name')
        person_job = person.get('job')
        person_department = person.get('department')
        if _has_identity(person_id, person_name):
            people_row.append({'person_id': person_id, 'person_name': person_name})
            if person_job == 'Director':
                directors_rows.append({'movie_id': movie_id, 'person_id': person_id, 'person_name': person_name})
            else:
                crew_rows.append({'movie_id': movie_id, 'person_id': person_id, 'person_name': person_name, 'job': person_job, 'department': person_department})


#Saving data in dataframes
# Explicit columns keep the schema stable even when a list ends up empty,
# so later column-based operations do not fail with KeyError.
actors_df = pd.DataFrame(actors_rows, columns=['movie_id', 'person_id', 'person_name', 'character'])
crew_df = pd.DataFrame(crew_rows, columns=['movie_id', 'person_id', 'person_name', 'job', 'department'])
directors_df = pd.DataFrame(directors_rows, columns=['movie_id', 'person_id', 'person_name'])
# NOTE(review): duplicates are removed on the (person_id, person_name) pair, so an id
# that appears with two different name spellings keeps both rows - confirm this is intended.
people_df = pd.DataFrame(people_row, columns=['person_id', 'person_name']).drop_duplicates()

In [8]:
#Summary of the de-duplicated people frame (person_id, person_name)
people_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 353358 entries, 0 to 1025879
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   person_id    353358 non-null  int64 
 1   person_name  353358 non-null  object
dtypes: int64(1), object(1)
memory usage: 8.1+ MB


In [9]:
#Summary of the actors frame as extracted, before null/duplicate removal
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562044 entries, 0 to 562043
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   movie_id     562044 non-null  int64 
 1   person_id    562044 non-null  int64 
 2   person_name  562044 non-null  object
 3   character    562044 non-null  object
dtypes: int64(2), object(2)
memory usage: 17.2+ MB


In [10]:
#Summary of the non-director crew frame as extracted, before null/duplicate removal
crew_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414837 entries, 0 to 414836
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   movie_id     414837 non-null  int64 
 1   person_id    414837 non-null  int64 
 2   person_name  414837 non-null  object
 3   job          414837 non-null  object
 4   department   414837 non-null  object
dtypes: int64(2), object(3)
memory usage: 15.8+ MB


In [11]:
#Summary of the directors frame as extracted, before null/duplicate removal
directors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48999 entries, 0 to 48998
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movie_id     48999 non-null  int64 
 1   person_id    48999 non-null  int64 
 2   person_name  48999 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [12]:
#Cleaning and dropping duplicates
# For each table: first discard rows lacking a person id or name, then remove fully identical rows
actors_df = actors_df.dropna(subset=['person_id', 'person_name']).drop_duplicates()
crew_df = crew_df.dropna(subset=['person_id', 'person_name']).drop_duplicates()
directors_df = directors_df.dropna(subset=['person_id', 'person_name']).drop_duplicates()

In [13]:
#Summary of the actors frame after null/duplicate removal
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562040 entries, 0 to 562043
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   movie_id     562040 non-null  int64 
 1   person_id    562040 non-null  int64 
 2   person_name  562040 non-null  object
 3   character    562040 non-null  object
dtypes: int64(2), object(2)
memory usage: 21.4+ MB


In [14]:
#Summary of the crew frame after null/duplicate removal
crew_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414837 entries, 0 to 414836
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   movie_id     414837 non-null  int64 
 1   person_id    414837 non-null  int64 
 2   person_name  414837 non-null  object
 3   job          414837 non-null  object
 4   department   414837 non-null  object
dtypes: int64(2), object(3)
memory usage: 15.8+ MB


In [15]:
#Summary of the directors frame after null/duplicate removal
directors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48999 entries, 0 to 48998
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movie_id     48999 non-null  int64 
 1   person_id    48999 non-null  int64 
 2   person_name  48999 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [16]:
#Saving cleaned data to csv files
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise a fresh checkout fails on the first write.
os.makedirs(CLEAN_DATA_PATH, exist_ok=True)

# index=False: the positional index carries no information worth persisting.
actors_df.to_csv(ACTORS_FILE, index=False)
crew_df.to_csv(CREW_FILE, index=False)
directors_df.to_csv(DIRECTORS_FILE, index=False)
# NOTE(review): people_df is built and PEOPLE_FILE is defined, but nothing in this
# cell writes it - confirm whether it is saved elsewhere or should be added here.