In [None]:
import os
import pandas as pd
import re
import math
import copy
import numpy as np
import sys
# Add the root directory /workspaces/llm_etl to sys.path
sys.path.append(os.path.abspath(os.path.join('..', '..')))
# Now import your module
from spider2_utils import load_csv_database

-setup-

In [None]:
import pandas as pd

# Load the "Db-IMDB" tables through the project helper.
# NOTE(review): rows_limit=-1 presumably means "no row cap" — confirm against load_csv_database.
_database = load_csv_database("Db-IMDB", rows_limit=-1)

# Bind the three tables this analysis reads to notebook-level names.
Person, M_Cast, M_Director = (
    _database[table_name] for table_name in ("Person", "M_Cast", "M_Director")
)

### Question
I need you to look into the actor collaborations and tell me how many actors have made more films with Yash Chopra than with any other director. This will help us understand his influence on the industry better.

### User Intent 1: Get the PID of 'Yash Chopra'

In [None]:

# One-column (PID) frame holding every Person row whose whitespace-trimmed Name
# is exactly 'Yash Chopra'; the PID values themselves are trimmed as well so
# they can be compared against the trimmed director ids used later.
yash_chopra_pid_df = (
    Person.loc[Person['Name'].str.strip().eq('Yash Chopra'), ['PID']]
    .assign(PID=lambda matched: matched['PID'].str.strip())
)


### User Intent 2: Count number of movies by each actor-director pair

In [None]:
# Trim the MID/PID key columns of both link tables in place, so the join and
# the grouping below compare whitespace-free identifiers.
for _link_table in (M_Cast, M_Director):
    for _key_column in ('MID', 'PID'):
        _link_table[_key_column] = _link_table[_key_column].str.strip()

# Pair every cast row with every director row of the same movie; the shared
# 'PID' column is disambiguated into PID_actor / PID_director.
merged_cast_director = M_Cast.merge(
    M_Director, on='MID', suffixes=('_actor', '_director')
)

# Distinct movies per (actor, director) pair, one row per pair.
num_of_mov_by_actor_director = (
    merged_cast_director
    .rename(columns={'PID_actor': 'ACTOR_PID', 'PID_director': 'DIRECTOR_PID'})
    .groupby(['ACTOR_PID', 'DIRECTOR_PID'])['MID']
    .nunique()
    .reset_index(name='NUM_OF_MOV')
)


### User Intent 3: Filter movies directed by Yash Chopra

In [None]:
# Keep only the (actor, director) counts whose director id matches a
# Yash Chopra PID, and relabel the count column accordingly.
_pairs_with_yc = num_of_mov_by_actor_director.merge(
    yash_chopra_pid_df,
    left_on='DIRECTOR_PID',
    right_on='PID',
)
num_of_movies_by_yc = (
    _pairs_with_yc
    .loc[:, ['ACTOR_PID', 'DIRECTOR_PID', 'NUM_OF_MOV']]
    .rename(columns={'NUM_OF_MOV': 'NUM_OF_MOV_BY_YC'})
)


### User Intent 4: Get max number of movies each actor did with other directors (excluding Yash Chopra)

In [None]:
# (actor, director) counts for every director whose id is NOT a Yash Chopra PID.
not_yash_chopra_movies = num_of_mov_by_actor_director.loc[
    lambda pairs: ~pairs['DIRECTOR_PID'].isin(yash_chopra_pid_df['PID'])
]

# Per actor: the largest movie count shared with any single other director.
max_movies_by_other_directors = (
    not_yash_chopra_movies
    .groupby('ACTOR_PID')['NUM_OF_MOV']
    .max()
    .rename('MAX_NUM_OF_MOV')
    .reset_index()
)


### User Intent 5: Compare movies with Yash Chopra vs other directors

In [None]:
# Left-join so actors who only ever worked with Yash Chopra are kept (their
# MAX_NUM_OF_MOV is missing and is treated as 0 in the comparison), then flag
# with 'Y' the rows where the Yash Chopra count is strictly greater than the
# best count with any other director, and 'N' otherwise.
actors_mov_comparison = (
    num_of_movies_by_yc
    .merge(max_movies_by_other_directors, how='left', on='ACTOR_PID')
    .assign(
        MORE_MOV_BY_YC=lambda cmp: np.where(
            cmp['MAX_NUM_OF_MOV'].fillna(0) < cmp['NUM_OF_MOV_BY_YC'],
            'Y', 'N'
        )
    )
)


### User Intent 6: Count distinct actors who acted in more movies with Yash Chopra than with any other director

In [None]:
# Actor ids flagged 'Y' in the comparison step (more movies with Yash Chopra
# than with any other single director).
actor_pids_more_with_yc = actors_mov_comparison.loc[
    actors_mov_comparison['MORE_MOV_BY_YC'] == 'Y', 'ACTOR_PID'
].unique()

# Count distinct people on the *trimmed* PID. The membership test already uses
# the trimmed value, so counting the raw column would treat two Person rows
# whose PIDs differ only by surrounding whitespace as two different actors.
num_actors = (
    Person['PID'].str.strip()
    .loc[lambda pid: pid.isin(actor_pids_more_with_yc)]
    .nunique()
)
print("Number of actor:", num_actors)