In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Display configuration: a max_columns of None lifts pandas' column limit,
# so wide frames are rendered with every column instead of being elided.
pd.set_option('display.max_columns', None)

In [None]:
# Load the movie metadata CSV into `df` (the path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_csv("../../resources/movie_metadata.csv")
df.head()

In [None]:
# Show the ten most frequent names in each actor column, one column at a time.
for role_col in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
    top_names = df[role_col].value_counts().head(10)
    print(top_names)

In [None]:
# Count the missing names in each of the three actor columns before filling.
# (The second line previously re-checked actor_3_name, so actor_2_name was
# never inspected.)
print(df['actor_1_name'].isnull().sum())
print(df['actor_2_name'].isnull().sum())
print(df['actor_3_name'].isnull().sum())

In [None]:
# Replace missing actor names with a placeholder that is specific to the
# column, so unknown actors in different roles are not merged together.
placeholders = {
    'actor_1_name': 'unknown_actor_1_name',
    'actor_2_name': 'unknown_actor_2_name',
    'actor_3_name': 'unknown_actor_3_name',
}
for role_col, placeholder in placeholders.items():
    df[role_col] = df[role_col].fillna(placeholder)

In [None]:
# Re-count missing names in each actor column after the fill; every line
# should now print 0. (The second line previously re-checked actor_3_name,
# so actor_2_name was never verified.)
print(df['actor_1_name'].isnull().sum())
print(df['actor_2_name'].isnull().sum())
print(df['actor_3_name'].isnull().sum())

In [None]:
def _actor_frequency(data: pd.DataFrame) -> pd.DataFrame:
    '''
    Replace the three actor-name columns with one combined appearance count.

    Steps:
    1. Missing names in 'actor_1_name', 'actor_2_name' and 'actor_3_name' are
       filled with a placeholder that is specific to each column.
    2. The three name columns are stacked into a single series and every name
       is counted across all three roles.
    3. Each row's three names are looked up in those counts
       (actor_1_frequency, actor_2_frequency, actor_3_frequency) and the three
       counts are added to give 'total_actor_frequency'.
    4. A frame without the name columns and without the per-role count
       columns is returned.

    Side effects:
    The frame passed in is modified: the filled names, the three per-role
    frequency columns and 'total_actor_frequency' are assigned onto `data`
    itself. Only the returned frame has the name and per-role columns dropped.

    Parameters:
    data : pd.DataFrame
        Frame containing 'actor_1_name', 'actor_2_name' and 'actor_3_name'.

    Returns:
    pd.DataFrame
        A new frame holding every other column of `data` plus
        'total_actor_frequency'; the actor name columns and the intermediate
        per-role frequency columns are not present.
    '''
    # (name column, placeholder for missing names, per-role count column)
    role_specs = (
        ('actor_1_name', 'unknown_actor_1_name', 'actor_1_frequency'),
        ('actor_2_name', 'unknown_actor_2_name', 'actor_2_frequency'),
        ('actor_3_name', 'unknown_actor_3_name', 'actor_3_frequency'),
    )

    # Fill gaps first so that every row has a name to look up later.
    for name_col, placeholder, _ in role_specs:
        data[name_col] = data[name_col].fillna(placeholder)

    # Count each name over all three roles at once.
    pooled_names = pd.concat([data[name_col] for name_col, _, _ in role_specs])
    appearances = pooled_names.value_counts()

    for name_col, _, count_col in role_specs:
        data[count_col] = data[name_col].map(appearances)

    data['total_actor_frequency'] = (
        data['actor_1_frequency']
        + data['actor_2_frequency']
        + data['actor_3_frequency']
    )

    # Drop names and per-role counts from the returned copy only.
    discard = [name_col for name_col, _, _ in role_specs]
    discard += [count_col for _, _, count_col in role_specs]
    return data.drop(columns=discard)

In [None]:
# preprocess actor experiences
# NOTE: _actor_frequency assigns columns on the frame it receives, so `df`
# itself gains the filled actor names plus the actor_*_frequency and
# total_actor_frequency columns; the plotting cell below reads the
# actor_*_frequency columns from `df`. `test_df` is the returned frame with
# the name and per-role frequency columns dropped.
test_df = _actor_frequency(df) #test only 
test_df

In [None]:
# Plot actor frequencies
# The actor_*_frequency columns read below are written onto `df` by the
# earlier `_actor_frequency(df)` call, so that cell has to run first.
fig, ax = plt.subplots(figsize=(10, 6))

# Plot each frequency column as a line
ax.plot(df.index, df['actor_1_frequency'], marker='o', label='Actor 1 Frequency')
ax.plot(df.index, df['actor_2_frequency'], marker='s', label='Actor 2 Frequency')
ax.plot(df.index, df['actor_3_frequency'], marker='^', label='Actor 3 Frequency')

# Add labels, title, and legend
ax.set_title("Actor Frequencies by Role", fontsize=16)
ax.set_xlabel("Row Index", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
# Deliberately no explicit x ticks: forcing one tick per row
# (plt.xticks(df.index)) stacks overlapping labels into an unreadable band and
# slows layout once the frame has more than a few dozen rows. The default
# locator picks a small, readable set of ticks instead.
ax.legend(title="Actor Role", fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Display the plot
plt.show()