In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [None]:
# Read the movie metadata CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='../../resources/movie_metadata.csv')
df.head(n=5)

In [None]:
# Print the number of missing values in each per-actor likes column
for likes_column in (
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
):
    print(df[likes_column].isna().sum())

In [None]:
def _sum_actor_facebook_likes(data: pd.DataFrame) -> pd.DataFrame:
    '''
    Calculates the total Facebook likes for all actors in each row of a dataset.

    This function:
    - Fills missing values in the 'actor_1_facebook_likes', 'actor_2_facebook_likes', and 
      'actor_3_facebook_likes' columns with 0.
    - Sums the Facebook likes across these three columns to create a new column named 
      'actor_total_facebook_likes'.
    - Removes the original columns for individual actor Facebook likes from the DataFrame.

    Parameters:
    data : pd.DataFrame
        The input DataFrame containing 'actor_1_facebook_likes', 'actor_2_facebook_likes', 
        and 'actor_3_facebook_likes' columns.

    Returns:
    pd.DataFrame
        A modified DataFrame with:
        - A new column 'actor_total_facebook_likes' containing the sum of Facebook likes for all actors.
        - The original columns 'actor_1_facebook_likes', 'actor_2_facebook_likes', and 
          'actor_3_facebook_likes' removed.
    '''
    data['actor_1_facebook_likes'] = data['actor_1_facebook_likes'].fillna(0)
    data['actor_2_facebook_likes'] = data['actor_2_facebook_likes'].fillna(0)
    data['actor_3_facebook_likes'] = data['actor_3_facebook_likes'].fillna(0)

    data['actor_total_facebook_likes'] = (
        data['actor_1_facebook_likes'] + 
        data['actor_2_facebook_likes'] + 
        data['actor_3_facebook_likes'])

    data = data.drop(columns=['actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes'])
    return data

In [None]:
# Replace the three per-actor likes columns with their total.
# The guard keeps this cell safe to re-run: once the source columns have been
# dropped, calling the helper again would raise KeyError.
if 'actor_total_facebook_likes' not in df.columns:
    df = _sum_actor_facebook_likes(df)
df

In [None]:
# Horizontal box plot of the combined actor Facebook likes
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(
    df['actor_total_facebook_likes'],
    vert=False,
    patch_artist=True,
    boxprops={'facecolor': "skyblue"},
)
ax.set_title("Box Plot for Total Facebook Likes", fontsize=16)
ax.set_xlabel("Total Facebook Likes", fontsize=12)
fig.tight_layout()

# Render the figure
plt.show()

In [None]:
# Quartiles and interquartile range of the combined likes
Q1, Q3 = (df['actor_total_facebook_likes'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only the rows whose total falls outside the fences
below_fence = df['actor_total_facebook_likes'].lt(lower_bound)
above_fence = df['actor_total_facebook_likes'].gt(upper_bound)
outliers = df.loc[below_fence | above_fence]
print("Outliers:")
print(outliers)

In [None]:
# Standardise the combined likes and store the scores alongside the data
df['z_score'] = zscore(df['actor_total_facebook_likes'])

# Rows whose score is more than 3 away from zero (in either direction) are outliers
beyond_three_sigma = df['z_score'].abs() > 3
outliers = df.loc[beyond_three_sigma]
print("Outliers based on Z-scores:")
print(outliers)