In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [None]:
# Load the movie metadata CSV (path is relative to this notebook) and preview
# the first rows.
MOVIE_METADATA_CSV = "../../resources/movie_metadata.csv"
df = pd.read_csv(MOVIE_METADATA_CSV)
df.head()

In [None]:
# Frequency of each raw content_rating value (NaN is excluded by value_counts).
df.loc[:, 'content_rating'].value_counts()

In [None]:
# Number of rows with a missing content_rating.
# Missing values are expected here; they are handled later by the bucketing
# step (preproc.bucket_contentRatings).
df['content_rating'].isna().sum()

In [None]:
# Plot the distribution of the raw content_rating values.
# value_counts() skips NaN by default, so missing ratings do not appear as a bar.
rating_distribution = df['content_rating'].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
rating_distribution.plot(kind='bar', alpha=0.7, color='skyblue', ax=ax)
# This chart shows the un-bucketed ratings, so it is labelled "Content Rating"
# rather than "Rating Bin" (which describes the bucketed column plotted later).
ax.set_title("Distribution of Content Rating", fontsize=16)
ax.set_xlabel("Content Rating", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
# Rotate the tick labels: there are many distinct ratings and they would overlap.
plt.setp(ax.get_xticklabels(), rotation=45, fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Display the plot
plt.show()

In [None]:
def _bucket_contentRatings(data: pd.DataFrame, min_percentage: float = 10.0) -> pd.DataFrame:
    '''
    Replace the 'content_rating' column with a coarser 'rating_bin' column.

    Processing steps:
    - Missing content ratings are filled with "other".
    - The percentage frequency of every rating (including the filled "other")
      is computed over all rows.
    - Ratings whose frequency is below `min_percentage` are relabelled "other";
      ratings at or above the threshold keep their original label.

    The input DataFrame is NOT modified; a new DataFrame is returned. This makes
    the function safe to call repeatedly on the same frame.

    Parameters:
    data : pd.DataFrame
        The input DataFrame containing a 'content_rating' column to process.
    min_percentage : float, default 10.0
        Minimum share (in percent, 0-100) a rating needs in order to keep its
        own bucket.

    Returns:
    pd.DataFrame
        A new DataFrame with all columns of `data` except 'content_rating',
        followed by a 'rating_bin' column holding the bucketed ratings.

    Raises:
    KeyError
        If `data` has no 'content_rating' column.
    '''
    ratings = data['content_rating'].fillna("other")

    # Per-row share of the row's own rating, in percent. normalize=True divides
    # each count by the total number of (already NaN-filled) rows.
    share_pct = ratings.map(ratings.value_counts(normalize=True)) * 100

    # Keep the label where it is frequent enough, otherwise collapse to "other".
    rating_bin = ratings.where(share_pct >= min_percentage, "other").rename("rating_bin")

    # drop() without inplace returns a copy, so the caller's frame is untouched.
    return pd.concat([data.drop(columns=['content_rating']), rating_bin], axis=1)

In [None]:
# Run the bucketing step on the loaded frame; the result carries a rating_bin
# column in place of content_rating.
content_rating_replaced_df = df.pipe(_bucket_contentRatings)

In [None]:
# Sanity check: count of missing values left in rating_bin after bucketing.
content_rating_replaced_df['rating_bin'].isna().sum()

In [None]:
# Frequency of each bucket after rare ratings were merged into "other".
content_rating_replaced_df.loc[:, 'rating_bin'].value_counts()

In [None]:
# Preview the first five rows of the bucketed frame.
content_rating_replaced_df.head(n=5)

In [None]:
# Bar chart of how many rows fall into each rating_bin bucket.
rating_distribution = content_rating_replaced_df['rating_bin'].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
rating_distribution.plot(kind='bar', alpha=0.7, color='skyblue', ax=ax)
ax.set_title("Distribution of Rating Bin", fontsize=16)
ax.set_xlabel("Rating Bin", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
# Keep the tick labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=0, fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# One-hot encode the bucketed rating_bin column.
# sparse_output=False makes the encoder return a dense array, which can be
# wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
rating_bin_matrix = encoder.fit_transform(content_rating_replaced_df[["rating_bin"]])
content_rating_encoded = pd.DataFrame(
    rating_bin_matrix,
    columns=encoder.get_feature_names_out(["rating_bin"]),
    # Carry over the source frame's index so a later column-wise concat lines
    # the encoded rows up with their originals even if the index is not 0..n-1.
    index=content_rating_replaced_df.index,
)
content_rating_encoded

In [None]:
# Build the final frame: every original column except content_rating, plus the
# one-hot encoded rating-bin columns.
# The base is the bucketed frame (where content_rating has already been replaced
# by rating_bin) with rating_bin removed, rather than `df` itself. The result
# therefore does not depend on `df` having been modified by an earlier cell,
# and re-running this cell does not append the encoded columns a second time.
df = pd.concat(
    [content_rating_replaced_df.drop(columns=["rating_bin"]), content_rating_encoded],
    axis=1,
)
df.head()