In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Show every column when a DataFrame is rendered (None removes the column limit),
# so wide frames are not elided in cell output.
pd.set_option('display.max_columns', None)

In [None]:
# Load the movie metadata CSV (the path is relative, so it resolves against the
# kernel's working directory) and preview the first rows.
df = pd.read_csv('../../resources/movie_metadata.csv')
df.head()

In [None]:
# Peek at the raw 'genres' column as loaded from the CSV.
df['genres']

In [None]:
# Count how often each distinct raw 'genres' value occurs (whole strings, not individual genres).
df['genres'].value_counts()

In [None]:
# Number of rows whose 'genres' value is missing.
df['genres'].isnull().sum() 

In [None]:
def _process_genres(data: pd.DataFrame) -> pd.DataFrame:
    '''
    Processes the 'genres' column in a Pandas DataFrame by splitting genres into individual binary columns
    and grouping less frequent genres into an "other_genre" category.

    This function:
    - Fills missing values in the 'genres' column with "other_genre".
    - Splits the pipe-separated genres into lists.
    - Identifies genres that appear in more than 10% of the rows as "frequent genres".
    - Creates binary columns for each frequent genre, where 1 indicates the presence of the genre in the row.
    - Creates an "other_genre" binary column to indicate the presence of infrequent genres.
    - Drops the original 'genres' column from the DataFrame.

    Parameters:
    data : pd.DataFrame
        The input DataFrame containing a 'genres' column to process. The column should contain 
        pipe-separated genre strings (e.g., "Action|Comedy|Drama").

    Returns:
    pd.DataFrame
        A modified DataFrame with:
        - Binary columns for each frequent genre.
        - An "other_genre" column for infrequent genres.
        - The original 'genres' column removed.
    '''    
    data['genres'] = data['genres'].fillna("other_genre")
    data['genres'] = data['genres'].str.split('|')
    all_genres = [genre for sublist in data['genres'] for genre in sublist]
    genre_counts = pd.Series(all_genres).value_counts()
    threshold = len(data) * 0.1
    frequent_genres = genre_counts[genre_counts > threshold].index
    for genre in frequent_genres:
        data[genre] = data['genres'].apply(lambda x: genre in x).astype(int)

    data['other_genre'] = data['genres'].apply(lambda x: any(genre not in frequent_genres for genre in x)).astype(int)
    data = data.drop(columns=['genres'])
    return data

In [None]:
# Expand 'genres' into binary indicator columns. The guard keeps this cell safe to
# re-run: _process_genres drops 'genres', so calling it again on the already
# processed frame would raise KeyError.
if 'genres' in df.columns:
    df = _process_genres(df)
# Preview the first rows rather than rendering the whole frame.
df.head()


In [None]:
# Genre indicator columns to total up for the chart.
columns_to_plot = [
    "Drama", "Comedy", "Thriller", "Action", "Romance",
    "Adventure", "Crime", "Sci-Fi", "Fantasy", "Horror",
    "Family", "other_genre",
]
# Column-wise sum over the selected indicator columns gives one total per genre.
genre_counts = df[columns_to_plot].sum()

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(12, 6))
genre_counts.plot.bar(ax=ax, alpha=0.8, color='mediumpurple')
ax.set_title("Genre Distribution", fontsize=16)
ax.set_xlabel("Genres", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Display the plot
plt.show()
