In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

Data Preparation


In [2]:
# Load the raw movie data. `df` is kept unmodified so the original rows stay
# available; all cleaning steps build `cleaned_df` from it.
df = pd.read_csv('data/movies.csv')

# Data Cleaning

# Count missing values in the two columns the analysis relies on.
print(df[['Rating', 'Genre']].isnull().sum())

# Drop rows that have no Rating value.
cleaned_df = df.dropna(subset=["Rating"])

# Count fully duplicated rows among the remaining data.
print(cleaned_df.duplicated().sum())

# Remove duplicates, if there are any. Chain from `cleaned_df` (not `df`) so
# the null-Rating rows dropped above are not reintroduced.
cleaned_df = cleaned_df.drop_duplicates()



Rating    1
Genre     0
dtype: int64
0


Data Analytics


In [13]:
import plotly.express as px

# 1. Split the pipe-separated 'Genre' string and explode it so each row
#    carries exactly one genre in the new lowercase 'genre' column.
df_expanded = cleaned_df.assign(
    genre=cleaned_df['Genre'].str.split('|')
).explode('genre')

# 2. Identify the 10 genres that occur most often.
top10_genre_names = df_expanded['genre'].value_counts().head(10).index

# 3. Keep only rows belonging to those genres.
#    Filter on the exploded 'genre' column, not the original 'Genre'.
df_top10 = df_expanded[df_expanded['genre'].isin(top10_genre_names)]

# 4. Aggregate to the AVERAGE rating per genre.
#    Without this, px.bar would stack (sum) the individual ratings per genre.
df_plot = df_top10.groupby(['genre'], as_index=False)['Rating'].mean()

# 5. Create the interactive bar graph.
#    Note: the 10 genres are chosen by frequency (step 2); the bars show each
#    one's average rating.
fig = px.bar(df_plot,
             x="genre",
             y="Rating",
             title="Top 10 Movie Genres by Average Rating",
             color="genre",  # Optional: colors each bar differently
             # Keys must be column names of df_plot; the y column is "Rating".
             labels={"genre": "Movie Genre", "Rating": "Average Rating"})

fig.show()