✅ TASK 1: LOAD & EXPLORE

    1️⃣ Load CSV using pd.read_csv()
    2️⃣ Show first 5 rows
    3️⃣ Show last 5 rows
    4️⃣ Display:

        Shape

        Column names

        Data types
    5️⃣ Summary statistics (describe numeric columns)
    6️⃣ Count missing values per column

✅ TASK 2: DATA CLEANING

    1️⃣ Fill missing numeric values → mean
    2️⃣ Fill missing categorical values → mode
    3️⃣ Remove duplicate rows
    4️⃣ Strip whitespace from Title & Genre
    5️⃣ Convert Genre and Director → category type

✅ TASK 3: FILTERING & SORTING

    1️⃣ Movies with Rating > 8.5
    2️⃣ Movies released after 2010
    3️⃣ Revenue > 500 million
    4️⃣ Sort by Revenue_Millions descending
    5️⃣ Sort by Rating then Revenue

✅ TASK 4: NEW CALCULATED COLUMNS

    1️⃣ Profit_Category:

        High → Revenue > 1000

        Medium → Revenue 500–1000

        Low → Revenue < 500

    2️⃣ Runtime_Category:

        Short → Runtime < 100

        Medium → 100–150

        Long → >150

✅ TASK 5: GROUPING & AGGREGATION

    1️⃣ Average revenue by Genre
    2️⃣ Average rating by Director
    3️⃣ Total revenue by Year
    4️⃣ Count of movies per Genre
    5️⃣ Top 3 highest revenue movies

✅ TASK 6: PIVOT TABLES

    1️⃣ Average Rating × Genre × Director
    2️⃣ Total Revenue × Year × Genre
    3️⃣ Number of movies per Director × Genre

✅ TASK 7: ADVANCED INSIGHTS

    1️⃣ Most popular genre (by revenue)
    2️⃣ Highest rated movie
    3️⃣ Movie with highest revenue
    4️⃣ Director with highest average rating
    5️⃣ Year with most movies released


In [None]:
import pandas as pd
import numpy as np
# Read movies.csv (path is relative to the current working directory) into the
# DataFrame `df`.
df = pd.read_csv("movies.csv")


In [None]:
# ✅ TASK 1: LOAD & EXPLORE

#     1️⃣ Read the dataset from disk into a DataFrame
df = pd.read_csv("movies.csv")

#     2️⃣ / 3️⃣ Peek at both ends of the table (five rows each)
print(df.head(5))
print(df.tail(5))

#     4️⃣ Structural overview, printed in order: shape, column names, data types
for attribute in ("shape", "columns", "dtypes"):
    print(getattr(df, attribute))

#     5️⃣ Summary statistics (describe() covers the numeric columns by default)
print(df.describe())

#     6️⃣ Number of missing values in each column
print(df.isna().sum())


In [None]:
# ✅ TASK 2: DATA CLEANING
#
# The steps are executed in dependency order rather than list order:
#   * text is stripped first, so rows that differ only by stray whitespace
#     become identical and are caught by the duplicate check;
#   * duplicates are removed before any statistic is computed, so repeated
#     rows cannot skew the mean / mode used for filling.

#     4️⃣ Strip whitespace from Title & Genre (missing values stay missing)
for col in ("Title", "Genre"):
    df[col] = df[col].str.strip()

#     3️⃣ Remove duplicate rows
df = df.drop_duplicates()

#     1️⃣ Fill missing numeric values → mean
# fillna() with a Series fills each column using the entry that matches its
# name, so only the numeric columns are touched here.
df = df.fillna(df.mean(numeric_only=True))

#     2️⃣ Fill missing categorical values → mode
for col in df.select_dtypes(include="object").columns:
    modes = df[col].mode()
    # mode() is empty when the column holds no non-missing value; there is
    # nothing sensible to fill with in that case, so leave the column as is.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

#     5️⃣ Convert Genre and Director → category type
for col in ("Genre", "Director"):
    df[col] = df[col].astype("category")


In [None]:
# ✅ TASK 3: FILTERING & SORTING
# Every result is stored in its own variable; `df` itself is left untouched.

#     1️⃣ Movies rated strictly above 8.5
mov_rat_8_5 = df.loc[df["Rating"].gt(8.5)]

#     2️⃣ Movies released after 2010 (2010 itself is excluded)
mov_rel_af_2010 = df.loc[df["Year"].gt(2010)]

#     3️⃣ Movies whose revenue exceeds 500 (column is in millions)
rev = df.loc[df["Revenue_Millions"].gt(500)]

#     4️⃣ Whole table ordered from highest to lowest revenue
sorted_rev_dec = df.sort_values("Revenue_Millions", ascending=False)

#     5️⃣ Rating ascending; ties broken by revenue, highest first
sort_keys = ["Rating", "Revenue_Millions"]
sorted_rat_rev = df.sort_values(sort_keys, ascending=[True, False])
print(sorted_rat_rev)


In [None]:
# ✅ TASK 4: NEW CALCULATED COLUMNS
#
# Both columns are built with pd.cut(..., right=False), which makes every bin
# closed on the left and open on the right: [lo, hi).  That gives the correct
# lower boundaries (exactly 500 / 100 belong to "Medium"), but a plain upper
# edge of 1000 / 150 would push those exact values into the top bin, while the
# spec says the top bin starts strictly above them.  The upper edge is
# therefore moved to the next representable float above the threshold, so the
# threshold value itself stays in "Medium".
# Values below 0 and missing values fall outside every bin and become NaN.

#     1️⃣ Profit_Category:
#         High   → Revenue > 1000
#         Medium → Revenue 500–1000 (both ends included)
#         Low    → Revenue < 500
df["Profit_Category"] = pd.cut(
    df["Revenue_Millions"],
    bins=[0, 500, np.nextafter(1000.0, np.inf), float("inf")],
    labels=["Low", "Medium", "High"],
    right=False,
)

#     2️⃣ Runtime_Category:
#         Short  → Runtime < 100
#         Medium → 100–150 (both ends included)
#         Long   → > 150
df["Runtime_Category"] = pd.cut(
    df["Runtime"],
    bins=[0, 100, np.nextafter(150.0, np.inf), float("inf")],
    labels=["Short", "Medium", "Long"],
    right=False,
)


In [None]:
# ✅ TASK 5: GROUPING & AGGREGATION
# observed=False is passed on every groupby so that categorical keys keep all
# of their categories in the result.

by_genre = df.groupby("Genre", observed=False)

#     1️⃣ Mean revenue for each genre
avg_rev_gen = by_genre["Revenue_Millions"].mean()

#     2️⃣ Mean rating for each director
by_director = df.groupby("Director", observed=False)
avg_rat_dir = by_director["Rating"].mean()

#     3️⃣ Revenue summed per release year
by_year = df.groupby("Year", observed=False)
total_rev = by_year["Revenue_Millions"].sum()

#     4️⃣ Number of movies in each genre (counts non-missing titles)
count_per_gen = by_genre["Title"].count()

#     5️⃣ The three rows with the largest revenue
top_3_highest = df.nlargest(n=3, columns="Revenue_Millions")



In [None]:
# ✅ TASK 6: PIVOT TABLES
#
# Genre and Director are converted to categorical dtype during cleaning.  For
# categorical keys the result of pivot_table depends on `observed`, and relying
# on its default is deprecated in recent pandas releases, so all three pivots
# pass observed=False explicitly and behave the same way.

#     1️⃣ Average Rating × Genre × Director
# Genre/Director pairs with no movie have no average; they are shown as 0.
avg = df.pivot_table(
    values="Rating",
    index="Genre",
    columns="Director",
    aggfunc="mean",
    observed=False,
).fillna(0)

#     2️⃣ Total Revenue × Year × Genre
total = df.pivot_table(
    values="Revenue_Millions",
    index="Genre",
    columns="Year",
    aggfunc="sum",
    observed=False,
)

#     3️⃣ Number of movies per Director × Genre
no_movies = df.pivot_table(
    values="Title",
    index="Director",
    columns="Genre",
    aggfunc="count",
    observed=False,
)
print(no_movies)


In [None]:
# ✅ TASK 7: ADVANCED INSIGHTS

#     1️⃣ Most popular genre (by revenue)
# Sum revenue per genre and keep the single largest total.
most_pop_gen = (
    df.groupby("Genre", observed=False)["Revenue_Millions"].sum().nlargest(1)
)

#     2️⃣ Highest rated movie
# idxmax() returns an index *label*, so the row must be fetched with .loc.
# Using .iloc (positional) is only correct while the index is exactly
# 0..n-1; once duplicate rows have been dropped the index has gaps and a
# positional lookup returns a different row or raises IndexError.
highest_rated = df.loc[df["Rating"].idxmax()]

#     3️⃣ Movie with highest revenue
mov_high_rev = df.loc[df["Revenue_Millions"].idxmax()]

#     4️⃣ Director with highest average rating
drtr_high_avg = (
    df.groupby("Director", observed=False)["Rating"].mean().nlargest(1)
)

#     5️⃣ Year with most movies released
# Counts non-missing Movie_ID values per year and keeps the largest count.
year_most_mov = (
    df.groupby("Year", observed=False)["Movie_ID"].count().nlargest(1)
)

