# Notebook cell In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from fpdf import FPDF
import ast

# Step 1: Read the raw metadata. low_memory=False asks pandas not to infer
# dtypes chunk-by-chunk while parsing the file.
df = pd.read_csv("movies_metadata.csv", low_memory=False)

# Step 2: Coerce the numeric-looking columns to real numbers; any value that
# cannot be parsed becomes NaN (errors='coerce') instead of raising.
cols_to_convert = ['popularity', 'runtime', 'vote_average', 'revenue', 'budget']
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in cols_to_convert}
)

# Step 3: Keep only rows that have a value in every column the report uses.
required_columns = [
    "title",
    "runtime",
    "vote_average",
    "genres",
    "release_date",
    "revenue",
    "budget",
    "popularity",
]
df = df.dropna(subset=required_columns)

# Step 4: Look up the titles of the rows holding the maximum popularity and
# the maximum vote average (idxmax returns the index label of the first max).
most_popular_title = df.at[df["popularity"].idxmax(), "title"]
highest_rated_title = df.at[df["vote_average"].idxmax(), "title"]
# Step 5: Clean 'genres' column (extract first genre name)
def extract_genre(genre_string):
    """Return the name of the first genre encoded in *genre_string*.

    The value is parsed with ``ast.literal_eval`` and is expected to be the
    text of a Python-literal list of dicts, for example
    ``"[{'id': 16, 'name': 'Animation'}]"``.

    Args:
        genre_string: Raw cell value from the ``genres`` column.

    Returns:
        The ``name`` of the first list entry as a string, or ``"Unknown"``
        when the value cannot be parsed, is not a non-empty list, its first
        entry is not a dict, or the name is missing or not a string.
    """
    try:
        genre_list = ast.literal_eval(genre_string)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the errors literal_eval raises for malformed input are
        # treated as "unparseable"; anything else (e.g. KeyboardInterrupt)
        # is allowed to propagate instead of being silently swallowed.
        return "Unknown"

    if not isinstance(genre_list, list) or not genre_list:
        return "Unknown"

    first_entry = genre_list[0]
    if not isinstance(first_entry, dict):
        return "Unknown"

    # Guard against a null / non-text name so callers always get a string
    # they can group on.
    name = first_entry.get('name', 'Unknown')
    return name if isinstance(name, str) else "Unknown"

# Derive a single "Main Genre" label per movie from the raw genres cell.
df["Main Genre"] = df["genres"].map(extract_genre)

# Step 6: Plot - Average rating by genre (bars sorted from highest to lowest)
genre_ratings = (
    df.groupby("Main Genre")["vote_average"]
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
genre_ratings.plot(kind="bar", color="teal", ax=ax)
ax.set_title("Average Rating by Genre")
ax.set_xlabel("Genre")
ax.set_ylabel("Average Rating")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig("rating_by_genre.png")
plt.close(fig)

# Step 7: Plot - Top 10 Most Popular Movies
top_popular = df.sort_values(by="popularity", ascending=False).iloc[:10]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_popular["title"], top_popular["popularity"], color="orange")
ax.set_xlabel("Popularity")
ax.set_title("Top 10 Most Popular Movies")
fig.tight_layout()
fig.savefig("top10_popular.png")
plt.close(fig)

# Step 8: Generate PDF Report
class PDF(FPDF):
    """Report document with a fixed header, a page-numbered footer and
    helpers for section titles and body paragraphs.

    All text is drawn with the built-in "Arial" font. FPDF's built-in fonts
    can only encode Latin-1, so caller-supplied text is sanitised before it
    is written; otherwise a single title containing a character outside
    Latin-1 makes PDF generation fail with an encoding error.
    """

    @staticmethod
    def _latin1_safe(text):
        """Return *text* with every non-Latin-1 character replaced by ``?``."""
        return text.encode("latin-1", "replace").decode("latin-1")

    def header(self):
        """Draw the centred report title (FPDF calls this on each new page)."""
        self.set_font("Arial", "B", 14)
        self.cell(0, 10, "Movie Dataset Analysis Report", ln=True, align="C")

    def footer(self):
        """Draw the centred page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font("Arial", "I", 8)
        self.cell(0, 10, f"Page {self.page_no()}", 0, 0, "C")

    def chapter_title(self, title):
        """Write *title* as a bold 12pt heading on its own line.

        Args:
            title: Heading text; non-Latin-1 characters are replaced by ``?``.
        """
        self.set_font("Arial", "B", 12)
        self.cell(0, 10, self._latin1_safe(title), ln=True)

    def chapter_body(self, body):
        """Write *body* as wrapped 12pt regular text.

        Args:
            body: Paragraph text (may contain newlines); non-Latin-1
                characters are replaced by ``?``.
        """
        self.set_font("Arial", "", 12)
        self.multi_cell(0, 10, self._latin1_safe(body))

# Step 9: Assemble the report on a single starting page
pdf = PDF()
pdf.add_page()

# Summary stats. The empty first and last entries reproduce the leading and
# trailing newline of the text block.
summary_lines = [
    "",
    f"Total Movies Analyzed: {len(df)}",
    f"Average IMDB Rating: {df['vote_average'].mean():.2f}",
    f"Average Runtime: {df['runtime'].mean():.2f} minutes",
    f"Most Popular Movie: {most_popular_title}",
    f"Highest Rated Movie: {highest_rated_title}",
    "",
]
summary = "\n".join(summary_lines)

pdf.chapter_title("Summary Statistics")
pdf.chapter_body(summary)

# Add plots: each chart gets a heading followed by its saved image
charts = (
    ("Chart: Average Rating by Genre", "rating_by_genre.png"),
    ("Chart: Top 10 Most Popular Movies", "top10_popular.png"),
)
for chart_title, image_path in charts:
    pdf.chapter_title(chart_title)
    pdf.image(image_path, w=180)

# Step 10: Write the PDF to disk and confirm on stdout
pdf.output("movie_dataset_report.pdf")
print("✅ Report generated successfully: movie_dataset_report.pdf")


# task2_movie_report/
# ├── movies_metadata.csv
# ├── task2_report_generator.py
# ├── rating_by_genre.png
# ├── top10_popular.png
# └── movie_dataset_report.pdf ✅


# Cell output:
# ✅ Report generated successfully: movie_dataset_report.pdf
