In [1]:
# Colab-only step: open the file-upload prompt and keep its result in `uploaded`.
# The recorded cell output shows movies.csv being saved by this call.
from google.colab import files

uploaded = files.upload()

Saving movies.csv to movies.csv


In [None]:
import pandas as pd
import numpy as np
# Load the uploaded movies file into the pandas frame used by the pandas cells.
df = pd.read_csv(filepath_or_buffer='movies.csv')

In [None]:
# Convert ReleaseDate to pandas datetimes in place, then show the resulting
# dtype and a short preview of titles with their parsed dates.
df["ReleaseDate"] = pd.to_datetime(df["ReleaseDate"])
print(df["ReleaseDate"].dtype)
print(df.loc[:, ["Title", "ReleaseDate"]].head())

In [None]:
# Profit is revenue minus budget, computed row by row.
df["Profit"] = df["Revenue"].sub(df["Budget"])
print(df.loc[:, ["Title", "Revenue", "Budget", "Profit"]])

In [None]:
# Calendar year of each release date (requires ReleaseDate to be datetime-typed).
release_year = df.loc[:, "ReleaseDate"].dt.year
def get_era(year):
    """Map a release year to an era label.

    Args:
        year: Release year as a number, or a missing value (None / NaN).

    Returns:
        "Classic" for years before 2000, "Mid" for 2000 through 2010
        inclusive, "Modern" for later years, and None when the year is
        missing.
    """
    # A missing year (NaN from an unparsed/absent date) fails both comparisons
    # below and would otherwise be mislabelled "Modern"; None would raise.
    if pd.isna(year):
        return None
    if year < 2000:
        return "Classic"
    if year <= 2010:
        return "Mid"
    return "Modern"
# Label every movie with its era by running get_era over the release years.
df["Era"] = release_year.map(get_era)
print(df.loc[:, ["Title", "ReleaseDate", "Era"]])

In [None]:
# Replace missing revenues with the column mean, then refresh Profit so it
# reflects the filled values.
avg_revenue = df["Revenue"].mean()
df["Revenue"] = df["Revenue"].fillna(value=avg_revenue)
df["Profit"] = df["Revenue"].sub(df["Budget"])

# Spot-check a single title after the fill.
print(
    df.loc[
        df["Title"] == "The Notebook",
        ["Title", "Revenue", "Budget", "Profit"],
    ]
)

In [None]:
# Movies ordered from highest to lowest rating.
df_sorted = df.sort_values("Rating", ascending=False)
print(df_sorted.loc[:, ["Title", "Rating"]])

In [None]:
# Per-genre summary: mean rating and summed revenue, with the result columns
# renamed to "AverageRating" and "Total Revenue".
genre_stats = (
    df.groupby("Genre")
    .agg({"Rating": "mean", "Revenue": "sum"})
    .rename(columns={"Rating": "AverageRating", "Revenue": "Total Revenue"})
)
print(genre_stats)

In [None]:
# Movies rated at least 8.5 whose profit exceeds 50,000,000.
# NOTE(review): the threshold was originally written `500_00_000`, which Python
# reads as 50,000,000 (underscores are ignored). The Spark filter later in this
# notebook uses 500_000_000 — confirm which threshold is intended.
top_movies = df.loc[df["Rating"].ge(8.5) & df["Profit"].gt(50_000_000)]
print(top_movies.loc[:, ["Title", "Rating", "Profit"]])


In [None]:
import matplotlib.pyplot as plt
# Bar chart of profit per movie title, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(6, 6))
ax.bar(df["Title"], df["Profit"], color="skyblue")
ax.set_title("Movie Profit vs Title")
ax.set_xlabel("Movie Title")
ax.set_ylabel("Profit")
# Tilt the title labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()


In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session and load the same CSV with a header row and
# inferred column types, then preview the first five rows.
spark = (
    SparkSession.builder
    .appName("MovieAnalysis")
    .getOrCreate()
)
df_spark = spark.read.csv("movies.csv", header=True, inferSchema=True)
df_spark.show(5)

In [None]:
from pyspark.sql.functions import col
# Add a Profit column (Revenue minus Budget) to the Spark frame.
df_spark = df_spark.withColumn(
    "Profit",
    df_spark["Revenue"] - df_spark["Budget"],
)
df_spark.select(["Title", "Revenue", "Budget", "Profit"]).show()

In [None]:
# Sci-Fi movies whose profit is above 500,000,000.
sci_fi_hits = (
    df_spark
    .where(col("Genre") == "Sci-Fi")
    .where(col("Profit") > 500_000_000)
)
sci_fi_hits.select(["Title", "Genre", "Profit"]).show()

In [None]:
# Per-genre average rating and total profit; the generated aggregate column
# names are renamed to AverageRating and TotalProfit.
genre_grouped = (
    df_spark.groupBy("Genre")
    .agg({"Rating": "avg", "Profit": "sum"})
    .withColumnRenamed("avg(Rating)", "AverageRating")
    .withColumnRenamed("sum(Profit)", "TotalProfit")
)
genre_grouped.show()

In [None]:
from pyspark.sql.functions import when
# Bucket each movie by rating:
#   Rating >= 9.0        -> "Blockbuster"
#   8.0 <= Rating < 9.0  -> "Hit"
#   anything else        -> "Average" (including rows the conditions do not match)
# The first branch uses >= rather than == so ratings above 9.0 (e.g. 9.3) are
# not dropped into "Average". Branches are evaluated in order, so the second
# one only needs the lower bound.
df_spark = df_spark.withColumn(
    "RatingCategory",
    when(col("Rating") >= 9.0, "Blockbuster")
    .when(col("Rating") >= 8.0, "Hit")
    .otherwise("Average")
)
df_spark.select("Title", "Rating", "RatingCategory").show()

In [None]:
# Attempt to remove a column named "Budjet", then list the remaining columns.
# NOTE(review): every other cell in this notebook spells the column "Budget".
# Per the PySpark docs, DataFrame.drop() is a no-op for a name that is not in
# the schema, so this presumably leaves df_spark unchanged — confirm the intent.
# Later cells still read col("Budget"), so simply correcting the spelling here
# would break them.
df_spark=df_spark.drop("Budjet")
print(df_spark.columns)

In [None]:
# Fill missing Revenue values with the column average, then recompute Profit
# from the filled Revenue.
avg_revenue = df_spark.agg({"Revenue": "avg"}).first()[0]
df_spark = (
    df_spark
    .fillna({"Revenue": avg_revenue})
    .withColumn("Profit", col("Revenue") - col("Budget"))
)
df_spark.select(["Title", "Revenue", "Profit"]).show()

In [None]:
from pyspark.sql.functions import to_date, year, month
# Parse ReleaseDate with the yyyy-MM-dd pattern, then derive the release year
# and month from the parsed date.
df_spark = (
    df_spark
    .withColumn("ReleaseDate", to_date(col("ReleaseDate"), "yyyy-MM-dd"))
    .withColumn("ReleaseYear", year(col("ReleaseDate")))
    .withColumn("ReleaseMonth", month(col("ReleaseDate")))
)
df_spark.select(["Title", "ReleaseDate", "ReleaseYear", "ReleaseMonth"]).show()

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
# Rank movies inside each genre by profit, highest profit first.
genre_window = (
    Window
    .partitionBy("Genre")
    .orderBy(col("Profit").desc())
)
df_spark = df_spark.withColumn(
    "GenreProfitRank",
    rank().over(genre_window),
)
df_spark.select(["Title", "Genre", "Profit", "GenreProfitRank"]).show()

In [26]:
import json
# Two extra movie records, written to movies_bonus.json as an indented JSON array.
bonus_movies = [
    dict(MovieID=8, Title="Barbie", Genre="Drama", Rating=7.5,
         Revenue=1456000000, Budget=145000000),
    dict(MovieID=9, Title="Oppenheimer", Genre="Drama", Rating=8.6,
         Revenue=950000000, Budget=100000000),
]
with open("movies_bonus.json", "w") as f:
    f.write(json.dumps(bonus_movies, indent=4))

In [None]:
# Read the bonus records back with Spark; the file is a multi-line JSON array,
# so the multiline option is required.
bonus_reader = spark.read.option("multiline", True)
df_bonus = bonus_reader.json("movies_bonus.json")
df_bonus.show()

In [None]:
# Append the bonus movies to the main table.
# union() matches columns by position, so both frames are first projected onto
# the same ordered column list.
columns_to_keep = ["MovieID", "Title", "Genre", "Rating", "Revenue", "Budget"]
df_main_clean = df_spark.select(*columns_to_keep)
# Select from df_bonus (the JSON records), not df_spark: selecting from df_spark
# here would union the main table with itself and never add the bonus movies.
df_bonus_clean = df_bonus.select(*columns_to_keep)
df_merged = df_main_clean.union(df_bonus_clean)
df_merged.show()

In [None]:
# Five highest-revenue movies in the merged table.
(
    df_merged
    .sort(col("Revenue").desc())
    .select(["Title", "Revenue"])
    .show(5)
)

In [30]:
# Write the merged table as CSV with a header row under final_movies_output,
# replacing any previous output at that path.
df_merged.write.csv("final_movies_output", mode="overwrite", header=True)


In [None]:
from pyspark.sql.functions import col

# Return on investment: profit divided by budget.
df_spark = df_spark.withColumn(
    "ReturnOnInvestment",
    df_spark["Profit"] / df_spark["Budget"],
)
df_spark.select(["Title", "Profit", "Budget", "ReturnOnInvestment"]).show()

In [None]:
# Movies with a budget of at least 50,000,000, ordered by ROI from highest to
# lowest; show the top five.
top_roi_spark = (
    df_spark
    .where(col("Budget") >= 50000000)
    .sort(col("ReturnOnInvestment").desc())
)
top_roi_spark.select(["Title", "Genre", "Budget", "Profit", "ReturnOnInvestment"]).show(5)


In [33]:
# Bring the columns needed for the ROI chart back into pandas.
roi_columns = ["Title", "Genre", "Budget", "Profit", "ReturnOnInvestment"]
df_pandas = df_spark.select(roi_columns).toPandas()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Average ROI per genre for movies with a budget of at least 50,000,000.
# The ROI values live in df_pandas (exported from Spark) under the column
# "ReturnOnInvestment"; the original pandas frame `df` never gets an "ROI"
# column, so reading df["ROI"] raises a KeyError.
df_filtered = df_pandas[df_pandas["Budget"] >= 50000000]
genre_roi = (
    df_filtered.groupby("Genre")["ReturnOnInvestment"]
    .mean()
    .sort_values(ascending=False)
)
plt.figure(figsize=(8, 5))
genre_roi.plot(kind="bar", color="mediumseagreen")
plt.title("Average ROI by Genre ")
plt.xlabel("Genre")
plt.ylabel("ROI")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
