Step 1: Setting Up the Environment


In [105]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, size
from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField, ArrayType
import pandas as pd

# Initialize Spark Session
spark = SparkSession.builder.appName("TMDB_Movie_Analysis").getOrCreate()

# Load the dataset
df = spark.read.csv("dataset/tmdb_5000_movies.csv", header=True, inferSchema=True)

# Display the schema
df.printSchema()

root
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)



Step 2: Exploratory Data Analysis (EDA)


In [106]:
# Show the first few rows
df.show(5)

# Summary statistics
df.describe().show()

# Handle missing values
df = df.dropna()

# Save the cleaned data for EDA
df.toPandas().to_csv("data/eda_cleaned.csv", index=False)

+---------+-------------+--------------------+------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------------+--------------+--------------------+--------------+--------------------+---------------+--------------------+----------------+
|   budget|       genres|            homepage|          id|            keywords|original_language|      original_title|            overview|          popularity|production_companies|production_countries|   release_date|             revenue|       runtime|    spoken_languages|        status|             tagline|          title|        vote_average|      vote_count|
+---------+-------------+--------------------+------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------------+--------------+-----------

Step 3: Feature Engineering and Data Transformation


In [107]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Define schemas for nested JSON columns
genres_schema = ArrayType(StructType([StructField("id", IntegerType()), StructField("name", StringType())]))
keywords_schema = ArrayType(StructType([StructField("id", IntegerType()), StructField("name", StringType())]))

# Parse JSON columns
df = df.withColumn("genres_parsed", from_json(col("genres"), genres_schema))
df = df.withColumn("keywords_parsed", from_json(col("keywords"), keywords_schema))

# Extract relevant features by counting the number of elements in the arrays
df = df.withColumn("num_genres", size(col("genres_parsed")))
df = df.withColumn("num_keywords", size(col("keywords_parsed")))

# Convert columns to appropriate data types
df = df.withColumn("budget", col("budget").cast(IntegerType()))
df = df.withColumn("popularity", col("popularity").cast(FloatType()))
df = df.withColumn("runtime", col("runtime").cast(FloatType()))
df = df.withColumn("vote_average", col("vote_average").cast(FloatType()))
df = df.withColumn("vote_count", col("vote_count").cast(IntegerType()))
df = df.withColumn("revenue", col("revenue").cast(IntegerType()))

# Handle any remaining null values in numeric columns by filling them with 0
numeric_columns = ["budget", "popularity", "runtime", "vote_average", "vote_count", "revenue"]
for column in numeric_columns:
    df = df.fillna({column: 0})

# Check for any non-numeric values in numeric columns and remove or correct them
for column in numeric_columns:
    df = df.filter(col(column).cast("float").isNotNull())

# Convert categorical columns to numerical values
indexer = StringIndexer(inputCol="original_language", outputCol="original_language_index")
df = indexer.fit(df).transform(df)

# Assemble features into a single vector column
feature_cols = ["budget", "num_genres", "num_keywords", "original_language_index", "popularity", "runtime", "vote_average", "vote_count"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Create a new column 'success' based on revenue thresholds
df = df.withColumn("success", (col("revenue") > 10000000).cast(IntegerType()))

# Split the data into training and testing sets
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Save the transformed data for feature engineering
df.toPandas().to_csv("data/feature_engineering.csv", index=False)

Step 4: Regression Analysis


In [110]:
from pyspark.ml.regression import LinearRegression

# Initialize the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="revenue")

# Fit the model
lr_model = lr.fit(train_df)

# Predict on the test set
test_results = lr_model.transform(test_df)

# Evaluate the model
test_results.select("revenue", "prediction").show()
rmse = lr_model.summary.rootMeanSquaredError
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Save the regression results
test_results.toPandas().to_csv("data/regression_results.csv", index=False)

+-------+-------------------+
|revenue|         prediction|
+-------+-------------------+
|      0|-1682.8669106839493|
|      0| 2555.0341053063316|
|      0|-102.17071179368406|
|      0| 1936.4742828885505|
|      0| 224.86988108016374|
|      0|-1825.9471700662584|
|      0|  1741.063025291056|
|      0|  294.8921739999329|
|      0|  2282.500277911458|
|      0|  2343.820389075305|
|      0| 2459.6472657181257|
|      0| 1444.4587586722223|
|      0|  2104.906899329283|
|      0| 2595.9141794155626|
|      0| -1505.719922877282|
|      0|  2286.959232283892|
|      0| 2480.0863920871834|
|      0|-1195.7743983685682|
|      0|  2591.702123144962|
|      0|-2015.8551352068844|
+-------+-------------------+
only showing top 20 rows

Root Mean Squared Error (RMSE): 19535.48535990419


Step 5: Classification Analysis


In [111]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize the RandomForestClassifier with increased maxBins
rf = RandomForestClassifier(featuresCol="features", labelCol="success", maxBins=800)

# Fit the model
rf_model = rf.fit(train_df)

# Predict on the test set
test_results = rf_model.transform(test_df)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="success", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(test_results)
print(f"Accuracy: {accuracy}")

# Save the classification results
test_results.toPandas().to_csv("data/classification_results.csv", index=False)

# Show the classification results
test_results.select("success", "prediction", "probability").show()

Accuracy: 1.0
+-------+----------+-----------+
|success|prediction|probability|
+-------+----------+-----------+
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
|      0|       0.0|      [1.0]|
+-------+----------+-----------+
only showing top 20 rows

