Step 1: Setting Up the Environment


In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, size
from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField, ArrayType
import pandas as pd

# Initialize Spark Session
spark = SparkSession.builder.appName("TMDB_Movie_Analysis").getOrCreate()

# Load the dataset
df = spark.read.csv("dataset/tmdb_5000_movies.csv", header=True, inferSchema=True)

# Display the schema
df.printSchema()

root
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)



Step 2: Exploratory Data Analysis (EDA)


In [20]:
# Show the first few rows
df.show(5)

# Summary statistics
df.describe().show()

# Handle missing values
df = df.dropna()

# Save the cleaned data for EDA
df.toPandas().to_csv("data/eda_cleaned.csv", index=False)

+---------+-------------+--------------------+------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------------+--------------+--------------------+--------------+--------------------+---------------+--------------------+----------------+
|   budget|       genres|            homepage|          id|            keywords|original_language|      original_title|            overview|          popularity|production_companies|production_countries|   release_date|             revenue|       runtime|    spoken_languages|        status|             tagline|          title|        vote_average|      vote_count|
+---------+-------------+--------------------+------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------------+--------------+-----------

Step 3: Feature Engineering and Data Transformation


In [21]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Define schemas for nested JSON columns
genres_schema = ArrayType(StructType([StructField("id", IntegerType()), StructField("name", StringType())]))
keywords_schema = ArrayType(StructType([StructField("id", IntegerType()), StructField("name", StringType())]))

# Parse JSON columns
df = df.withColumn("genres_parsed", from_json(col("genres"), genres_schema))
df = df.withColumn("keywords_parsed", from_json(col("keywords"), keywords_schema))

# Extract relevant features by counting the number of elements in the arrays
df = df.withColumn("num_genres", size(col("genres_parsed")))
df = df.withColumn("num_keywords", size(col("keywords_parsed")))

# Convert columns to appropriate data types
df = df.withColumn("budget", col("budget").cast(IntegerType()))
df = df.withColumn("popularity", col("popularity").cast(FloatType()))
df = df.withColumn("runtime", col("runtime").cast(FloatType()))
df = df.withColumn("vote_average", col("vote_average").cast(FloatType()))
df = df.withColumn("vote_count", col("vote_count").cast(IntegerType()))
df = df.withColumn("revenue", col("revenue").cast(IntegerType()))

# Handle any remaining null values in numeric columns by filling them with 0
numeric_columns = ["budget", "popularity", "runtime", "vote_average", "vote_count", "revenue"]
for column in numeric_columns:
    df = df.fillna({column: 0})

# Check for any non-numeric values in numeric columns and remove or correct them
for column in numeric_columns:
    df = df.filter(col(column).cast("float").isNotNull())

# Convert categorical columns to numerical values
indexer = StringIndexer(inputCol="original_language", outputCol="original_language_index")
df = indexer.fit(df).transform(df)

# Assemble features into a single vector column
feature_cols = ["budget", "num_genres", "num_keywords", "original_language_index", "popularity", "runtime", "vote_average", "vote_count"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Create a new column 'success' based on revenue thresholds
df = df.withColumn("success", (col("revenue") > 10000000).cast(IntegerType()))

# Split the data into training and testing sets
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Save the transformed data for feature engineering
df.toPandas().to_csv("data/feature_engineering.csv", index=False)

Step 4: Regression Analysis


In [2]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="revenue")

# Fit the model
lrModel = lr.fit(train_df)

# Make predictions on test data
test_results = lrModel.transform(test_df)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="revenue", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(test_results)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Save the regression results
test_results.toPandas().to_csv("data/lr_regression_results.csv", index=False)

AssertionError: 

Step 5: Classification Analysis


In [1]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Ensure necessary columns are cast to the correct types
df = df.withColumn("funding_total_usd", col("funding_total_usd").cast("double"))
df = df.withColumn("founded_year", col("founded_year").cast("integer"))

# Selecting relevant features for prediction
selected_columns = ['funding_total_usd', 'founded_year', 'is_successful']
df = df.select(selected_columns)

# Handle missing values
df = df.na.drop()

# VectorAssembler to combine feature columns into a single feature vector
assembler = VectorAssembler(inputCols=['funding_total_usd', 'founded_year'], outputCol='features')

# Check if 'features' column already exists and drop it if necessary
if 'features' in df.columns:
    df = df.drop('features')

df = assembler.transform(df)

# Split the data into training and test sets
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

# Logistic Regression model
lr = LogisticRegression(labelCol='is_successful', featuresCol='features')
model = lr.fit(train_df)

# Predictions on the test set
predictions = model.transform(test_df)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol='is_successful')
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")

# Convert predictions to Pandas DataFrame and save for Tableau visualization
predictions_pd = predictions.select("prediction", "probability", "is_successful").toPandas()
predictions_pd.to_csv('/mnt/data/predictions_investments_VC.csv', index=False)


NameError: name 'df' is not defined