In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import BooleanType, DoubleType, IntegerType
from pyspark.sql.functions import split, expr, trim, lower, explode, col, concat_ws, collect_list, sum 

spark = SparkSession.builder \
    .appName("ReadFromSpark") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/24 06:53:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# List of valid genres
genres_lst = ["Crime", "Romance", "TV Movie", "Thriller", "Adventure", "Drama", "War", "Documentary", "Family",
              "Fantasy", "History", "Mystery", "Animation", "Music", "Science Fiction", "Horror", "Western", "Comedy", "Action"]

def clean_genres_column(genres_lst, df):
    
    # Step 1: Convert genres string to array (split by comma and optional space)
    df_split = df.withColumn("genre_array", split("genres", ",\\s*"))
    
    # Step 2: Create a SQL-style array string for use in expr
    genres_array_str = "array(" + ", ".join([f"'{g}'" for g in genres_lst]) + ")"
    
    # Step 3: Filter rows where all genres are in the valid list
    df_valid = df_split.filter(expr(f"size(array_except(genre_array, {genres_array_str})) = 0"))
    
    # Step 4: Drop helper column if needed
    df_cleaned = df_valid.drop("genre_array")

    return df_cleaned
    

def clean_production_countries_column(valid_countries_df, df):
    # Step 1: Split into array
    df_split = df.withColumn("country_array", split(col("production_countries"), ",\\s*"))
    
    # Step 2: Explode into individual rows
    df_exploded = df_split.withColumn("country_raw", explode(col("country_array")))
    
    # Step 3: Normalize country values
    df_normalized = df_exploded.withColumn("country_norm", lower(trim(col("country_raw"))))
    
    # Step 4: Join with ISO list
    df_valid = df_normalized.join(
        valid_countries_df,
        df_normalized["country_norm"] == valid_countries_df["country_name"],
        how="inner"
    )
    
    # Step 5: Rebuild valid country list per movie
    df_grouped = df_valid.groupBy("id").agg(
        concat_ws(", ", collect_list("country_name")).alias("cleaned_production_countries")
    )
    
    # Step 6: Join with original dataset (to keep other columns)
    df_final = df.join(df_grouped, on="id", how="inner") \
                        .drop("production_countries") \
                        .withColumnRenamed("cleaned_production_countries", "production_countries")
    return df_final


In [3]:
READ_DIR_PATH = "/opt/bitnami/spark/resources/dataset/"
FLIE_NAME = "TMDB_movie_dataset_v11.csv"
ISOFILE_NAME = "iso_countries_cleaned.csv"
VALIDATE_FILE = READ_DIR_PATH+ISOFILE_NAME
read_file = READ_DIR_PATH + FLIE_NAME


df = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv(read_file)

# Step 1 Drop Columns
drop_col_list = ["backdrop_path", "homepage", "imdb_id", "original_title", "poster_path", "tagline"]
df_step1 = df.drop(*drop_col_list)

# Step 2 Drop Null Value
dropna_col_lst = ["title", "release_date", "adult", "budget", "original_language", "overview", "popularity", "genres", "production_companies", "production_countries", "spoken_languages", "keywords"]
df_step2 = df_step1.na.drop(subset=dropna_col_lst)

# Step 3 Drop Duplicate Rows
df_step3 = df_step2.dropDuplicates(["id"])

# Step 4 Change Data Type
df_step4 = df_step3.withColumn("vote_average", df_step3["vote_average"].cast(DoubleType())) \
                   .withColumn("popularity", df_step3["popularity"].cast(DoubleType())) \
                   .withColumn("vote_count", df_step3["vote_count"].cast(IntegerType())) \
                   .withColumn("revenue", df_step3["revenue"].cast(IntegerType())) \
                   .withColumn("runtime", df_step3["runtime"].cast(IntegerType())) \
                   .withColumn("budget", df_step3["budget"].cast(IntegerType())) \
                   .withColumn("adult", df_step3["adult"].cast(BooleanType()))

# Step 5 Change Date Datatype
df_step5 = df_step4.withColumn("release_date",f.to_timestamp(df_step4.release_date, 'yyyy-MM-dd'))

# Step 6 Filter Cutoff Date
cutoff_date = "2025-06-02" # Define cutoff date (end of year 2025)

# Keep only rows where release_date is less than or equal to the cutoff
df_step6 = df_step5.filter(f.col("release_date") <= f.lit(cutoff_date))

# Step 7 Filter Range Numeric Values
df_step7 = df_step6.filter( 
                            (col("vote_average") > 0.0) & \
                            (col("vote_count") > 0) & \
                            (col("revenue") > 0) & \
                            (col("runtime") > 0) & \
                            (col("budget") > 0) & \
                            (col("popularity") > 0.0)
                          )

# Step 8 Filter Status Column is Release
df_step8 = df_step7.filter(f.col("status") == "Released")

# Step 9 Cleaning Genres Column
df_step9 = clean_genres_column(genres_lst, df_step8)

# Step 10 Cleansing Production Contries
df_iso = spark.read.option("header", True).csv(VALIDATE_FILE)
df_cleaned = clean_production_countries_column(df_iso, df_step9)

                                                                                

In [8]:
# Cache the DataFrame
df_cleaned.cache()

display(df_cleaned.limit(5).toPandas())

                                                                                

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,original_language,overview,popularity,genres,production_companies,spoken_languages,keywords,production_countries
0,496,Borat: Cultural Learnings of America for Make ...,6.754,4945,Released,2006-11-01,262552893,84,False,18000000,en,Kazakh journalist Borat Sagdiyev travels to Am...,18.823,Comedy,"20th Century Fox, Everyman Pictures, Dune Ente...","Armenian, English, Hebrew, Polish, Romanian","journalist, california, prostitute, rodeo, kaz...","united kingdom, united states of america"
1,1088,Whale Rider,7.132,349,Released,2003-01-30,41400000,101,False,8000000,en,"On the east coast of New Zealand, the Whangara...",13.582,"Drama, Family","South Pacific Pictures, Pandora Film, ApolloMedia","English, Maori","diving, becoming an adult, grandparent grandch...","germany, new zealand"
2,1580,Rope,7.967,2389,Released,1948-03-11,2200000,81,False,1500000,en,Two men attempt to prove they committed the pe...,21.24,"Thriller, Crime, Drama","Transatlantic Pictures, Warner Bros. Pictures",English,"philosophy, banquet, footlocker, rope, strangl...",united states of america
3,1645,A Time to Kill,7.381,2252,Released,1996-07-24,152266007,149,False,40000000,en,A young lawyer defends a black man accused of ...,23.837,"Crime, Drama, Thriller","Regency Enterprises, Warner Bros. Pictures",English,"rape, court, jurors, ku klux klan, mississippi...",united states of america
4,2142,Cop Land,6.791,1399,Released,1997-08-15,44862187,104,False,15000000,en,Freddy Heflin is the sheriff of a place everyo...,17.632,"Crime, Drama, Mystery","Miramax, Across the River Productions, Woods E...","English, Georgian","new york city, corruption, new jersey, police,...",united states of america


In [9]:
df_cleaned.printSchema()

root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- release_date: timestamp (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- adult: boolean (nullable = true)
 |-- budget: integer (nullable = true)
 |-- original_language: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- production_countries: string (nullable = false)



In [10]:
print((df_cleaned.count(), len(df_cleaned.columns)))



(8441, 18)


                                                                                

In [11]:
# Check Missing
dt_nulllist = df_cleaned.select([ sum(col(colname).isNull().cast("int")).alias(colname) for colname in df_cleaned.columns ])
display(dt_nulllist.toPandas())

                                                                                

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,original_language,overview,popularity,genres,production_companies,spoken_languages,keywords,production_countries
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Check Duplicate Rows
columns_lst = ["id"]
duplicate_rows = df_cleaned.groupBy(columns_lst).count().filter(col("count") > 1)
duplicate_rows.show()

+---+-----+
| id|count|
+---+-----+
+---+-----+



In [None]:
# OUTPUT_PATH = "/opt/shared/output/cleaned_data"
# df.coalesce(1).write.mode("overwrite").csv(OUTPUT_PATH, header = True)

In [13]:
spark.stop()