# In [0]:
from pyspark.sql import functions
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import IntegerType


# 1. Replace empty entries and entries with no relevant data in each column with Nones.
# Maps each column name to the values in that column that should become None.
# Every column gets its own list holding the same three placeholders.
replacements = {
    column: ["", float("nan"), "User Info Error"]
    for column in (
        "follower_count",
        "poster_name",
        "description",
        "tag_list",
        "save_location",
    )
}

def replace_values(df, replacements):
    """Replace the listed placeholder values with None, column by column.

    Args:
        df: Object exposing ``.na.replace(to_replace, value, subset=...)``
            (presumably a PySpark DataFrame -- confirm against the caller).
        replacements: Mapping of column name to an iterable of values that
            should be replaced with None in that column.

    Returns:
        The result of chaining one ``na.replace`` call per (column, value)
        pair; ``df`` itself when there is nothing to replace.
    """
    # Flatten the mapping into (column, value) pairs, preserving the
    # mapping's iteration order and each list's internal order.
    pairs = (
        (column, placeholder)
        for column, placeholders in replacements.items()
        for placeholder in placeholders
    )
    cleaned = df
    for column, placeholder in pairs:
        # One value at a time, restricted to the single column it belongs to.
        cleaned = cleaned.na.replace(placeholder, None, subset=[column])
    return cleaned

# Run the placeholder-to-None replacement over df_pin using the mapping above.
df_pin = replace_values(df=df_pin, replacements=replacements)
# 2. Ensure every entry for follower_count is a number. Make sure the data type of this column is an int.
def convert_follower_count(df):
    """Convert abbreviated follower counts (e.g. "12k", "3M") to an int column.

    The numeric part and the optional "k"/"M" suffix are parsed separately
    and the number is multiplied by the suffix's scale. A plain textual
    substitution of "k" -> "000" / "M" -> "000000" would turn "1.5k" into
    "1.5000" (which casts to 1 rather than 1500) and would also rewrite
    those letters wherever they occur in the value.

    Args:
        df: DataFrame with a "follower_count" column (presumably a PySpark
            DataFrame -- it must support ``withColumn``).

    Returns:
        The DataFrame with "follower_count" replaced by an int column.
        Values that are not a non-negative number optionally followed by
        "k" or "M" (including nulls) become null. Fractions left after
        scaling are truncated, as a direct cast to int would do.
    """
    # Explicit string cast keeps the function usable on an already-numeric
    # column (e.g. when the cell is re-run).
    raw = functions.trim(col("follower_count").cast("string"))
    pattern = r"^(\d+(?:\.\d+)?)[kM]?$"

    # Only parse values that match the expected shape; everything else is
    # left as null. Decimal keeps the multiplication exact, avoiding binary
    # floating-point error before the final truncating cast.
    magnitude = functions.when(
        raw.rlike(pattern),
        functions.regexp_extract(raw, pattern, 1).cast("decimal(20,4)"),
    )
    scale = (
        functions.when(raw.endswith("k"), 1000)
        .when(raw.endswith("M"), 1000000)
        .otherwise(1)
    )
    return df.withColumn("follower_count", (magnitude * scale).cast("int"))

# Normalise follower_count into an integer column.
df_pin = convert_follower_count(df=df_pin)

# 3. Ensure that each column containing numeric data has a numeric data type
numeric_columns = ["downloaded", "index"]
for numeric_column in numeric_columns:
    as_integer = col(numeric_column).cast(IntegerType())
    df_pin = df_pin.withColumn(numeric_column, as_integer)

# 4. Clean the data in the save_location column to include only the save location path.
df_pin = df_pin.withColumn(
    "save_location",
    regexp_replace(col("save_location"), "Local save in ", ""),
)

# 5. Rename the index column to ind.
df_pin = df_pin.withColumnRenamed("index", "ind")

# 6. Reorder the DataFrame columns.
column_order = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
    "downloaded",
]
df_pin = df_pin.select(*column_order)

# Show the resulting column names and types.
df_pin.printSchema()