In [0]:
from pyspark.sql import functions
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import IntegerType
import pandas as pd
import numpy as np


# Clean the df_pin DataFrame (defined before this cell): normalise missing
# values, convert numeric columns to integers, tidy save_location, rename
# index -> ind, and put the columns in their final order.

# 1. Replace empty entries and entries with no relevant data with nulls.
#    The string sentinels are handled in one call; NaN is replaced
#    separately because na.replace rejects mixed string/numeric lists.
df_pin = df_pin.na.replace(['', 'User Info Error'], None)
df_pin = df_pin.na.replace([float('nan')], None)

# 2. Ensure every entry for follower_count is a number stored as an int.
#    A plain textual replace of "k"/"M" with zeros rewrites those letters
#    anywhere in the value and turns "1.5k" into "1.5000"; instead, strip a
#    single trailing suffix and multiply numerically.
follower_text = functions.trim(functions.col("follower_count").cast("string"))
follower_scale = (
    functions.when(follower_text.endswith("k"), 1000)
    .when(follower_text.endswith("M"), 1000000)
    .otherwise(1)
)
follower_number = regexp_replace(follower_text, "[kM]$", "").cast("double")
# round() guards against floating-point results such as 2299.9999999 being
# truncated by the int cast. Values that are not numeric become null.
df_pin = df_pin.withColumn(
    "follower_count",
    functions.round(follower_number * follower_scale).cast("int"),
)

# 3. Ensure that each column containing numeric data has a numeric data type.
#    functions.col is used so the cell relies only on names it imports.
for numeric_column in ("downloaded", "index"):
    df_pin = df_pin.withColumn(
        numeric_column, functions.col(numeric_column).cast("int")
    )

# 4. Clean the data in the save_location column to include only the save location path.
df_pin = df_pin.withColumn(
    "save_location", regexp_replace("save_location", "Local save in ", "")
)

# 5. Rename the index column to ind.
df_pin = df_pin.withColumnRenamed("index", "ind")

# 6. Reorder the DataFrame columns.
df_pin = df_pin.select(
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
    "downloaded",
)

# Show the cleaned data and its schema (display/displayHTML come from the
# notebook environment).
displayHTML("<h2>df_pin:</h2>")
display(df_pin)
df_pin.printSchema()