# Clean Dataframes

## Clean df_128a59195de3_pin dataframe

In [None]:
# Replace empty/irrelevant entries with `None`

def replace_bad_entries(dataframe, column, entry_to_replace):
    """
    Null out placeholder entries in a single column of a dataframe.

    Parameters:
    dataframe - dataframe to clean
    column - name of the column whose values are checked
    entry_to_replace - pattern passed to `Column.like`; entries that match
        it are replaced with `None`, all other entries are kept unchanged

    Returns:
    the dataframe with `column` rewritten
    """
    current_value = col(column)
    is_bad_entry = current_value.like(entry_to_replace)
    cleaned_value = when(is_bad_entry, None).otherwise(current_value)
    return dataframe.withColumn(column, cleaned_value)

# Column name -> placeholder entry that should be replaced in that column.
# Each value is handed to `replace_bad_entries` as its `entry_to_replace`
# pattern, so the trailing `%` on the description entry acts as a wildcard.
column_entrytoreplace = dict(
    description="No description available%",
    follower_count="User Info Error",
    image_src="Image src error.",
    poster_name="User Info Error",
    tag_list="N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    title="No Title Data Available",
)

# Apply the placeholder clean-up to every column listed in the dictionary,
# feeding each pass the dataframe produced by the previous one
for column_name, bad_entry in column_entrytoreplace.items():
    df_128a59195de3_pin = replace_bad_entries(
        dataframe=df_128a59195de3_pin,
        column=column_name,
        entry_to_replace=bad_entry,
    )

# Convert `follower_count` from abbreviated text (e.g. "5k", "2M") to a
# numerical entry, data type `int`.
#
# The numeric part is scaled by the multiplier its suffix stands for instead
# of swapping the suffix for zeros: a textual swap turns a fractional value
# such as "1.2k" into the string "1.2000", which does not represent 1200.
follower_count_raw = col("follower_count")

# 1000 for a trailing "k", 1000000 for a trailing "M", otherwise 1.
# A null entry matches neither pattern and stays null after the cast below.
follower_count_multiplier = (
    when(follower_count_raw.rlike(r"k\s*$"), 1000)
    .when(follower_count_raw.rlike(r"M\s*$"), 1000000)
    .otherwise(1)
)

# Strip the suffix and parse what is left as a decimal (rather than a double)
# so the scaling is exact before the final integer cast
follower_count_number = regexp_replace(
    follower_count_raw, r"[kM]\s*$", ""
).cast("decimal(20,3)")

df_128a59195de3_pin = df_128a59195de3_pin.withColumn(
    "follower_count",
    (follower_count_number * follower_count_multiplier).cast("integer"),
)

# Remaining pin clean-up, applied as one chain:
#   1. cast the numeric columns `downloaded` and `index` to integer
#   2. strip the "Local save in " prefix so `save_location` holds only the path
#   3. rename `index` to `ind`
#   4. put the columns in their final order
#   5. drop duplicates
df_128a59195de3_pin = (
    df_128a59195de3_pin
    .withColumn("downloaded", col("downloaded").cast("integer"))
    .withColumn("index", col("index").cast("integer"))
    .withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))
    .withColumnRenamed("index", "ind")
    .select(
        "ind",
        "unique_id",
        "title",
        "description",
        "follower_count",
        "poster_name",
        "tag_list",
        "is_image_or_video",
        "image_src",
        "save_location",
        "category",
    )
    .dropDuplicates()
)

# Display changes
df_128a59195de3_pin.printSchema()

## Clean df_128a59195de3_geo dataframe

# Geo clean-up, applied as one chain:
#   1. build `coordinates` as an array of the latitude and longitude columns
#   2. drop the now-redundant latitude and longitude columns
#   3. convert `timestamp` from a string to a timestamp data type
#   4. reorder the columns
#   5. drop duplicates
df_128a59195de3_geo = (
    df_128a59195de3_geo
    .withColumn("coordinates", array("latitude", "longitude"))
    .drop("latitude", "longitude")
    .withColumn("timestamp", to_timestamp("timestamp"))
    .select("ind", "country", "coordinates", "timestamp")
    .dropDuplicates()
)

# Display changes
df_128a59195de3_geo.printSchema()

## Clean df_128a59195de3_user dataframe

In [None]:
# User clean-up, applied as one chain:
#   1. build `user_name` by joining `first_name` and `last_name` with a space
#   2. drop the first_name and last_name columns
#   3. convert `date_joined` from a string to a timestamp data type
#   4. reorder the columns
#   5. drop duplicates
df_128a59195de3_user = (
    df_128a59195de3_user
    .withColumn("user_name", concat_ws(" ", col("first_name"), col("last_name")))
    .drop("first_name", "last_name")
    .withColumn("date_joined", to_timestamp("date_joined"))
    .select("ind", "user_name", "age", "date_joined")
    .dropDuplicates()
)

# Display changes
df_128a59195de3_user.printSchema()