In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, to_timestamp

# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("Clean CSV Data with Age Handling")
    .getOrCreate()
)

# Load the raw CSV, treating its first line as the column names
df = spark.read.csv("/home/jovyan/data/Social.csv", header=True)



In [2]:
# Inspect the schema as loaded: every column is still a string at this point
# (see the output below), so numeric and time fields need explicit conversion.
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- username: string (nullable = true)
 |-- age: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- date_created: string (nullable = true)
 |-- post_id: string (nullable = true)
 |-- post_text: string (nullable = true)
 |-- location: string (nullable = true)
 |-- post_timestamp: string (nullable = true)
 |-- shares: string (nullable = true)
 |-- angry: string (nullable = true)
 |-- haha: string (nullable = true)
 |-- like: string (nullable = true)
 |-- love: string (nullable = true)
 |-- sad: string (nullable = true)
 |-- wow: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- comment_timestamp: string (nullable = true)
 |-- comment_user_id: string (nullable = true)



In [3]:
# Convert fields to the correct data types.
# Every column arrives from the CSV reader as a string, so the numeric and time
# columns are converted here. All conversions are applied in ONE select()
# instead of one withColumn() per column: each withColumn() call adds another
# projection to the query plan, and the copy-pasted lines made it easy for the
# column list to drift.
INTEGER_COLUMNS = ["age", "shares", "angry", "haha", "like", "love", "sad", "wow"]
TIMESTAMP_COLUMNS = ["date_created", "post_timestamp", "comment_timestamp"]
TIMESTAMP_FORMAT = "yyyy-MM-dd HH:mm:ss"

# Map each column name to its converted expression. Resolving a column through
# df[name] fails immediately if the CSV is missing an expected column.
converted = {name: df[name].cast("integer") for name in INTEGER_COLUMNS}
converted.update(
    {name: to_timestamp(df[name], TIMESTAMP_FORMAT) for name in TIMESTAMP_COLUMNS}
)

# Rebuild the frame in its original column order; columns without a conversion
# pass through unchanged.
# NOTE(review): values that cannot be cast/parsed presumably become null here
# (Spark's non-ANSI behavior) — confirm for the Spark version in use.
df = df.select(
    [
        converted[name].alias(name) if name in converted else df[name]
        for name in df.columns
    ]
)



In [4]:
# Re-check the schema to confirm the type conversions above took effect
# (integer and timestamp columns in place of strings).
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- username: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- date_created: timestamp (nullable = true)
 |-- post_id: string (nullable = true)
 |-- post_text: string (nullable = true)
 |-- location: string (nullable = true)
 |-- post_timestamp: timestamp (nullable = true)
 |-- shares: integer (nullable = true)
 |-- angry: integer (nullable = true)
 |-- haha: integer (nullable = true)
 |-- like: integer (nullable = true)
 |-- love: integer (nullable = true)
 |-- sad: integer (nullable = true)
 |-- wow: integer (nullable = true)
 |-- tags: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- comment_timestamp: timestamp (nullable = true)
 |-- comment_user_id: string (nullable = true)



In [6]:
# Drop rows with missing critical fields like user_id, post_id
df_cleaned = df.na.drop(subset=["user_id", "post_id"])

# Fill null values in reaction columns with 0
df_cleaned = df_cleaned.fillna({
    "shares": 0,
    "angry": 0,
    "haha": 0,
    "like": 0,
    "love": 0,
    "sad": 0,
    "wow": 0
})

# Fill missing comments with an empty string
df_cleaned = df_cleaned.fillna({
    "comment_text": "",
    "comment_user_id": ""
})

# Calculate the (approximate) median age from the rows that are actually kept,
# so rows dropped above for a missing user_id/post_id do not influence the
# imputed value. 0.01 is the relative error accepted by approxQuantile.
# approxQuantile ignores nulls and returns an empty list when the column has
# no non-null values, so the result is checked before indexing into it.
age_quantiles = df_cleaned.approxQuantile("age", [0.5], 0.01)
age_median = age_quantiles[0] if age_quantiles else None

# Fill missing ages with the median age
if age_median is not None:
    # approxQuantile returns a float while age is an integer column;
    # convert explicitly rather than relying on an implicit cast in fillna.
    df_cleaned = df_cleaned.fillna({"age": int(age_median)})
else:
    print("No non-null ages found; missing ages were left as null.")

# Show the cleaned DataFrame
# NOTE(review): this prints the email and name columns untruncated — consider
# clearing the cell output before sharing the notebook.
df_cleaned.show(truncate=False)




+------------------------------------+--------------+---+------------------------+------+-------------+-------------------+------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+-------------------+------+-----+----+----+----+---+---+--------------------+-----------------------------------------------------+-------------------+------------------------------------+
|user_id                             |username      |age|email                   |gender|name         |date_created       |post_id                             |post_text                                                                                                                                                    |location|post_timestamp     |shares|angry|haha|like|love|sad|wow|tags                |comment_text                                         |comment_timestamp  |commen

In [7]:
# Save the cleaned data as CSV.
# Spark writes a DIRECTORY of part files at this path, not a single .csv file.
# mode("overwrite") replaces the output of a previous run; with the default
# save mode the write raises an error when the path already exists, which made
# this cell fail on every re-run of the notebook.
(
    df_cleaned.write
    .mode("overwrite")
    .option("header", True)
    .csv("/home/jovyan/data/Social_cleaned.csv")
)