In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Create (or reuse) the Spark session used for this analysis
spark = SparkSession.builder.appName("YouTubeStatisticsAnalysis").getOrCreate()

# Register the S3A credentials on the session's Hadoop configuration.
# NOTE(review): the keys are inline literals here (masked placeholders);
# consider loading real values from the environment or a credentials
# provider instead of committing them to the notebook.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", "**************")
hadoop_conf.set("fs.s3a.secret.key", "****************")

# Build the s3a:// URI of the CSV object
bucket_name = 'your_bucket_name'
object_key = 'object_key_name'
s3_uri = "s3a://{}/{}".format(bucket_name, object_key)

# Load the CSV into a DataFrame: the first line is the header and column
# types are inferred from the data
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(s3_uri)

# Count the total number of rows in the dataset
num_of_rows = df.count()

# A cell is treated as "missing" when it is null or, for string columns, an
# empty string. The '' comparison is limited to string columns so that
# numeric/date/boolean columns are never compared against a string literal.
string_columns = {name for name, dtype in df.dtypes if dtype == 'string'}
missing_flags = []
for col_name in df.columns:
    is_missing = col(col_name).isNull()
    if col_name in string_columns:
        is_missing = is_missing | (col(col_name) == '')
    missing_flags.append(is_missing)

if missing_flags:
    # A row has a missing cell when any of its per-column flags is true.
    row_has_missing = missing_flags[0]
    for flag in missing_flags[1:]:
        row_has_missing = row_has_missing | flag

    # One aggregation computes every per-column missing count plus the
    # row-level count, instead of one Spark job per column and an RDD pass.
    aggregates = [
        spark_sum(flag.cast('integer')).alias(f'missing_{index}')
        for index, flag in enumerate(missing_flags)
    ]
    aggregates.append(
        spark_sum(row_has_missing.cast('integer')).alias('rows_with_missing')
    )
    *column_missing_counts, rows_with_missing = df.agg(*aggregates).first()

    # sum() yields null on an empty dataset, hence the `or 0` fallbacks.
    # Count the number of rows with null or empty cells
    num_rows_with_null = rows_with_missing or 0
    # Count columns with null or empty cells
    num_columns_with_null = sum(
        1 for missing in column_missing_counts if (missing or 0) > 0
    )
else:
    # A DataFrame without columns has nothing that can be missing.
    num_rows_with_null = 0
    num_columns_with_null = 0

# Count the number of rows with all columns having values. Defined as the
# complement of the rows with a missing cell so that the two figures always
# add up to the total (the previous dropna() ignored empty strings).
# NOTE: NaN is counted as a value here, whereas dropna() treated it as missing.
num_complete_rows = num_of_rows - num_rows_with_null

# Snapshot the source columns once: every indicator below is derived from the
# original data columns only. (Re-reading df.columns after each select would
# also pick up the indicator columns added so far and build flags of flags.)
source_columns = list(df.columns)
string_columns = {name for name, dtype in df.dtypes if dtype == 'string'}

# Not used in this cell; retained in case later cells reference it.
non_null_cols = [col(col_name) for col_name in source_columns]

# For each source column, add a 0/1 flag: 1 when the cell is not null
non_null_counts = [
    col(c).isNotNull().cast('integer').alias(f'{c}_non_null_count')
    for c in source_columns
]

# For each source column, add a 0/1 flag: 1 when the cell is null
null_counts = [
    col(c).isNull().cast('integer').alias(f'{c}_null_count')
    for c in source_columns
]

# For each source column, add a 0/1 flag: 1 when the cell is null or an empty
# string. The test must be equality with '': contains('') is true for every
# non-null value. The '' comparison only applies to string columns; other
# types can only be missing by being null.
null_or_empty_counts = [
    (
        (col(c).isNull() | (col(c) == ''))
        if c in string_columns
        else col(c).isNull()
    ).cast('integer').alias(f'{c}_null_or_empty_count')
    for c in source_columns
]

df = df.select(
    *source_columns,
    *non_null_counts,
    *null_counts,
    *null_or_empty_counts,
)

# Report the dataset-level figures, one "label value" line per metric
report_lines = (
    ("Total number of records in dataset:", num_of_rows),
    ("Number of rows with null or empty cells:", num_rows_with_null),
    ("Number of columns with null or empty cells:", num_columns_with_null),
    ("Number of rows with all columns having values:", num_complete_rows),
)
for label, value in report_lines:
    print(label, value)





Total number of records in dataset: 995
Number of rows with null or empty cells: 1
Number of columns with null or empty cells: 1
Number of rows with all columns having values: 994
