In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ReadParquetExample") \
    .getOrCreate()


In [4]:
df = spark.read.parquet("parquet/20250323/part-00000-70719350-f19b-4f49-9c97-36dcad7803bd-c000.snappy.parquet")

In [6]:
df.printSchema()   # shows column names and types
# Show 5 rows, do not truncate any column, and print each row vertically
df.show(5, truncate=False, vertical=True)


root
 |-- SOURCEURL: string (nullable = true)
 |-- first(SQLDATE): string (nullable = true)
 |-- EventCount: long (nullable = true)
 |-- GLOBALEVENTIDs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- GoldsteinMean: double (nullable = true)
 |-- GoldsteinMedian: double (nullable = true)
 |-- MedianPGoldstein: double (nullable = true)
 |-- ToneMean: double (nullable = true)
 |-- MedianAvgTone: double (nullable = true)
 |-- EventCodes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Cameos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- All_descriptions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ActorPairs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Actor1Name: string (nullable = true)
 |    |    |-- Actor2Name: string (nullable = true)
 |-- GeoPairs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 

In [8]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, sum, when

# Build a combined predicate: any column IS NULL
any_null = None
for c in df.columns:
    is_null_c = col(c).isNull()
    any_null = is_null_c if any_null is None else any_null | is_null_c

# Count rows matching it
rows_with_nulls = df.filter(any_null).count()
print(f"Rows with at least one null: {rows_with_nulls}")



Rows with at least one null: 0


In [10]:
df.filter(any_null).limit(10).show(truncate=False, vertical=True)


(0 rows)



In [11]:
# 1) Number of rows
row_count = df.count()
print(f"Total rows: {row_count}")

# 2) Number of columns
num_columns = len(df.columns)
print(f"Total columns: {num_columns}")


Total rows: 14168
Total columns: 14


In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, countDistinct, desc

cols = df.columns

summary_rows = []
for c in cols:
    non_null = df.filter(col(c).isNotNull()).count()
    distinct = df.select(countDistinct(c)).first()[0]
    top = (
        df.groupBy(c)
          .count()
          .orderBy(desc("count"))
          .limit(1)
          .collect()
    )
    if top:
        top_val, top_freq = top[0][c], top[0]["count"]
        # **coerce to string** so every top_val is a Python str
        top_val = str(top_val)
    else:
        top_val, top_freq = None, 0

    summary_rows.append((c, non_null, distinct, top_val, top_freq))

# Now Spark can infer a consistent schema: all top_val fields are strings
summary_df = spark.createDataFrame(
    summary_rows,
    schema=["column", "non_null", "distinct", "top_value", "top_freq"]
)
summary_df.show(truncate=False)


+----------------+--------+--------+----------------------------------------------------------------------------+--------+
|column          |non_null|distinct|top_value                                                                   |top_freq|
+----------------+--------+--------+----------------------------------------------------------------------------+--------+
|SOURCEURL       |14168   |14168   |http://www.darientimes.com/opinion/article/tong-ui-risk-utility-20236533.php|1       |
|first(SQLDATE)  |14168   |6       |20250323                                                                    |13536   |
|EventCount      |14168   |69      |1                                                                           |3650    |
|GLOBALEVENTIDs  |14168   |14168   |['1233752629', '1233752640', '1233752641']                                  |1       |
|GoldsteinMean   |14168   |3508    |0.0                                                                         |747     |
|GoldsteinMedian

In [19]:
from pyspark.sql.functions import col

# Rename the column so it’s easier to refer to:
unique_dates_df = df.select(col("first(SQLDATE)").alias("SQLDATE")).distinct().orderBy("SQLDATE")       # optional: sort them chronologically

unique_dates_df.show(truncate=False)


+--------+
|SQLDATE |
+--------+
|20150326|
|20240323|
|20250221|
|20250316|
|20250322|
|20250323|
+--------+



In [20]:
from pyspark.sql.functions import regexp_extract, col

# This regex grabs the part between “://” (optional) and the next “/” (or end-of-string).
df2 = df.withColumn(
    "domain",
    regexp_extract(
        col("SOURCEURL"),
        r'^(?:https?://)?(?:www\.)?([^/]+)',
        1
    )
)

df2.select("SOURCEURL", "domain").show(truncate=False)


+----------------------------------------------------------------------------------------------------------------+-----------------------+
|SOURCEURL                                                                                                       |domain                 |
+----------------------------------------------------------------------------------------------------------------+-----------------------+
|http://baystreet.ca/articles/economiccommentary/109783/Canadian-PM-Mark-Carney-Expected-To-Call-Snap-Election   |baystreet.ca           |
|http://businessghana.com/site/news/general/324856/WaterAid-demands-action-to-tackle-crisis                      |businessghana.com      |
|http://businessghana.com/site/news/general/324857/Inculcate-Japanese-edu-style-in-schools                       |businessghana.com      |
|http://businessghana.com/site/news/general/324858/ECOWAS-Court-Conducts-Preparatory-Meeting                     |businessghana.com      |
|http://colombogazette.com/

In [21]:
df2.printSchema()   # shows column names and types
# Show 5 rows, do not truncate any column, and print each row vertically
df2.show(5, truncate=False, vertical=True)

root
 |-- SOURCEURL: string (nullable = true)
 |-- first(SQLDATE): string (nullable = true)
 |-- EventCount: long (nullable = true)
 |-- GLOBALEVENTIDs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- GoldsteinMean: double (nullable = true)
 |-- GoldsteinMedian: double (nullable = true)
 |-- MedianPGoldstein: double (nullable = true)
 |-- ToneMean: double (nullable = true)
 |-- MedianAvgTone: double (nullable = true)
 |-- EventCodes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Cameos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- All_descriptions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ActorPairs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Actor1Name: string (nullable = true)
 |    |    |-- Actor2Name: string (nullable = true)
 |-- GeoPairs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 

In [28]:
from pyspark.sql.functions import count

# 1) Cache and trigger an action
df2.cache()
df2.count()               # materializes the entire DF in cache

# 2) Now overwrite the same location
df2.write.mode("overwrite") \
   .parquet("parquet/20250323")


Py4JJavaError: An error occurred while calling o851.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 368.0 failed 1 times, most recent failure: Lost task 0.0 in stage 368.0 (TID 233) (np-light-indigo-populus-db8d798cf-sbsv5 executor driver): org.apache.spark.SparkFileNotFoundException: File file:/home/jovyan/shared/Taher/parquet/20250323/part-00000-70719350-f19b-4f49-9c97-36dcad7803bd-c000.snappy.parquet does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:780)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:220)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:593)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.hasNext(InMemoryRelation.scala:119)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anon$2.hasNext(InMemoryRelation.scala:288)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1597)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1524)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1588)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:379)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkFileNotFoundException: File file:/home/jovyan/shared/Taher/parquet/20250323/part-00000-70719350-f19b-4f49-9c97-36dcad7803bd-c000.snappy.parquet does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:780)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:220)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:593)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.hasNext(InMemoryRelation.scala:119)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anon$2.hasNext(InMemoryRelation.scala:288)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1597)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1524)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1588)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:379)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)
