<h3 align="center"><b> PySpark SQL Test </b></h3>

---

- Taking a random dataset with ~800k rows
- Scaling it up to ~50m rows
- Spark SQL queries 
    - Value counts on each column (excl. key col)
    - Joining count columns into single table 
- Transferring from Spark DF to Pandas DF  

In [1]:
import re, pandas as pd, pyspark

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

print(pyspark.__version__)

3.2.2


In [2]:
conf = (SparkConf()
         .setMaster("local")
         .setAppName("dev")
         .set("spark.executor.memory", "8g")
         .set("spark.sql.shuffle.partitions", "1000")
         .set("spark.default.parallelism", "1000")
         )

sc = SparkContext(conf = conf)
spark = SparkSession(sparkContext=sc)

In [3]:
df = spark.read.csv('../data/random.csv', header=True, inferSchema=True)

print(f"Rows: {df.count()}")
print(f"Columns: {len(df.columns)}")
print("Schema:")
df.printSchema()

Rows: 819200
Columns: 6
Schema:
root
 |-- iter: integer (nullable = true)
 |-- t1: integer (nullable = true)
 |-- t2: integer (nullable = true)
 |-- t3: integer (nullable = true)
 |-- t4: integer (nullable = true)
 |-- t5: integer (nullable = true)



In [4]:
# Scale up DF
for _ in range(6):
    df = df.union(df)

In [5]:
use = (100 * df.sample(fraction=0.01).toPandas().memory_usage(deep=True)).sum() / 1024**2

print(f"Rows: {df.count()}")
print(f'Data frame memory : {round(use, 1)} MB')

Rows: 52428800
Data frame memory : 1196.7 MB


In [6]:
# Spark SQL aggregation
df.createOrReplaceTempView('table')
iter_cols = [i for i in df.columns if re.match('t', i)]

for col in iter_cols:
    temp_query = spark.sql(f'''
            SELECT {col} as value, COUNT({col}) as {col} 
            FROM table
            GROUP BY {col}; 
            ''')

    if col == iter_cols[0]:
        query = temp_query
    else: 
        query = query.join(temp_query, on='value', how='outer')

In [7]:
query.show(truncate=False)

+-----+------+------+------+------+------+
|value|t1    |t2    |t3    |t4    |t5    |
+-----+------+------+------+------+------+
|1    |532480|null  |null  |null  |null  |
|2    |495616|532480|null  |null  |null  |
|3    |479232|495616|532480|null  |null  |
|4    |565248|479232|495616|532480|null  |
|5    |520192|565248|479232|495616|532480|
|6    |479232|520192|565248|479232|495616|
|7    |606208|479232|520192|565248|479232|
|8    |516096|606208|479232|520192|565248|
|9    |491520|516096|606208|479232|520192|
|10   |573440|491520|516096|606208|479232|
|11   |520192|573440|491520|516096|606208|
|12   |532480|520192|573440|491520|516096|
|13   |499712|532480|520192|573440|491520|
|14   |589824|499712|532480|520192|573440|
|15   |507904|589824|499712|532480|520192|
|16   |466944|507904|589824|499712|532480|
|17   |540672|466944|507904|589824|499712|
|18   |544768|540672|466944|507904|589824|
|19   |466944|544768|540672|466944|507904|
|20   |544768|466944|544768|540672|466944|
+-----+----

In [8]:
spark.stop()