In [3]:
# Import SparkSession
from pyspark.sql import SparkSession

# Build (or reuse, if one is already active) the notebook-wide SparkSession.
# "local[3]" runs Spark locally with three worker threads.
# NOTE(review): with a local[N] master the executors presumably share the
# driver JVM, so spark.executor.memory may have no practical effect here --
# confirm whether spark.driver.memory was intended.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("Alex_Spark_POC")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

In [5]:
# Read two settings back from the session's runtime configuration:
# the shuffle partition count and the executor memory set on the builder.
partions = spark.conf.get("spark.sql.shuffle.partitions")
memory = spark.conf.get("spark.executor.memory")

# One value per line, shuffle partitions first.
print(partions, memory, sep="\n")

200
512m


In [2]:
# Distribute a small in-memory list of (language, number) tuples as an RDD.
dataList = [
    ("Java", 20000),
    ("Python", 100000),
    ("Scala", 3000),
]
rdd = spark.sparkContext.parallelize(dataList)

# Convert the RDD to a DataFrame, supplying the column names explicitly;
# the column types are inferred from the tuple values.
LanguageColumns = ["Language", "Language_id"]
rddDF = rdd.toDF(LanguageColumns)

# Inspect the inferred schema, then the rows.
rddDF.printSchema()
rddDF.show()

                                                                                

root
 |-- Language: string (nullable = true)
 |-- Language_id: long (nullable = true)

+--------+-----------+
|Language|Language_id|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [3]:
# Sample person records. Field order matches `columns` below.
# Empty strings stand in for a missing middle or last name, and `dob` is
# supplied as a plain "yyyy-MM-dd" string rather than a date.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]

# Only column names are given, so Spark infers each column's type from the data.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]
df = spark.createDataFrame(data=data, schema=columns)

In [4]:
# Inspect the person DataFrame: inferred column names and types first ...
df.printSchema()

# ... then its rows.
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



                                                                                

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [5]:
# Materialise the Spark DataFrame as a pandas DataFrame.
# toPandas() collects every row into the driver's memory, so this is only
# appropriate for small results such as this five-row sample.
pandasDF = df.toPandas()

# Plain-text rendering of the pandas frame.
print(pandasDF)

  firstname middlename  lastname         dob gender  salary
0     James                Smith  1991-04-01      M    3000
1   Michael       Rose            2000-05-19      M    4000
2    Robert             Williams  1978-09-05      M    4000
3     Maria       Anne     Jones  1967-12-01      F    4000
4       Jen       Mary     Brown  1980-02-17      F      -1


In [8]:
# Create temp view for group functions
# Register `df` under the name PERSON_DATA so it can be queried through spark.sql.
df.createOrReplaceTempView("PERSON_DATA")
# Row count per gender. The count column has no alias, so it appears under
# Spark's generated name (shown as count(1) in the output).
groupDF = spark.sql("SELECT gender, count(*) from PERSON_DATA group by gender")
groupDF.show()

# Per-gender summary: earliest dob, highest salary and number of people.
# `dob` is a string column; min() still yields the earliest date because the
# values are in yyyy-MM-dd form, which sorts chronologically as text.
# The trailing backslashes continue the string literal itself, so the leading
# indentation of the continued lines is part of the SQL text passed to Spark.
groupDF2 = spark.sql("SELECT gender, min(dob) as earliest_dob, max(salary) max_salary, \
                             count(*) person_counts \
                     from PERSON_DATA \
                     group by gender")
groupDF2.show()

+------+--------+
|gender|count(1)|
+------+--------+
|     M|       3|
|     F|       2|
+------+--------+

+------+------------+----------+-------------+
|gender|earliest_dob|max_salary|person_counts|
+------+------------+----------+-------------+
|     F|  1967-12-01|      4000|            2|
|     M|  1978-09-05|      4000|            3|
+------+------------+----------+-------------+



In [9]:
# Window Function
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Number the rows within each gender, lowest salary first.
# row_number() assigns distinct consecutive numbers even to tied rows, so an
# ordering on salary alone leaves the numbering of equal salaries (two rows
# share 4000 in the M partition) up to Spark's physical row order.
# `firstname` is added as a tie-breaker to make the numbering deterministic.
windowSpec = Window.partitionBy("gender").orderBy("salary", "firstname")

# truncate=False prints full cell values instead of cutting long strings.
df.withColumn("row_number", row_number().over(windowSpec)) \
    .show(truncate=False)

+---------+----------+--------+----------+------+------+----------+
|firstname|middlename|lastname|dob       |gender|salary|row_number|
+---------+----------+--------+----------+------+------+----------+
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |1         |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |2         |
|James    |          |Smith   |1991-04-01|M     |3000  |1         |
|Michael  |Rose      |        |2000-05-19|M     |4000  |2         |
|Robert   |          |Williams|1978-09-05|M     |4000  |3         |
+---------+----------+--------+----------+------+------+----------+

