<a href="https://colab.research.google.com/github/Amt15/Pyspark/blob/main/pysparkBuiltInFunctions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark -q
!pip install findspark -q


[K     |████████████████████████████████| 281.3 MB 55 kB/s 
[K     |████████████████████████████████| 199 kB 87.6 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import findspark
findspark.init()
findspark.find()

'/usr/local/lib/python3.7/dist-packages/pyspark'

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .appName("Built in function pract")\
        .master("local[*]")\
        .enableHiveSupport()\
        .getOrCreate()

In [4]:
data = [("James","M",60000),("Michael","M",70000),
        ("Robert",None,400000),("Maria","F",500000),
        ("Jen","",None)]

columns = ["name","gender","salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  null|400000|
|  Maria|     F|500000|
|    Jen|      |  null|
+-------+------+------+



In [5]:
from pyspark.sql.functions import when,lit,col
df2 = df.withColumn("new_gender", when(df.gender == "M","Male")
                                 .when(df.gender == "F","Female")
                                 .when(df.gender.isNull() ,"")
                                 .otherwise(df.gender))

df2.show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [6]:
df2.drop(df.gender).show()

+-------+------+----------+
|   name|salary|new_gender|
+-------+------+----------+
|  James| 60000|      Male|
|Michael| 70000|      Male|
| Robert|400000|          |
|  Maria|500000|    Female|
|    Jen|  null|          |
+-------+------+----------+



In [7]:
df2.withColumn("chandan", lit("married")).show()

+-------+------+------+----------+-------+
|   name|gender|salary|new_gender|chandan|
+-------+------+------+----------+-------+
|  James|     M| 60000|      Male|married|
|Michael|     M| 70000|      Male|married|
| Robert|  null|400000|          |married|
|  Maria|     F|500000|    Female|married|
|    Jen|      |  null|          |married|
+-------+------+------+----------+-------+



In [8]:
df.select(col("*"),when(df.gender == "M","Male")
                  .when(df.gender == "F","Female")
                  .when(df.gender.isNull() ,"")
                  .otherwise(df.gender).alias("new_gender")).show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [9]:
from pyspark.sql import functions as f
df3 = df.withColumn("new_gender", f.expr("CASE WHEN gender = 'M' THEN 'Male' " + 
               "WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
               "ELSE gender END"))
df3.show(truncate=False)


+-------+------+------+----------+
|name   |gender|salary|new_gender|
+-------+------+------+----------+
|James  |M     |60000 |Male      |
|Michael|M     |70000 |Male      |
|Robert |null  |400000|          |
|Maria  |F     |500000|Female    |
|Jen    |      |null  |          |
+-------+------+------+----------+



In [10]:
df.select(col("*"), f.expr("CASE WHEN gender = 'M' THEN 'Male' " +
           "WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
           "ELSE gender END").alias("new_gender")).show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [11]:
df2.drop(df.gender).withColumnRenamed("new_gender","Gender").show()

+-------+------+------+
|   name|salary|Gender|
+-------+------+------+
|  James| 60000|  Male|
|Michael| 70000|  Male|
| Robert|400000|      |
|  Maria|500000|Female|
|    Jen|  null|      |
+-------+------+------+



In [12]:
df2.createOrReplaceTempView("EMP")

In [13]:
spark.sql("select * from EMP").show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [14]:
data = [(1,"20200828"),(2,"20180525")]
columns=["id","date"]
df=spark.createDataFrame(data,columns)
df.withColumn('year', f.substring('date', 1,4))\
    .withColumn('month', f.substring('date', 5,2))\
    .withColumn('day', f.substring('date', 7,2)).show()
    
df.printSchema()
df.show(truncate=False)

+---+--------+----+-----+---+
| id|    date|year|month|day|
+---+--------+----+-----+---+
|  1|20200828|2020|   08| 28|
|  2|20180525|2018|   05| 25|
+---+--------+----+-----+---+

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)

+---+--------+
|id |date    |
+---+--------+
|1  |20200828|
|2  |20180525|
+---+--------+



In [15]:
df.selectExpr('date', 'substring(date, 1,4) as year', \
                  'substring(date, 5,2) as month', \
                 'substring(date, 7,2) as day').show()

+--------+----+-----+---+
|    date|year|month|day|
+--------+----+-----+---+
|20200828|2020|   08| 28|
|20180525|2018|   05| 25|
+--------+----+-----+---+



In [16]:
df.withColumn('year', col('date').substr(1, 4))\
  .withColumn('month',col('date').substr(5, 2))\
  .withColumn('day', col('date').substr(7, 2)).show()

+---+--------+----+-----+---+
| id|    date|year|month|day|
+---+--------+----+-----+---+
|  1|20200828|2020|   08| 28|
|  2|20180525|2018|   05| 25|
+---+--------+----+-----+---+



In [18]:

df2.show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [19]:

address = [(1,"14851 Jeffrey Rd","DE"),
    (2,"43421 Margarita St","NY"),
    (3,"13111 Siemon Ave","CA")]
df =spark.createDataFrame(address,["id","address","state"])
df.show()

+---+------------------+-----+
| id|           address|state|
+---+------------------+-----+
|  1|  14851 Jeffrey Rd|   DE|
|  2|43421 Margarita St|   NY|
|  3|  13111 Siemon Ave|   CA|
+---+------------------+-----+



In [20]:
df.withColumn('address', f.regexp_replace('address', 'Rd', 'Road'))\
  .withColumn('address',f.regexp_replace('address', 'St', 'Street'))\
  .withColumn('address',f.regexp_replace('address','Ave','Avenue'))\
  .show(truncate=False)

+---+----------------------+-----+
|id |address               |state|
+---+----------------------+-----+
|1  |14851 Jeffrey Road    |DE   |
|2  |43421 Margarita Street|NY   |
|3  |13111 Siemon Avenue   |CA   |
+---+----------------------+-----+



In [21]:
df.withColumn('address', 
    when(df.address.endswith('Rd'),f.regexp_replace(df.address,'Rd','Road')) \
   .when(df.address.endswith('St'),f.regexp_replace(df.address,'St','Street')) \
   .when(df.address.endswith('Ave'),f.regexp_replace(df.address,'Ave','Avenue')) \
   .otherwise(df.address)) \
   .show(truncate=False)

+---+----------------------+-----+
|id |address               |state|
+---+----------------------+-----+
|1  |14851 Jeffrey Road    |DE   |
|2  |43421 Margarita Street|NY   |
|3  |13111 Siemon Avenue   |CA   |
+---+----------------------+-----+



In [22]:
# pyspark.sql.functions.split(str, pattern, limit=-1)

data = [("James, A, Smith","2018","M",3000),
            ("Michael, Rose, Jones","2010","M",4000),
            ("Robert,K,Williams","2010","M",4000),
            ("Maria,Anne,Jones","2005","F",4000),
            ("Jen,Mary,Brown","2010","",-1)
            ]

columns=["name","dob_year","gender","salary"]
df=spark.createDataFrame(data,columns)
df.printSchema()


root
 |-- name: string (nullable = true)
 |-- dob_year: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [23]:
df2 = df.select(f.split(col("name"),",").alias("NameArray")) \
    .drop("name")
df2.printSchema()
df2.show()

root
 |-- NameArray: array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+
|           NameArray|
+--------------------+
| [James,  A,  Smith]|
|[Michael,  Rose, ...|
|[Robert, K, Willi...|
|[Maria, Anne, Jones]|
|  [Jen, Mary, Brown]|
+--------------------+



In [24]:
df.createOrReplaceTempView("PERSON")
spark.sql("select SPLIT(name,',') as NameArray from PERSON") \
    .show()

+--------------------+
|           NameArray|
+--------------------+
| [James,  A,  Smith]|
|[Michael,  Rose, ...|
|[Robert, K, Willi...|
|[Maria, Anne, Jones]|
|  [Jen, Mary, Brown]|
+--------------------+



Use " to_timestamp() " function to convert String to Timestamp (TimestampType) in PySpark. The converted time would be in a default format of MM-dd-yyyy HH:mm:ss.SSS

- Syntax: to_timestamp(timestampString:Column) 
- Syntax: to_timestamp(timestampString:Column,format:String) 
 

In [28]:
df=spark.createDataFrame(
        data = [ ("1","2019-06-24 12:01:19.000")],
        schema=["id","input_timestamp"])
df.printSchema()

#Timestamp String to DateType
df2=df.withColumn("timestamp",f.to_timestamp("input_timestamp"))
df2.show(truncate=False)  

root
 |-- id: string (nullable = true)
 |-- input_timestamp: string (nullable = true)

+---+-----------------------+-------------------+
|id |input_timestamp        |timestamp          |
+---+-----------------------+-------------------+
|1  |2019-06-24 12:01:19.000|2019-06-24 12:01:19|
+---+-----------------------+-------------------+



In [29]:
# Using Cast to convert TimestampType to DateType
df2.withColumn('timestamp_string', \
         f.to_timestamp('timestamp').cast('string')) \
  .show(truncate=False)

+---+-----------------------+-------------------+-------------------+
|id |input_timestamp        |timestamp          |timestamp_string   |
+---+-----------------------+-------------------+-------------------+
|1  |2019-06-24 12:01:19.000|2019-06-24 12:01:19|2019-06-24 12:01:19|
+---+-----------------------+-------------------+-------------------+



In [32]:
df.select(f.to_timestamp(lit('06-24-2019 12:01:19.000'),'MM-dd-yyyy HH:mm:ss.SSSS')) \
  .show(truncate=False)

+---------------------------------------------------------------+
|to_timestamp(06-24-2019 12:01:19.000, MM-dd-yyyy HH:mm:ss.SSSS)|
+---------------------------------------------------------------+
|2019-06-24 12:01:19                                            |
+---------------------------------------------------------------+



In [36]:
#SQL string to TimestampType
spark.sql("select to_timestamp('2019-06-24 12:01:19.000') as timestamp").show()
#SQL CAST timestamp string to TimestampType
spark.sql("select timestamp('2019-06-24 12:01:19.000') as timestamp").show()
#SQL Custom string to TimestampType
spark.sql("select to_timestamp('06-24-2019 12:01:19.000','MM-dd-yyyy HH:mm:ss.SSSS') as timestamp").show()

+-------------------+
|          timestamp|
+-------------------+
|2019-06-24 12:01:19|
+-------------------+

+-------------------+
|          timestamp|
+-------------------+
|2019-06-24 12:01:19|
+-------------------+

+-------------------+
|          timestamp|
+-------------------+
|2019-06-24 12:01:19|
+-------------------+



# PySpark functions provide to_date() function to convert timestamp to date (DateType), this ideally achieved by just truncating the time part from the Timestamp column.


- Syntax: to_date(timestamp_column)
- Syntax: to_date(timestamp_column,format)

In [39]:
#Timestamp String to DateType
df.withColumn("date_type",f.to_date("input_timestamp")) \
  .show(truncate=False)

#Timestamp Type to DateType
df.withColumn("date_type",f.to_date(f.current_timestamp())) \
  .show(truncate=False) 

+---+-----------------------+----------+
|id |input_timestamp        |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24|
+---+-----------------------+----------+

+---+-----------------------+----------+
|id |input_timestamp        |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2022-07-10|
+---+-----------------------+----------+



In [41]:
#Custom Timestamp format to DateType
df.select(f.to_date(f.lit('06-24-2019 12:01:19.000'),'MM-dd-yyyy HH:mm:ss.SSSS')) \
  .show()

+----------------------------------------------------------+
|to_date(06-24-2019 12:01:19.000, MM-dd-yyyy HH:mm:ss.SSSS)|
+----------------------------------------------------------+
|                                                2019-06-24|
+----------------------------------------------------------+



In [43]:
#Timestamp type to DateType
df.withColumn("ts",f.to_timestamp(f.col("input_timestamp"))) \
  .withColumn("datetype",f.to_date(f.col("ts"))) \
  .show(truncate=False)

+---+-----------------------+-------------------+----------+
|id |input_timestamp        |ts                 |datetype  |
+---+-----------------------+-------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24 12:01:19|2019-06-24|
+---+-----------------------+-------------------+----------+



In [45]:
# Using Cast to convert Timestamp String to DateType
df.withColumn('date_type', f.col('input_timestamp').cast('date')) \
       .show(truncate=False)

# Using Cast to convert TimestampType to DateType
df.withColumn('date_type', f.to_timestamp('input_timestamp').cast('date')) \
  .show(truncate=False)

+---+-----------------------+----------+
|id |input_timestamp        |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24|
+---+-----------------------+----------+

+---+-----------------------+----------+
|id |input_timestamp        |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24|
+---+-----------------------+----------+



In [46]:
df=spark.createDataFrame([["1"]],["id"])
df.select(f.current_date().alias("current_date"), \
      f.date_format(f.current_timestamp(),"yyyy MM dd").alias("yyyy MM dd"), \
      f.date_format(f.current_timestamp(),"MM/dd/yyyy hh:mm").alias("MM/dd/yyyy"), \
      f.date_format(f.current_timestamp(),"yyyy MMM dd").alias("yyyy MMMM dd"), \
      f.date_format(f.current_timestamp(),"yyyy MMMM dd E").alias("yyyy MMMM dd E") \
   ).show()

+------------+----------+----------------+------------+----------------+
|current_date|yyyy MM dd|      MM/dd/yyyy|yyyy MMMM dd|  yyyy MMMM dd E|
+------------+----------+----------------+------------+----------------+
|  2022-07-10|2022 07 10|07/10/2022 11:31| 2022 Jul 10|2022 July 10 Sun|
+------------+----------+----------------+------------+----------------+



+------------+----------+----------------+------------+-------------------+
|current_date|yyyy_MM_dd|      MM_dd_yyyy|yyyy_MMMM_dd|     yyyy_MMMM_dd_E|
+------------+----------+----------------+------------+-------------------+
|  2022-07-10|2022 07 10|07/10/2022 11:32| 2022 Jul 10|2022 July 10 Sunday|
+------------+----------+----------------+------------+-------------------+



In [52]:

#SQL
spark.sql("select current_date() as current_date, "+
      "date_format(current_timestamp(),'yyyy MM dd') as yyyy_MM_dd, "+
      "date_format(current_timestamp(),'MM/dd/yyyy hh:mm') as MM_dd_yyyy, "+
      "date_format(current_timestamp(),'yyyy MMM dd') as yyyy_MMMM_dd, "+
      "date_format(current_timestamp(),'yyyy MMMM dd E') as yyyy_MMMM_dd_E,"+
      "date_format(current_timestamp(),'yyyy MMMM dd EEEE') as yyyy_MMMM_dd_E").show()


+------------+----------+----------------+------------+----------------+-------------------+
|current_date|yyyy_MM_dd|      MM_dd_yyyy|yyyy_MMMM_dd|  yyyy_MMMM_dd_E|     yyyy_MMMM_dd_E|
+------------+----------+----------------+------------+----------------+-------------------+
|  2022-07-10|2022 07 10|07/10/2022 11:35| 2022 Jul 10|2022 July 10 Sun|2022 July 10 Sunday|
+------------+----------+----------------+------------+----------------+-------------------+



In [54]:
data = [("1","2019-07-01"),("2","2019-06-24"),("3","2019-08-24")]
df=spark.createDataFrame(data=data,schema=["id","date"])

df.select(
      col("date"),
      f.current_date().alias("current_date"),
      f.datediff(f.current_date(),col("date")).alias("datediff")
    ).show()

+----------+------------+--------+
|      date|current_date|datediff|
+----------+------------+--------+
|2019-07-01|  2022-07-10|    1105|
|2019-06-24|  2022-07-10|    1112|
|2019-08-24|  2022-07-10|    1051|
+----------+------------+--------+



In [55]:
df.withColumn("datesDiff", f.datediff(f.current_date(),col("date"))) \
  .withColumn("montsDiff", f.months_between(f.current_date(),col("date"))) \
  .withColumn("montsDiff_round",f.round(f.months_between(f.current_date(),col("date")),2)) \
  .withColumn("yearsDiff",f.months_between(f.current_date(),col("date"))/lit(12)) \
  .withColumn("yearsDiff_round",f.round(f.months_between(f.current_date(),col("date"))/lit(12),2)) \
  .show()

+---+----------+---------+-----------+---------------+------------------+---------------+
| id|      date|datesDiff|  montsDiff|montsDiff_round|         yearsDiff|yearsDiff_round|
+---+----------+---------+-----------+---------------+------------------+---------------+
|  1|2019-07-01|     1105|36.29032258|          36.29|3.0241935483333333|           3.02|
|  2|2019-06-24|     1112| 36.5483871|          36.55|       3.045698925|           3.05|
|  3|2019-08-24|     1051| 34.5483871|          34.55|2.8790322583333334|           2.88|
+---+----------+---------+-----------+---------------+------------------+---------------+



In [57]:
data2 = [("1","07-01-2019"),("2","06-24-2019"),("3","08-24-2019")]  
df2=spark.createDataFrame(data=data2,schema=["id","date"])
df2.select(
    f.to_date(col("date"),"MM-dd-yyyy").alias("date"),
    f.current_date().alias("endDate")
    ).show()

+----------+----------+
|      date|   endDate|
+----------+----------+
|2019-07-01|2022-07-10|
|2019-06-24|2022-07-10|
|2019-08-24|2022-07-10|
+----------+----------+



# explode() and flatten()

In [61]:
arrayArrayData = [
  ("James",[["Java","Scala","C++"],["Spark","Java"]]),
  ("Michael",[["Spark","Java","C++"],["Spark","Java"]]),
  ("Robert",[["CSharp","VB"],["Spark","Python"]])
]

df = spark.createDataFrame(data=arrayArrayData, schema = ['name','subjects'])
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- subjects: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)

+-------+-----------------------------------+
|name   |subjects                           |
+-------+-----------------------------------+
|James  |[[Java, Scala, C++], [Spark, Java]]|
|Michael|[[Spark, Java, C++], [Spark, Java]]|
|Robert |[[CSharp, VB], [Spark, Python]]    |
+-------+-----------------------------------+



In [62]:
df.select(df.name,f.explode(df.subjects)).show(truncate=False)

+-------+------------------+
|name   |col               |
+-------+------------------+
|James  |[Java, Scala, C++]|
|James  |[Spark, Java]     |
|Michael|[Spark, Java, C++]|
|Michael|[Spark, Java]     |
|Robert |[CSharp, VB]      |
|Robert |[Spark, Python]   |
+-------+------------------+



In [65]:
df.select(df.name,f.flatten(df.subjects)).show(truncate=False)

+-------+-------------------------------+
|name   |flatten(subjects)              |
+-------+-------------------------------+
|James  |[Java, Scala, C++, Spark, Java]|
|Michael|[Spark, Java, C++, Spark, Java]|
|Robert |[CSharp, VB, Spark, Python]    |
+-------+-------------------------------+

