<a href="https://colab.research.google.com/github/Amt15/Pyspark/blob/main/pysparkBuiltInFunctions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark -q
!pip install findspark -q


[K     |████████████████████████████████| 281.3 MB 43 kB/s 
[K     |████████████████████████████████| 199 kB 52.5 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import findspark
findspark.init()
findspark.find()

'/usr/local/lib/python3.7/dist-packages/pyspark'

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .appName("Built in function pract")\
        .master("local[*]")\
        .enableHiveSupport()\
        .getOrCreate()

In [4]:
data = [("James","M",60000),("Michael","M",70000),
        ("Robert",None,400000),("Maria","F",500000),
        ("Jen","",None)]

columns = ["name","gender","salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  null|400000|
|  Maria|     F|500000|
|    Jen|      |  null|
+-------+------+------+



In [13]:
from pyspark.sql.functions import when,lit,col
df2 = df.withColumn("new_gender", when(df.gender == "M","Male")
                                 .when(df.gender == "F","Female")
                                 .when(df.gender.isNull() ,"")
                                 .otherwise(df.gender))

df2.show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [14]:
df2.drop(df.gender).show()

+-------+------+----------+
|   name|salary|new_gender|
+-------+------+----------+
|  James| 60000|      Male|
|Michael| 70000|      Male|
| Robert|400000|          |
|  Maria|500000|    Female|
|    Jen|  null|          |
+-------+------+----------+



In [15]:
df2.withColumn("chandan", lit("married")).show()

+-------+------+------+----------+-------+
|   name|gender|salary|new_gender|chandan|
+-------+------+------+----------+-------+
|  James|     M| 60000|      Male|married|
|Michael|     M| 70000|      Male|married|
| Robert|  null|400000|          |married|
|  Maria|     F|500000|    Female|married|
|    Jen|      |  null|          |married|
+-------+------+------+----------+-------+



In [16]:
df.select(col("*"),when(df.gender == "M","Male")
                  .when(df.gender == "F","Female")
                  .when(df.gender.isNull() ,"")
                  .otherwise(df.gender).alias("new_gender")).show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



+-------+
|   name|
+-------+
|  James|
|Michael|
| Robert|
|  Maria|
|    Jen|
+-------+



In [23]:
from pyspark.sql import functions as f
df3 = df.withColumn("new_gender", f.expr("CASE WHEN gender = 'M' THEN 'Male' " + 
               "WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
               "ELSE gender END"))
df3.show(truncate=False)


+-------+------+------+----------+
|name   |gender|salary|new_gender|
+-------+------+------+----------+
|James  |M     |60000 |Male      |
|Michael|M     |70000 |Male      |
|Robert |null  |400000|          |
|Maria  |F     |500000|Female    |
|Jen    |      |null  |          |
+-------+------+------+----------+



In [26]:
df.select(col("*"), f.expr("CASE WHEN gender = 'M' THEN 'Male' " +
           "WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
           "ELSE gender END").alias("new_gender")).show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [30]:
df2.drop(df.gender).withColumnRenamed("new_gender","Gender").show()

+-------+------+------+
|   name|salary|Gender|
+-------+------+------+
|  James| 60000|  Male|
|Michael| 70000|  Male|
| Robert|400000|      |
|  Maria|500000|Female|
|    Jen|  null|      |
+-------+------+------+



In [33]:
df2.createOrReplaceTempView("EMP")

In [34]:
spark.sql("select * from EMP").show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [40]:
data = [(1,"20200828"),(2,"20180525")]
columns=["id","date"]
df=spark.createDataFrame(data,columns)
df.withColumn('year', f.substring('date', 1,4))\
    .withColumn('month', f.substring('date', 5,2))\
    .withColumn('day', f.substring('date', 7,2)).show()
    
df.printSchema()
df.show(truncate=False)

+---+--------+----+-----+---+
| id|    date|year|month|day|
+---+--------+----+-----+---+
|  1|20200828|2020|   08| 28|
|  2|20180525|2018|   05| 25|
+---+--------+----+-----+---+

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)

+---+--------+
|id |date    |
+---+--------+
|1  |20200828|
|2  |20180525|
+---+--------+



In [45]:
df.selectExpr('date', 'substring(date, 1,4) as year', \
                  'substring(date, 5,2) as month', \
                 'substring(date, 7,2) as day').show()

+--------+----+-----+---+
|    date|year|month|day|
+--------+----+-----+---+
|20200828|2020|   08| 28|
|20180525|2018|   05| 25|
+--------+----+-----+---+



In [46]:
df.withColumn('year', col('date').substr(1, 4))\
  .withColumn('month',col('date').substr(5, 2))\
  .withColumn('day', col('date').substr(7, 2)).show()

+---+--------+----+-----+---+
| id|    date|year|month|day|
+---+--------+----+-----+---+
|  1|20200828|2020|   08| 28|
|  2|20180525|2018|   05| 25|
+---+--------+----+-----+---+



In [59]:
df.selectExpr('substr("date",1,4) as year').show()

+----+
|year|
+----+
|date|
|date|
+----+



In [63]:

df2.show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+



In [72]:

address = [(1,"14851 Jeffrey Rd","DE"),
    (2,"43421 Margarita St","NY"),
    (3,"13111 Siemon Ave","CA")]
df =spark.createDataFrame(address,["id","address","state"])
df.show()

+---+------------------+-----+
| id|           address|state|
+---+------------------+-----+
|  1|  14851 Jeffrey Rd|   DE|
|  2|43421 Margarita St|   NY|
|  3|  13111 Siemon Ave|   CA|
+---+------------------+-----+



In [85]:
df.withColumn('address', f.regexp_replace('address', 'Rd', 'Road'))\
  .withColumn('address',f.regexp_replace('address', 'St', 'Street'))\
  .withColumn('address',f.regexp_replace('address','Ave','Avenue'))\
  .show(truncate=False)

+---+----------------------+-----+
|id |address               |state|
+---+----------------------+-----+
|1  |14851 Jeffrey Road    |DE   |
|2  |43421 Margarita Street|NY   |
|3  |13111 Siemon Avenue   |CA   |
+---+----------------------+-----+



In [86]:
df.withColumn('address', 
    when(df.address.endswith('Rd'),f.regexp_replace(df.address,'Rd','Road')) \
   .when(df.address.endswith('St'),f.regexp_replace(df.address,'St','Street')) \
   .when(df.address.endswith('Ave'),f.regexp_replace(df.address,'Ave','Avenue')) \
   .otherwise(df.address)) \
   .show(truncate=False)

+---+----------------------+-----+
|id |address               |state|
+---+----------------------+-----+
|1  |14851 Jeffrey Road    |DE   |
|2  |43421 Margarita Street|NY   |
|3  |13111 Siemon Avenue   |CA   |
+---+----------------------+-----+

