In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, explode

# Obtain the Spark session for this notebook (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("MyPySparkApp")
    .getOrCreate()
)

# #### Merge two DataFrames whose column sets differ

# First frame: student rows that carry a "Marks" column.
# Only column names are supplied as the schema.
columns = ["ID", "Student_Name", "Department_Name", "City", "Marks"]
simpleData = [
    (1, "Sagar", "CSE", "UP", 80),
    (2, "Shivam", "IT", "MP", 86),
    (3, "Muni", "Mech", "AP", 70),
]
df1 = spark.createDataFrame(simpleData, columns)
df1.show()

# Second frame: same leading columns, but without "Marks".
columns_2 = ["ID", "Student_Name", "Department_Name", "City"]
simpleData_2 = [
    (5, "Raj", "CSE", "HP"),
    (7, "Kunal", "Mech", "Rajasthan"),
]
df2 = spark.createDataFrame(simpleData_2, columns_2)
df2.show()

# Union by column name rather than position. allowMissingColumns=True lets
# df2 lack "Marks"; its rows get NULL in that column.
udf = df1.unionByName(df2, allowMissingColumns=True)
udf.show()

+---+------------+---------------+----+-----+
| ID|Student_Name|Department_Name|City|Marks|
+---+------------+---------------+----+-----+
|  1|       Sagar|            CSE|  UP|   80|
|  2|      Shivam|             IT|  MP|   86|
|  3|        Muni|           Mech|  AP|   70|
+---+------------+---------------+----+-----+

+---+------------+---------------+---------+
| ID|Student_Name|Department_Name|     City|
+---+------------+---------------+---------+
|  5|         Raj|            CSE|       HP|
|  7|       Kunal|           Mech|Rajasthan|
+---+------------+---------------+---------+

+---+------------+---------------+---------+-----+
| ID|Student_Name|Department_Name|     City|Marks|
+---+------------+---------------+---------+-----+
|  1|       Sagar|            CSE|       UP|   80|
|  2|      Shivam|             IT|       MP|   86|
|  3|        Muni|           Mech|       AP|   70|
|  5|         Raj|            CSE|       HP| NULL|
|  7|       Kunal|           Mech|Rajasthan| NULL|
+---+------------+---------------+---------+-----+

In [15]:
# #### Using explode() to turn array elements into separate rows

# Each record pairs an ID with a list of name parts.
columns = ["ID", "Name"]
simpleData = [
    (1, ["Sagar", "Prajapati"]),
    (2, ["Shivam", "Gupta"]),
    (3, ["Kunal", "Verma"]),
    (4, ["Kim"]),
]

sdf = spark.createDataFrame(simpleData, columns)
sdf.show()

# explode() produces one output row per element of the "Name" array;
# the element is placed in the new "new_name" column.
ndf = sdf.withColumn(
    "new_name",
    explode(col("Name")),
)
ndf.select("id", "new_name").show()

+---+------------------+
| ID|              Name|
+---+------------------+
|  1|[Sagar, Prajapati]|
|  2|   [Shivam, Gupta]|
|  3|    [Kunal, Verma]|
|  4|             [Kim]|
+---+------------------+

+---+---------+
| id| new_name|
+---+---------+
|  1|    Sagar|
|  1|Prajapati|
|  2|   Shivam|
|  2|    Gupta|
|  3|    Kunal|
|  3|    Verma|
|  4|      Kim|
+---+---------+



In [28]:
# Find valid mobile numbers
#
# Two views of the same data are shown:
#   1. "nmobile": the mobile value with every non-digit character removed.
#   2. Only the rows whose original mobile value consists purely of digits.

from pyspark.sql.functions import regexp_replace

dataset1 = [
    (2, 'shivam', 'u9886755443244B'),
    (3, 'abhi', 'u886755kp244B'),
    (4, 'shwetha', 'u7886755gt244B'),
    (5, 'jack', '7886755244'),
]
columns = ["id", "name", 'mobile']

mobileDF = spark.createDataFrame(dataset1, schema=columns)
mobileDF.show()

# Strip everything that is not 0-9 to obtain a digits-only version.
mobileDF.withColumn(
    "nmobile",
    regexp_replace(col("mobile"), "[^0-9]", ""),
).show()

# Keep rows whose mobile is made up of digits only.
# "+" (one or more) is used instead of "*" so that an empty string is not
# treated as a valid number.
# NOTE(review): length is not checked here; tighten to e.g. "^[0-9]{10}$"
# if a fixed-length number is required - confirm the expected format.
mobileDF.filter(col("mobile").rlike("^[0-9]+$")).show()


+---+-------+---------------+
| id|   name|         mobile|
+---+-------+---------------+
|  2| shivam|u9886755443244B|
|  3|   abhi|  u886755kp244B|
|  4|shwetha| u7886755gt244B|
|  5|   jack|     7886755244|
+---+-------+---------------+

+---+-------+---------------+-------------+
| id|   name|         mobile|      nmobile|
+---+-------+---------------+-------------+
|  2| shivam|u9886755443244B|9886755443244|
|  3|   abhi|  u886755kp244B|    886755244|
|  4|shwetha| u7886755gt244B|   7886755244|
|  5|   jack|     7886755244|   7886755244|
+---+-------+---------------+-------------+

+---+----+----------+
| id|name|    mobile|
+---+----+----------+
|  5|jack|7886755244|
+---+----+----------+

