In [1]:
!pip install findspark
!pip install pyspark
import findspark
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *
spark = SparkSession.builder.getOrCreate() 
findspark.init()
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)



In [2]:
# Sample data: two string columns ("a", "b") mixing zero-padded digit strings
# with plain names.
sample_rows = [
    ("00100", "0010"),
    ("003000", "002"),
    ("05000", "00200"),
    ("asma", "Rehman"),
    ("zara", "Rehman"),
    ("saif", "Rehman"),
    ("4", "1"),
    ("7", "8"),
]
df = sqlContext.createDataFrame(sample_rows, ["a", "b"])
df.show()

+------+------+
|     a|     b|
+------+------+
| 00100|  0010|
|003000|   002|
| 05000| 00200|
|  asma|Rehman|
|  zara|Rehman|
|  saif|Rehman|
|     4|     1|
|     7|     8|
+------+------+



In [3]:
# Number the rows 1..N following the sort order of column "a".
# The window has no partition columns, so every row belongs to one ordered group.
ordered_by_a = Window.partitionBy().orderBy("a")
df = df.withColumn('row_num', row_number().over(ordered_by_a))
df.show()

+------+------+-------+
|     a|     b|row_num|
+------+------+-------+
| 00100|  0010|      1|
|003000|   002|      2|
| 05000| 00200|      3|
|     4|     1|      4|
|     7|     8|      5|
|  asma|Rehman|      6|
|  saif|Rehman|      7|
|  zara|Rehman|      8|
+------+------+-------+



In [4]:
# Keep rows numbered 1 through 5 (both ends included); with the sample data
# sorted by "a" these are the rows holding digit strings.
df_int = df.filter(
    (col('row_num') >= 1) & (col('row_num') <= 5)
)
df_int.show()

+------+-----+-------+
|     a|    b|row_num|
+------+-----+-------+
| 00100| 0010|      1|
|003000|  002|      2|
| 05000|00200|      3|
|     4|    1|      4|
|     7|    8|      5|
+------+-----+-------+



In [5]:
# Remove the leading zeros of columns "a" and "b", then cast the result to int.
#
# The pattern '^0+(?!$)' strips the run of leading zeros but never the last
# character of the value, so an all-zero string such as "0" or "000" becomes
# "0" and casts to 0. A plain '^0*' would reduce such values to the empty
# string, which casts to NULL (or raises when ANSI mode is enabled).
#
# NOTE(review): the first output column name contains spaces while the second
# uses underscores; both names are kept unchanged because they are the
# visible output schema of this cell.
LEADING_ZEROS = '^0+(?!$)'
df_int = (
    df_int
    .withColumn('a_remove leading zero', regexp_replace('a', LEADING_ZEROS, '').cast(IntegerType()))
    .withColumn('b_remove_leading_zero', regexp_replace('b', LEADING_ZEROS, '').cast(IntegerType()))
)

df_int.show(truncate = False)

+------+-----+-------+---------------------+---------------------+
|a     |b    |row_num|a_remove leading zero|b_remove_leading_zero|
+------+-----+-------+---------------------+---------------------+
|00100 |0010 |1      |100                  |10                   |
|003000|002  |2      |3000                 |2                    |
|05000 |00200|3      |5000                 |200                  |
|4     |1    |4      |4                    |1                    |
|7     |8    |5      |7                    |8                    |
+------+-----+-------+---------------------+---------------------+



In [6]:
# Print df_int's schema as a tree to confirm the two new columns are integers.
df_int.printSchema()

root
 |-- a: string (nullable = true)
 |-- b: string (nullable = true)
 |-- row_num: integer (nullable = true)
 |-- a_remove leading zero: integer (nullable = true)
 |-- b_remove_leading_zero: integer (nullable = true)



In [7]:
# Keep every row from number 6 onward (the name rows in the sample data).
# row_num runs from 1 to the number of rows, so an open-ended lower bound
# selects exactly the rows that between(6, df.count()) would, without
# launching an extra Spark job just to compute the upper limit.
df_string = df.where(col('row_num') >= 6)

In [8]:
# The row number was only needed to select the rows; remove it again.
df_string=df_string.drop('row_num')

In [9]:
# Display the rows kept in df_string.
df_string.show()

+----+------+
|   a|     b|
+----+------+
|asma|Rehman|
|saif|Rehman|
|zara|Rehman|
+----+------+



In [10]:
# Left-pad column "a" with '#' up to a total width of 8 characters.
df_string = df_string.withColumn(
    'a_leftpad',
    lpad(col('a'), 8, '#'),
)
df_string.show()

+----+------+---------+
|   a|     b|a_leftpad|
+----+------+---------+
|asma|Rehman| ####asma|
|saif|Rehman| ####saif|
|zara|Rehman| ####zara|
+----+------+---------+



In [11]:
# Right-pad column "a" with '#' up to a total width of 8 characters.
df_string = df_string.withColumn(
    'a_rightpad',
    rpad(col('a'), 8, '#'),
)
df_string.show()

+----+------+---------+----------+
|   a|     b|a_leftpad|a_rightpad|
+----+------+---------+----------+
|asma|Rehman| ####asma|  asma####|
|saif|Rehman| ####saif|  saif####|
|zara|Rehman| ####zara|  zara####|
+----+------+---------+----------+



In [12]:
# Left-pad again, this time with spaces; this overwrites the '#'-padded
# 'a_leftpad' column created earlier.
space_left_padded = lpad(col('a'), 8, ' ')
df_string = df_string.withColumn('a_leftpad', space_left_padded)
df_string.show(truncate = False)

+----+------+---------+----------+
|a   |b     |a_leftpad|a_rightpad|
+----+------+---------+----------+
|asma|Rehman|    asma |asma####  |
|saif|Rehman|    saif |saif####  |
|zara|Rehman|    zara |zara####  |
+----+------+---------+----------+



In [13]:
# Right-pad with spaces; this overwrites the '#'-padded 'a_rightpad' column.
space_right_padded = rpad(col('a'), 8, ' ')
df_string = df_string.withColumn('a_rightpad', space_right_padded)
df_string.show(truncate = False)

+----+------+---------+----------+
|a   |b     |a_leftpad|a_rightpad|
+----+------+---------+----------+
|asma|Rehman|    asma |asma      |
|saif|Rehman|    saif |saif      |
|zara|Rehman|    zara |zara      |
+----+------+---------+----------+



In [14]:
# Remove leading and trailing spaces.
# trim() is applied to the space-padded 'a_leftpad' column so that it has
# whitespace to strip; trimming 'a' itself would be a no-op because that
# column holds no surrounding spaces.
df_string = df_string.withColumn('a_trim', trim(col('a_leftpad')))
df_string.show(truncate = False)

+----+------+---------+----------+------+
|a   |b     |a_leftpad|a_rightpad|a_trim|
+----+------+---------+----------+------+
|asma|Rehman|    asma |asma      |asma  |
|saif|Rehman|    saif |saif      |saif  |
|zara|Rehman|    zara |zara      |zara  |
+----+------+---------+----------+------+



In [15]:
# Two full names in a single "names" column (each row is a one-element tuple).
name_rows = [("asma rehman",), ("zara rehman",)]
df_names = sqlContext.createDataFrame(name_rows, ["names"])

In [16]:
# Display the names DataFrame.
df_names.show()

+-----------+
|      names|
+-----------+
|asma rehman|
|zara rehman|
+-----------+



In [17]:
# Split "names" on a single space: piece 0 is the first name, piece 1 the last name.
name_parts = split(col('names'), " ")
df_names = (
    df_names
    .withColumn('first_name', name_parts.getItem(0))
    .withColumn('last_name', name_parts.getItem(1))
)
df_names.show()

+-----------+----------+---------+
|      names|first_name|last_name|
+-----------+----------+---------+
|asma rehman|      asma|   rehman|
|zara rehman|      zara|   rehman|
+-----------+----------+---------+



In [18]:
# Repeat each first name three times back-to-back.
tripled_first_name = repeat(col('first_name'), 3)
df_names.withColumn('Repeat', tripled_first_name).show()

+-----------+----------+---------+------------+
|      names|first_name|last_name|      Repeat|
+-----------+----------+---------+------------+
|asma rehman|      asma|   rehman|asmaasmaasma|
|zara rehman|      zara|   rehman|zarazarazara|
+-----------+----------+---------+------------+



In [19]:
# Substring by position: (1, 4) takes the first four characters, while a
# negative start such as (-4, 4) counts from the end of the string.
first_four_chars = col("names").substr(1, 4)
last_four_chars = col("names").substr(-4, 4)
df_names.withColumn("substring_name", first_four_chars).show()
df_names.withColumn("substring_name_last", last_four_chars).show()

+-----------+----------+---------+--------------+
|      names|first_name|last_name|substring_name|
+-----------+----------+---------+--------------+
|asma rehman|      asma|   rehman|          asma|
|zara rehman|      zara|   rehman|          zara|
+-----------+----------+---------+--------------+

+-----------+----------+---------+-------------------+
|      names|first_name|last_name|substring_name_last|
+-----------+----------+---------+-------------------+
|asma rehman|      asma|   rehman|               hman|
|zara rehman|      zara|   rehman|               hman|
+-----------+----------+---------+-------------------+



In [20]:
# Character count of each full name (the separating space is counted too).
names_length = length(col("names"))
df_names.withColumn('length_names', names_length).show()

+-----------+----------+---------+------------+
|      names|first_name|last_name|length_names|
+-----------+----------+---------+------------+
|asma rehman|      asma|   rehman|          11|
|zara rehman|      zara|   rehman|          11|
+-----------+----------+---------+------------+



In [21]:
# Names with a birthday stored as "dd-MM-yyyy" text.
birthday_rows = [
    ("asma rehman", "17-01-1996"),
    ("zara rehman", "28-10-1990"),
]
df_date = sqlContext.createDataFrame(birthday_rows, ["names", "birthday"])
df_date.show()
# Last expression of the cell: shows the current type of "birthday".
df_date.select('birthday').dtypes

+-----------+----------+
|      names|  birthday|
+-----------+----------+
|asma rehman|17-01-1996|
|zara rehman|28-10-1990|
+-----------+----------+



[('birthday', 'string')]

In [22]:
# Parse the "dd-MM-yyyy" text into a date, replacing the "birthday" column.
parsed_birthday = to_date(col('birthday'), "dd-MM-yyyy")
df_date = df_date.withColumn('birthday', parsed_birthday)
df_date.select('birthday').dtypes

[('birthday', 'date')]

In [23]:
# Cast the date column back to a string type, again replacing "birthday".
birthday_as_text = col('birthday').cast("string")
df_date = df_date.withColumn('birthday', birthday_as_text)
df_date.select('birthday').dtypes

[('birthday', 'string')]

In [26]:
# Convert a column to upper case, shown next to all existing columns.
upper_first_name = upper(col('first_name'))
df_names.select("*", upper_first_name).show(truncate = False)

+-----------+----------+---------+-----------------+
|names      |first_name|last_name|upper(first_name)|
+-----------+----------+---------+-----------------+
|asma rehman|asma      |rehman   |ASMA             |
|zara rehman|zara      |rehman   |ZARA             |
+-----------+----------+---------+-----------------+



In [27]:
# Title (proper) case: capitalise the first letter of every word in "names".
title_cased_names = initcap(col('names'))
df_names.select("*", title_cased_names).show()

+-----------+----------+---------+--------------+
|      names|first_name|last_name|initcap(names)|
+-----------+----------+---------+--------------+
|asma rehman|      asma|   rehman|   Asma Rehman|
|zara rehman|      zara|   rehman|   Zara Rehman|
+-----------+----------+---------+--------------+

