<a href="https://colab.research.google.com/github/Amt15/Pyspark/blob/main/DataFrame_Conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyspark -q
!pip install findspark -q

[K     |████████████████████████████████| 281.3 MB 47 kB/s 
[K     |████████████████████████████████| 199 kB 54.0 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


## Converting spark dataframe into pandas and vice versa

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.Builder()\
  .appName("conversion of dataframe")\
  .master("local[*]")\
  .enableHiveSupport()\
  .getOrCreate()

In [4]:
data = [("1","ankit","Delhi-Delhi East"),\
        ("2","aryan","Bihar-Patna"),\
        ("3","mohit","Hyderabad-AmirPeth"),\
        ("4","alagumuthu","Maharashtra-Pune")]

sparkdf = spark.createDataFrame(data=data,schema=['id','name','location'])
sparkdf.show()

+---+----------+------------------+
| id|      name|          location|
+---+----------+------------------+
|  1|     ankit|  Delhi-Delhi East|
|  2|     aryan|       Bihar-Patna|
|  3|     mohit|Hyderabad-AmirPeth|
|  4|alagumuthu|  Maharashtra-Pune|
+---+----------+------------------+



In [5]:
sparkdf.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)



In [6]:
import pandas

pandadf = sparkdf.toPandas()
print(type(pandadf))

<class 'pandas.core.frame.DataFrame'>


In [7]:
pandadf.dtypes

id          object
name        object
location    object
dtype: object

In [8]:
pandadf['state'] = pandadf['location'].apply(lambda x: x.split("-")[0])
pandadf['city'] = pandadf['location'].apply(lambda x: x.split("-")[1])
pandadf

Unnamed: 0,id,name,location,state,city
0,1,ankit,Delhi-Delhi East,Delhi,Delhi East
1,2,aryan,Bihar-Patna,Bihar,Patna
2,3,mohit,Hyderabad-AmirPeth,Hyderabad,AmirPeth
3,4,alagumuthu,Maharashtra-Pune,Maharashtra,Pune


## converting back into spark dataframe

In [9]:
sparkDF=spark.createDataFrame(pandadf)
sparkDF.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)



In [10]:
from pyspark.sql.functions import col,split
sparkDF.withColumn("state",split(col("location"),"-")[0])\
       .withColumn("city",split(col("location"),"-")[1]).show()


+---+----------+------------------+-----------+----------+
| id|      name|          location|      state|      city|
+---+----------+------------------+-----------+----------+
|  1|     ankit|  Delhi-Delhi East|      Delhi|Delhi East|
|  2|     aryan|       Bihar-Patna|      Bihar|     Patna|
|  3|     mohit|Hyderabad-AmirPeth|  Hyderabad|  AmirPeth|
|  4|alagumuthu|  Maharashtra-Pune|Maharashtra|      Pune|
+---+----------+------------------+-----------+----------+



In [12]:
from pyspark.sql.functions import initcap

sparkDF.withColumn("name",initcap(col("name"))).show()

+---+----------+------------------+-----------+----------+
| id|      name|          location|      state|      city|
+---+----------+------------------+-----------+----------+
|  1|     Ankit|  Delhi-Delhi East|      Delhi|Delhi East|
|  2|     Aryan|       Bihar-Patna|      Bihar|     Patna|
|  3|     Mohit|Hyderabad-AmirPeth|  Hyderabad|  AmirPeth|
|  4|Alagumuthu|  Maharashtra-Pune|Maharashtra|      Pune|
+---+----------+------------------+-----------+----------+



In [15]:
from pyspark.sql.functions import concat,expr,substring,lower,upper
sparkDF.withColumn("nameNew",concat(upper(expr("substring(name,1,1)")),lower(expr("substring(name,2)")))).show()

+---+----------+------------------+-----------+----------+----------+
| id|      name|          location|      state|      city|   nameNew|
+---+----------+------------------+-----------+----------+----------+
|  1|     ankit|  Delhi-Delhi East|      Delhi|Delhi East|     Ankit|
|  2|     aryan|       Bihar-Patna|      Bihar|     Patna|     Aryan|
|  3|     mohit|Hyderabad-AmirPeth|  Hyderabad|  AmirPeth|     Mohit|
|  4|alagumuthu|  Maharashtra-Pune|Maharashtra|      Pune|Alagumuthu|
+---+----------+------------------+-----------+----------+----------+



In [18]:
from pyspark.sql.types import  StringType
from pyspark.sql.functions import udf

udf_capitalize = udf(lambda x: str(x).capitalize(),StringType())

sparkDF.withColumn("NewName",udf_capitalize('name')).show()


+---+----------+------------------+-----------+----------+----------+
| id|      name|          location|      state|      city|   NewName|
+---+----------+------------------+-----------+----------+----------+
|  1|     ankit|  Delhi-Delhi East|      Delhi|Delhi East|     Ankit|
|  2|     aryan|       Bihar-Patna|      Bihar|     Patna|     Aryan|
|  3|     mohit|Hyderabad-AmirPeth|  Hyderabad|  AmirPeth|     Mohit|
|  4|alagumuthu|  Maharashtra-Pune|Maharashtra|      Pune|Alagumuthu|
+---+----------+------------------+-----------+----------+----------+



In [20]:
udf_capitalize = udf(lambda x: x.capitalize())

sparkDF.withColumn("NewName",udf_capitalize('name')).show()


+---+----------+------------------+-----------+----------+----------+
| id|      name|          location|      state|      city|   NewName|
+---+----------+------------------+-----------+----------+----------+
|  1|     ankit|  Delhi-Delhi East|      Delhi|Delhi East|     Ankit|
|  2|     aryan|       Bihar-Patna|      Bihar|     Patna|     Aryan|
|  3|     mohit|Hyderabad-AmirPeth|  Hyderabad|  AmirPeth|     Mohit|
|  4|alagumuthu|  Maharashtra-Pune|Maharashtra|      Pune|Alagumuthu|
+---+----------+------------------+-----------+----------+----------+



In [30]:
s = "it is not easy to capitalize each word first alphabet of word is cap"
s.title()

'It Is Not Easy To Capitalize Each Word First Alphabet Of Word Is Cap'

In [34]:
import string
string.capwords("sachine")

'Sachine'

In [55]:
l = "hi hello how are you !!!"
lst = l.split()
[w[::-1] for w in lst]

['ih', 'olleh', 'woh', 'era', 'uoy', '!!!']

In [56]:
l = [("X", )]
df = spark.createDataFrame(l).toDF("dummy")
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [57]:
from pyspark.sql.functions import current_date, current_timestamp
df.withColumn("current_date",current_date()).show()

+-----+------------+
|dummy|current_date|
+-----+------------+
|    X|  2022-10-03|
+-----+------------+



In [58]:
df.select(current_date().alias("current_date")).show()

+------------+
|current_date|
+------------+
|  2022-10-03|
+------------+



In [60]:
df.select(current_timestamp().alias("current_date_time")).show(truncate=False)

+--------------------------+
|current_date_time         |
+--------------------------+
|2022-10-03 10:46:17.156636|
+--------------------------+



### We can convert a string which contain date or timestamp in non-standard format to standard date or time using to_date or to_timestamp function respectively.

In [61]:
from pyspark.sql.functions import lit, to_date, to_timestamp
df.select(to_date(lit('20210228'), 'yyyyMMdd').alias('to_date')).show()

+----------+
|   to_date|
+----------+
|2021-02-28|
+----------+



In [62]:
df.select(to_timestamp(lit('20210228 1725'), 'yyyyMMdd HHmm').alias('to_timestamp')).show()

+-------------------+
|       to_timestamp|
+-------------------+
|2021-02-28 17:25:00|
+-------------------+



In [63]:
datetimes = [("2014-02-28", "2014-02-28 10:00:00.123"),
                     ("2016-02-29", "2016-02-29 08:08:08.999"),
                     ("2017-10-31", "2017-12-31 11:59:59.123"),
                     ("2019-11-30", "2019-08-31 00:00:00.000")
                ]

In [66]:
datetimesDF = spark.createDataFrame(datetimes, schema="date STRING, time STRING")
datetimesDF.show(truncate=False)

+----------+-----------------------+
|date      |time                   |
+----------+-----------------------+
|2014-02-28|2014-02-28 10:00:00.123|
|2016-02-29|2016-02-29 08:08:08.999|
|2017-10-31|2017-12-31 11:59:59.123|
|2019-11-30|2019-08-31 00:00:00.000|
+----------+-----------------------+



In [68]:
from pyspark.sql.functions import date_add, date_sub

datetimesDF. \
    withColumn("date_add_date", date_add("date", 10)). \
    withColumn("date_add_time", date_add("time", 10)). \
    withColumn("date_sub_date", date_sub("date", 10)). \
    withColumn("date_sub_time", date_sub("time", 10)). \
    show(truncate=False)

+----------+-----------------------+-------------+-------------+-------------+-------------+
|date      |time                   |date_add_date|date_add_time|date_sub_date|date_sub_time|
+----------+-----------------------+-------------+-------------+-------------+-------------+
|2014-02-28|2014-02-28 10:00:00.123|2014-03-10   |2014-03-10   |2014-02-18   |2014-02-18   |
|2016-02-29|2016-02-29 08:08:08.999|2016-03-10   |2016-03-10   |2016-02-19   |2016-02-19   |
|2017-10-31|2017-12-31 11:59:59.123|2017-11-10   |2018-01-10   |2017-10-21   |2017-12-21   |
|2019-11-30|2019-08-31 00:00:00.000|2019-12-10   |2019-09-10   |2019-11-20   |2019-08-21   |
+----------+-----------------------+-------------+-------------+-------------+-------------+



In [70]:
from pyspark.sql.functions import spark_partition_id,input_file_name

# To find out number of elements in each partition
 
datetimesDF.withColumn("pid",spark_partition_id()).groupBy("pid").count().show()

+---+-----+
|pid|count|
+---+-----+
|  0|    2|
|  1|    2|
+---+-----+



In [71]:
datetimesDF.printSchema()

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)



In [72]:
dd = df.select(current_date().alias("current_date"))

In [79]:
dd.printSchema()
print(type(dd))

root
 |-- current_date: date (nullable = false)

<class 'pyspark.sql.dataframe.DataFrame'>


In [78]:
from pyspark.sql.functions import to_timestamp,date_format

dd.withColumn("current_date",to_timestamp(col("current_date"))).withColumn("year",date_format(col("current_date"),"Y")).show()

TypeError: ignored