## 12-python-pandas.py

In [0]:
# 12-python-pandas.py
import pyspark
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (getOrCreate), registered under
# the application name 'PySparkExamples'.
spark = (
    SparkSession.builder
    .appName('PySparkExamples')
    .getOrCreate()
)

In [0]:
# Column labels, listed in the same order as the fields of each record below.
columns = ["first_name", "middle_name", "last_name", "id", "gender", "salary"]

# Sample employee records: ids are kept as strings, salaries as plain integers.
data = [
    ("Bidhun", "Baran", "Biswas", "36636", "M", 60000),
    ("Ramesh", "Kumar", "Sing", "40288", "M", 70000),
    ("Ritesh", "Kumar", "Narayanan", "42114", "M", 400000),
    ("Mala", "Chandra", "Das", "39192", "F", 500000),
    ("Jayita", "Kumari", "Bera", "39100", "F", 700000),
]
# Build the Spark DataFrame from the in-memory records; only column names are
# supplied as the schema, so the column types are left to Spark.
pysparkDF = spark.createDataFrame(data, schema=columns)

# One-line summary: column names, number of columns, and number of rows.
print(
    "DataFrame columns are:", pysparkDF.columns,
    "with column count:", len(pysparkDF.columns),
    "and with row count:", pysparkDF.count(),
)

# Show the resulting schema, then the rows without truncating cell values.
pysparkDF.printSchema()
pysparkDF.show(truncate=False)

DataFrame columns are: ['first_name', 'middle_name', 'last_name', 'id', 'gender', 'salary'] with column count: 6 and with row count: 5
root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|id   |gender|salary|
+----------+-----------+---------+-----+------+------+
|Bidhun    |Baran      |Biswas   |36636|M     |60000 |
|Ramesh    |Kumar      |Sing     |40288|M     |70000 |
|Ritesh    |Kumar      |Narayanan|42114|M     |400000|
|Mala      |Chandra    |Das      |39192|F     |500000|
|Jayita    |Kumari     |Bera     |39100|F     |700000|
+----------+-----------+---------+-----+------+------+



In [0]:
# Checking if a column exists in a DataFrame, ignoring letter case:
# upper-case every column name once, then test each candidate against that set.
upperColumnNames = {name.upper() for name in pysparkDF.columns}
print("Searching status-1:", "firstname".upper() in upperColumnNames)
print("Searching status-2:", "first_name".upper() in upperColumnNames)

Searching status-1: False
Searching status-2: True


In [0]:
# Convert the whole Spark DataFrame into a pandas DataFrame.
pandasDF = pysparkDF.toPandas()
# Confirm the type of the converted object.
print(type(pandasDF))
# Plain-text rendering of the frame.
print(pandasDF)
# Bare trailing expression so the notebook also shows the frame as cell output.
pandasDF

<class 'pandas.core.frame.DataFrame'>
  first_name middle_name  last_name     id gender  salary
0     Bidhun       Baran     Biswas  36636      M   60000
1     Ramesh       Kumar       Sing  40288      M   70000
2     Ritesh       Kumar  Narayanan  42114      M  400000
3       Mala     Chandra        Das  39192      F  500000
4     Jayita      Kumari       Bera  39100      F  700000


Unnamed: 0,first_name,middle_name,last_name,id,gender,salary
0,Bidhun,Baran,Biswas,36636,M,60000
1,Ramesh,Kumar,Sing,40288,M,70000
2,Ritesh,Kumar,Narayanan,42114,M,400000
3,Mala,Chandra,Das,39192,F,500000
4,Jayita,Kumari,Bera,39100,F,700000


In [0]:
# Restrict the Spark DataFrame to at most 2 rows before converting to pandas.
# Note: this rebinds pandasDF, replacing the full conversion assigned earlier.
pandasDF = pysparkDF.limit(2).toPandas()
# Plain-text rendering of the reduced frame.
print(pandasDF)
# Bare trailing expression so the notebook also shows the frame as cell output.
pandasDF


  first_name middle_name last_name     id gender  salary
0     Bidhun       Baran    Biswas  36636      M   60000
1     Ramesh       Kumar      Sing  40288      M   70000


Unnamed: 0,first_name,middle_name,last_name,id,gender,salary
0,Bidhun,Baran,Biswas,36636,M,60000
1,Ramesh,Kumar,Sing,40288,M,70000


In [0]:
# Nested structure elements
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Each record nests the (first, middle, last) name as an inner tuple, followed
# by id, gender and salary -- all three given as strings in this data set.
dataStruct = [
    (("Amit", "Kumar", "Sen"), "36636", "M", "3000"),
    (("Mithun", "Dutta", "Jana"), "40288", "M", "4000"),
    (("Rehan", "Guha", "Noyogi"), "42114", "M", "4000"),
    (("Malati", "Sen", "Gupta"), "39192", "F", "4000"),
    (("Jaya", "Chandra", "Singh"), "39130", "F", "5000"),
]

# Inner struct describing the three string parts of a person's name.
nameStruct = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])

# Full schema: the nested 'name' struct plus id, gender and salary, all of
# which are declared as nullable strings here.
schemaStruct = StructType([
    StructField('name', nameStruct),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', StringType(), True),
])

In [0]:
# Materialise the nested records as a Spark DataFrame using the explicit schema.
df = spark.createDataFrame(dataStruct, schema=schemaStruct)

# Print the schema tree, then the rows using show()'s default settings.
df.printSchema()
df.show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|  {Amit, Kumar, Sen}|36636|     M|  3000|
|{Mithun, Dutta, J...|40288|     M|  4000|
|{Rehan, Guha, Noy...|42114|     M|  4000|
|{Malati, Sen, Gupta}|39192|     F|  4000|
|{Jaya, Chandra, S...|39130|     F|  5000|
+--------------------+-----+------+------+



In [0]:
# Default - displays up to 20 rows and truncates column values to 20 characters
df.show()

# Display full column contents (no truncation)
df.show(truncate = False)

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|  {Amit, Kumar, Sen}|36636|     M|  3000|
|{Mithun, Dutta, J...|40288|     M|  4000|
|{Rehan, Guha, Noy...|42114|     M|  4000|
|{Malati, Sen, Gupta}|39192|     F|  4000|
|{Jaya, Chandra, S...|39130|     F|  5000|
+--------------------+-----+------+------+

+----------------------+-----+------+------+
|name                  |id   |gender|salary|
+----------------------+-----+------+------+
|{Amit, Kumar, Sen}    |36636|M     |3000  |
|{Mithun, Dutta, Jana} |40288|M     |4000  |
|{Rehan, Guha, Noyogi} |42114|M     |4000  |
|{Malati, Sen, Gupta}  |39192|F     |4000  |
|{Jaya, Chandra, Singh}|39130|F     |5000  |
+----------------------+-----+------+------+



In [0]:
# Display 2 rows with full (untruncated) column contents
df.show(n=2, truncate=False)

# Display 2 rows, truncating column values longer than 25 characters
df.show(n=2, truncate=25)

+---------------------+-----+------+------+
|name                 |id   |gender|salary|
+---------------------+-----+------+------+
|{Amit, Kumar, Sen}   |36636|M     |3000  |
|{Mithun, Dutta, Jana}|40288|M     |4000  |
+---------------------+-----+------+------+
only showing top 2 rows

+---------------------+-----+------+------+
|                 name|   id|gender|salary|
+---------------------+-----+------+------+
|   {Amit, Kumar, Sen}|36636|     M|  3000|
|{Mithun, Dutta, Jana}|40288|     M|  4000|
+---------------------+-----+------+------+
only showing top 2 rows



In [0]:
# Display DataFrame rows & columns vertically: one "RECORD" block per row,
# limited to 3 rows, with column values truncated at 25 characters
df.show(n = 3, truncate = 25, vertical = True)

-RECORD 0-----------------------
 name   | {Amit, Kumar, Sen}    
 id     | 36636                 
 gender | M                     
 salary | 3000                  
-RECORD 1-----------------------
 name   | {Mithun, Dutta, Jana} 
 id     | 40288                 
 gender | M                     
 salary | 4000                  
-RECORD 2-----------------------
 name   | {Rehan, Guha, Noyogi} 
 id     | 42114                 
 gender | M                     
 salary | 4000                  
only showing top 3 rows



In [0]:
# Convert the DataFrame with the nested 'name' struct into a pandas DataFrame;
# the struct column becomes a single pandas column holding one nested value per row.
pandasDF2 = df.toPandas()
# Plain-text rendering of the converted frame.
print(pandasDF2)
# Confirm the type of the converted object.
print(type(pandasDF2))
# Bare trailing expression so the notebook also shows the frame as cell output.
pandasDF2

                                                name     id gender salary
0  {'firstname': 'Amit', 'middlename': 'Kumar', '...  36636      M   3000
1  {'firstname': 'Mithun', 'middlename': 'Dutta',...  40288      M   4000
2  {'firstname': 'Rehan', 'middlename': 'Guha', '...  42114      M   4000
3  {'firstname': 'Malati', 'middlename': 'Sen', '...  39192      F   4000
4  {'firstname': 'Jaya', 'middlename': 'Chandra',...  39130      F   5000
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,name,id,gender,salary
0,"{'firstname': 'Amit', 'middlename': 'Kumar', '...",36636,M,3000
1,"{'firstname': 'Mithun', 'middlename': 'Dutta',...",40288,M,4000
2,"{'firstname': 'Rehan', 'middlename': 'Guha', '...",42114,M,4000
3,"{'firstname': 'Malati', 'middlename': 'Sen', '...",39192,F,4000
4,"{'firstname': 'Jaya', 'middlename': 'Chandra',...",39130,F,5000


In [0]:
# Nested-name records where the salary is a real integer rather than a string.
structureData = [
    (("Amit", "Kumar", "Sen"), "36636", "M", 3000),
    (("Mithun", "Dutta", "Jana"), "40288", "M", 4000),
    (("Rehan", "Guha", "Noyogi"), "42114", "M", 4000),
    (("Malati", "Sen", "Gupta"), "39192", "F", 4000),
    (("Jaya", "Chandra", "Singh"), "39130", "F", 5000),
]
                 
# The three parts of the nested 'name' struct are all nullable strings, so
# build their fields from the list of part names.
nameFields = [
    StructField(part, StringType(), True)
    for part in ('firstname', 'middlename', 'lastname')
]

# Same layout as the record tuples: nested name, then id and gender as
# strings, and salary declared as an integer column.
structureSchema = StructType([
    StructField('name', StructType(nameFields)),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

In [0]:
# Build the DataFrame whose salary column is typed as an integer.
df2 = spark.createDataFrame(structureData, schema=structureSchema)

# One-line summary: column names, number of columns, and number of rows.
print(
    "DataFrame columns are:", df2.columns,
    "with column count:", len(df2.columns),
    "and with row count:", df2.count(),
)

# Show the schema tree, then the rows without truncating cell values.
df2.printSchema()
df2.show(truncate=False)

DataFrame columns are: ['name', 'id', 'gender', 'salary'] with column count: 4 and with row count: 5
root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+----------------------+-----+------+------+
|name                  |id   |gender|salary|
+----------------------+-----+------+------+
|{Amit, Kumar, Sen}    |36636|M     |3000  |
|{Mithun, Dutta, Jana} |40288|M     |4000  |
|{Rehan, Guha, Noyogi} |42114|M     |4000  |
|{Malati, Sen, Gupta}  |39192|F     |4000  |
|{Jaya, Chandra, Singh}|39130|F     |5000  |
+----------------------+-----+------+------+



In [0]:
from pyspark.sql.functions import col, struct, when
# Salary viewed as an integer; used by both threshold comparisons below.
salaryAsInt = col("salary").cast(IntegerType())

# Grade by salary band: below 2000 -> "Low", below 4000 -> "Medium", else "High".
# NOTE(review): a null salary would presumably match neither comparison and
# fall through to "High" -- confirm this is the intended grade for missing data.
salaryGrade = (
    when(salaryAsInt < 2000, "Low")
    .when(salaryAsInt < 4000, "Medium")
    .otherwise("High")
    .alias("Salary_Grade")
)

# Pack id (renamed to 'identifier'), gender, salary and the derived grade into
# a single struct column.
otherInfo = struct(
    col("id").alias("identifier"),
    col("gender").alias("gender"),
    col("salary").alias("salary"),
    salaryGrade,
)

# Attach the struct as 'OtherInfo' and drop the flat columns it now contains.
updatedDF = df2.withColumn("OtherInfo", otherInfo).drop("id", "gender", "salary")

updatedDF.printSchema()
updatedDF.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+----------------------+------------------------+
|name                  |OtherInfo               |
+----------------------+------------------------+
|{Amit, Kumar, Sen}    |{36636, M, 3000, Medium}|
|{Mithun, Dutta, Jana} |{40288, M, 4000, High}  |
|{Rehan, Guha, Noyogi} |{42114, M, 4000, High}  |
|{Malati, Sen, Gupta}  |{39192, F, 4000, High}  |
|{Jaya, Chandra, Singh}|{39130, F, 5000, High}  |
+----------------------+------------------------+

