**PySpark DataFrame**

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from pyspark.sql import SparkSession

# Entry point used by every cell below: getOrCreate() returns the session for
# this application name, creating it if none is active yet.
spark = (
    SparkSession.builder
    .appName('PySpark Dataframe')
    .getOrCreate()
)

***Create DataFrame***

In [3]:
# Column names for the sample table.
col = ["lang", "count"]

# One (lang, count) tuple per row; note the counts are stored as strings.
data = [
    ("C++", "20000"),
    ("C", "100000"),
    ("Java", "3000"),
]


In [4]:
# Distribute the local list of tuples as an RDD via the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

**Create DataFrame from RDD**

In [5]:
# Option 1: call toDF() on the RDD itself, passing the column names as a list.
df1 = rdd.toDF(col)
df1.show()

+----+------+
|lang| count|
+----+------+
| C++| 20000|
|   C|100000|
|Java|  3000|
+----+------+



In [6]:
# Option 2: build the DataFrame through the SparkSession, then rename the
# columns by unpacking the names into DataFrame.toDF().
df2 = (
    spark.createDataFrame(rdd)
    .toDF(*col)
)
df2.show()

+----+------+
|lang| count|
+----+------+
| C++| 20000|
|   C|100000|
|Java|  3000|
+----+------+



**Create DataFrame from List Collection**

In [7]:
# Same construction as with the RDD, but fed directly from the local Python list.
df3 = (
    spark.createDataFrame(data)
    .toDF(*col)
)
df3.show()

+----+------+
|lang| count|
+----+------+
| C++| 20000|
|   C|100000|
|Java|  3000|
+----+------+



In [8]:
# Read a CSV file into a DataFrame. The path is a placeholder and no reader
# options are passed here.
df = spark.read.csv("/path/file.csv")
# Read a plain-text file into a DataFrame. NOTE: this rebinds `df`, so the CSV
# DataFrame created on the previous line is no longer reachable afterwards.
df = spark.read.text("/path/file.txt")

**Convert PySpark DataFrame to Pandas**

In [9]:
# Sample person records; field order matches the names listed in `col` below.
data = [
    ("James", "", "Smith", "36636", "M", 60000),
    ("Michael", "Rose", "", "40288", "M", 70000),
    ("Robert", "", "Williams", "42114", "", 400000),
    ("Maria", "Anne", "Jones", "39192", "F", 500000),
    ("Jen", "Mary", "Brown", "", "F", 0),
]

col = [
    "first_name",
    "middle_name",
    "last_name",
    "dob",
    "gender",
    "salary",
]

# Only column names are supplied as the schema for this DataFrame.
pyspark_df = spark.createDataFrame(data=data, schema=col)
pyspark_df.show()


+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|  dob|gender|salary|
+----------+-----------+---------+-----+------+------+
|     James|           |    Smith|36636|     M| 60000|
|   Michael|       Rose|         |40288|     M| 70000|
|    Robert|           | Williams|42114|      |400000|
|     Maria|       Anne|    Jones|39192|     F|500000|
|       Jen|       Mary|    Brown|     |     F|     0|
+----------+-----------+---------+-----+------+------+



In [10]:
# Convert the Spark DataFrame into a pandas DataFrame. Per the PySpark docs,
# toPandas() loads all rows into driver memory, so use it on small results only.
pandas_df = pyspark_df.toPandas()
# Print the plain-text representation of the pandas DataFrame.
print(pandas_df)

  first_name middle_name last_name    dob gender  salary
0      James                 Smith  36636      M   60000
1    Michael        Rose            40288      M   70000
2     Robert              Williams  42114         400000
3      Maria        Anne     Jones  39192      F  500000
4        Jen        Mary     Brown             F       0


**PySpark StructType & StructField**

In [11]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Rows are ordered to match the schema fields declared below.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Explicit schema: five string columns followed by an integer salary.
# The third StructField argument (True) marks the column as nullable.
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

df = spark.createDataFrame(data=data, schema=schema)
df.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [12]:
# Each row starts with a (first, middle, last) tuple that fills the nested
# `name` struct, followed by the id, gender and salary values.
nested_data = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# Schema with a struct-typed column: `name` is itself a StructType with three
# string fields, while the remaining columns are flat.
structureSchema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ])),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

df2 = spark.createDataFrame(data=nested_data, schema=structureSchema)
# truncate=False prints full cell contents instead of shortening long values.
df2.show(truncate=False)


+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+

