## 02-pyspark-rdd-to-dataframe.py

In [0]:
# 02-pyspark-rdd-to-dataframe.py
# Build a small RDD of (department name, department id) tuples and convert it
# to a DataFrame without supplying any column names.
from pyspark.sql import SparkSession

# getOrCreate() hands back a session for the 'PySparkExamples' application.
spark = (
    SparkSession.builder
    .appName('PySparkExamples')
    .getOrCreate()
)

# Sample data: one tuple per department.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
rdd = spark.sparkContext.parallelize(dept)

# No column names are passed here, so the resulting columns are named
# positionally (_1, _2).
df = rdd.toDF()

print(
    "DataFrame columns are:", df.columns,
    "with column count:", len(df.columns),
    "and with row count:", df.count(),
)
df.printSchema()

# Display the rows three ways: default arguments, truncate=True, truncate=False.
df.show()
df.show(truncate=True)
df.show(truncate=False)

DataFrame columns are: ['_1', '_2'] with column count: 2 and with row count: 4
root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+

+---------+---+
|_1       |_2 |
+---------+---+
|Finance  |10 |
|Marketing|20 |
|Sales    |30 |
|IT       |40 |
+---------+---+



In [0]:
# Convert the same RDD again, this time naming the columns explicitly
# instead of relying on the positional defaults.
deptColumns = ['Department_Name', 'Department_ID']
df2 = rdd.toDF(schema=deptColumns)

print(
    "DataFrame columns are:", df2.columns,
    "with column count:", len(df2.columns),
    "and with row count:", df2.count(),
)
df2.printSchema()
df2.show(truncate=False)

DataFrame columns are: ['Department_Name', 'Department_ID'] with column count: 2 and with row count: 4
root
 |-- Department_Name: string (nullable = true)
 |-- Department_ID: long (nullable = true)

+---------------+-------------+
|Department_Name|Department_ID|
+---------------+-------------+
|Finance        |10           |
|Marketing      |20           |
|Sales          |30           |
|IT             |40           |
+---------------+-------------+



In [0]:
# Skip the RDD entirely: build the DataFrame straight from the Python list,
# reusing the column names defined for the previous example.
deptDF = spark.createDataFrame(dept, deptColumns)

deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- Department_Name: string (nullable = true)
 |-- Department_ID: long (nullable = true)

+---------------+-------------+
|Department_Name|Department_ID|
+---------------+-------------+
|Finance        |10           |
|Marketing      |20           |
|Sales          |30           |
|IT             |40           |
+---------------+-------------+



In [0]:
from pyspark.sql.types import StructType, StructField, StringType

# Explicit schema: both columns are declared as nullable strings.
# Note that dept_id is declared StringType even though the tuples in `dept`
# carry integer ids, so the column is reported as string in the schema.
deptSchema = StructType([
    StructField('dept_name', StringType(), nullable=True),
    StructField('dept_id', StringType(), nullable=True),
])

# Build the DataFrame from the Python list using the declared schema rather
# than letting Spark infer the column types.
deptDF1 = spark.createDataFrame(dept, schema=deptSchema)

deptDF1.printSchema()
deptDF1.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: string (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

