In [23]:
# Importing necessary libraries
from pyspark.sql import SparkSession
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Create the Spark session (or reuse one that already exists).
# "local[1]" runs Spark locally with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [24]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

# DataFrame Creation

In [25]:
# PySpark DataFrame from a list of Row objects.
# No schema is passed, so column names come from the Row field names and the
# column types are inferred from the Python values.
# The third row uses a=3, b=4. so that this DataFrame holds the same data as
# the explicit-schema and pandas-based constructions in the following cells.
data = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=3, b=4., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
# Last expression of the cell: shows the DataFrame's column/type summary.
data

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [26]:
# Build the DataFrame from plain tuples; column names and types are given
# explicitly through a schema string rather than inferred.
data = spark.createDataFrame(
    data=[
        (1, 2.0, 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
        (2, 3.0, 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
        (3, 4.0, 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
    ],
    schema='a long, b double, c string, d date, e timestamp',
)
data

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [27]:
# Start from a pandas DataFrame (one list per column) and hand it to Spark.
pandas_data = pd.DataFrame(
    dict(
        a=[1, 2, 3],
        b=[2.0, 3.0, 4.0],
        c=['string1', 'string2', 'string3'],
        d=[date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
        e=[
            datetime(2000, 1, 1, 12, 0),
            datetime(2000, 1, 2, 12, 0),
            datetime(2000, 1, 3, 12, 0),
        ],
    )
)
data = spark.createDataFrame(pandas_data)
data

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [28]:
# All three ways of creating the DataFrame above produce the same schema.
# Print the schema of the most recently created DataFrame as a tree.
data.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)

