In [0]:
# SINGLE COLUMN DATAFRAME FROM LIST
from pyspark.sql.types import IntegerType

# A flat list of scalars has to be accompanied by a schema, given either
# as a type string or as a Spark type object.
ages = [1, 2, 3]
spark.createDataFrame(ages, schema='int')
spark.createDataFrame(ages, schema=IntegerType())

# A list of one-element tuples can be passed without a schema,
# or together with a "name type" string.
ages = [(13,), (23,), (45,), (90,)]
spark.createDataFrame(ages)
spark.createDataFrame(ages, schema='col_name int')

In [0]:
# MULTI COLUMN DATAFRAME FROM LIST

list_of_rows = [
    (1, "Kacper"),
    (2, "Michael"),
    (3, "Sophia"),
]
column_names = ['id', 'name']

# No schema argument: column names and types are left to Spark.
spark.createDataFrame(list_of_rows)

# Schema as a "name type, name type" string.
spark.createDataFrame(list_of_rows, schema='id int, name string')

# Schema as a list of column names only.
spark.createDataFrame(list_of_rows, schema=column_names)

In [0]:
# SPARK ROWS - OVERVIEW
from pyspark.sql import Row

list_of_lists = [
    [1, "Kacper"],
    [2, "Michael"],
    [3, "Sophia"],
]
df = spark.createDataFrame(list_of_lists, schema='id int, name string')

df.collect()  # Hands the DataFrame back as a Python list

# A Row can be built positionally or with named fields.
Row(1, "Kacper")
row = Row(id=1, name="Kacper")
row.name     # attribute-style access to a field
row["name"]  # key-style access to the same field

# Every inner list can be wrapped in a Row, although Spark does not really
# need that in order to create the DataFrame correctly.
list_of_rows = [Row(*record) for record in list_of_lists]
spark.createDataFrame(list_of_rows, schema='id int, name string').show()

In [0]:
# BASIC DATA TYPES
# Reference table kept as a bare string literal: the left column is the Python
# value type, the right column is the matching PySpark DataFrame column type.
# NOTE(review): "string" and "boolean" on the left presumably mean Python's
# `str` and `bool`; "date"/"datetime" presumably mean `datetime.date` and
# `datetime.datetime` — confirm.

"""
Python      ->PySpark DataFrame

int         ->bigint (long)
string      ->string
boolean     ->boolean
float       ->double
date        ->date
datetime    ->timestamp
"""

In [0]:
# SPECIFYING SCHEMA
from pyspark.sql.types import *

# Three ways of describing a two-column schema.

# 1) LIST - column names only
df_schema_list = ["id", "first_name"]

# 2) STRING - names together with their types
df_schema_str = "id INT, first_name STRING"

# 3) SPARK TYPES - explicit StructType built from StructField objects
df_schema_spark_types = StructType(
    [
        StructField('id', IntegerType()),
        StructField('first_name', StringType()),
    ]
)

In [0]:
# SPARK DATAFRAME FROM PANDAS DATAFRAME
# NOTE(review): this cell relies on `users` (records usable as keyword
# arguments, presumably a list of dicts) and on `pd` (pandas) being defined
# outside this cell — confirm.

# 1. You can create a Spark DataFrame from a Pandas DataFrame:
spark_df = spark.createDataFrame(pd.DataFrame(users))

# 2. The biggest gain from using Pandas DataFrames is that it handles missing values.
#    Building the DataFrame from Row objects is noted to raise on this data, so
#    the failure is caught and displayed here; otherwise the cell would abort
#    and the working variant below would never run.
try:
    spark_df = spark.createDataFrame([Row(**user) for user in users])
except Exception as exc:  # exact exception type depends on the data / Spark version
    print(f"Row-based creation failed: {type(exc).__name__}: {exc}")

spark_df = spark.createDataFrame(pd.DataFrame(users))  # Works fine

In [0]:
# 3 container type columns:
# Arrays <-- list (index notation)
# Maps <-- dict (keyword notation)
# Structs <-- PySpark's Row (dot and keyword notation)

# col(container_type_column) - lets us access the elements of the container column
# col(XXX).alias('new_name') - SELECT XXX AS new_name
# explode(container_type_column) - turns each element of an array column (or each key/value pair of a map column) into its own row