In [1]:
from pyspark.sql import SparkSession

# Reuse the already-running SparkSession if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [5]:
from pyspark.sql import Row
from datetime import date, datetime

# Build the DataFrame from Row objects. No schema is passed here, so the
# column types come from the Python values themselves.
df = spark.createDataFrame([
    Row(
        a=i,
        b=i + 1,
        c=f'string{i}',
        d=date(2023, 3, i + 2),
        e=datetime(2024, 3, 3, 1, 1, 1),
    )
    for i in range(1, 4)
])

df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|  2|string1|2023-03-03|2024-03-03 01:01:01|
|  2|  3|string2|2023-03-04|2024-03-03 01:01:01|
|  3|  4|string3|2023-03-05|2024-03-03 01:01:01|
+---+---+-------+----------+-------------------+



In [7]:
# Same three rows as plain tuples, this time with an explicit DDL schema string.
# The rows are produced by zipping one tuple per column.
df = spark.createDataFrame(
    list(zip(
        (1, 2, 3),
        (2, 3, 4),
        ('string1', 'string2', 'string3'),
        (date(2023, 3, 3), date(2023, 3, 4), date(2023, 3, 5)),
        (datetime(2024, 3, 3, 1, 1, 1),) * 3,
    )),
    schema='a long, b long, c string, d date, e timestamp',
)

df

DataFrame[a: bigint, b: bigint, c: string, d: date, e: timestamp]

In [8]:
# Render the DataFrame's rows as a text table.
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|  2|string1|2023-03-03|2024-03-03 01:01:01|
|  2|  3|string2|2023-03-04|2024-03-03 01:01:01|
|  3|  4|string3|2023-03-05|2024-03-03 01:01:01|
+---+---+-------+----------+-------------------+



In [9]:
import pandas as pd

# Assemble the sample data column by column in a local pandas DataFrame.
pandas_df = pd.DataFrame({
    "a": list(range(1, 4)),
    "b": list(range(2, 5)),
    "c": [f"string{i}" for i in range(1, 4)],
    "d": [date(2023, 3, day) for day in range(3, 6)],
    "e": [datetime(2024, 3, 3, 1, 1, 1)] * 3,
})
# Convert the pandas DataFrame into a Spark DataFrame (no schema given),
# then echo it so the notebook displays the resulting column types.
df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: bigint, c: string, d: date, e: timestamp]

In [10]:
# Render the DataFrame's rows as a text table.
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|  2|string1|2023-03-03|2024-03-03 01:01:01|
|  2|  3|string2|2023-03-04|2024-03-03 01:01:01|
|  3|  4|string3|2023-03-05|2024-03-03 01:01:01|
+---+---+-------+----------+-------------------+



In [11]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: long (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [12]:
# Limit the printed table to the first row only.
df.show(n=1)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|  2|string1|2023-03-03|2024-03-03 01:01:01|
+---+---+-------+----------+-------------------+
only showing top 1 row



In [13]:
# Print the first row in vertical layout: one "column | value" line per field.
df.show(n=1, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 2                   
 c   | string1             
 d   | 2023-03-03          
 e   | 2024-03-03 01:01:01 
only showing top 1 row



In [14]:
# List the column names of the DataFrame.
df.columns

['a', 'b', 'c', 'd', 'e']

In [15]:
# Summary statistics (count, mean, stddev, min, max) for columns a, b and c.
(
    df
    .select("a", "b", "c")
    .describe()
    .show()
)

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   NULL|
| stddev|1.0|1.0|   NULL|
|    min|  1|  2|string1|
|    max|  3|  4|string3|
+-------+---+---+-------+



In [16]:
# Convert the Spark DataFrame to a pandas DataFrame and display it.
# NOTE(review): this rebinds `pandas_df`, replacing the frame built earlier in the notebook.
pandas_df = df.toPandas()
pandas_df

Unnamed: 0,a,b,c,d,e
0,1,2,string1,2023-03-03,2024-03-03 01:01:01
1,2,3,string2,2023-03-04,2024-03-03 01:01:01
2,3,4,string3,2023-03-05,2024-03-03 01:01:01


In [17]:
# Attribute-style access yields a Column expression, not the column's data.
df.a

Column<'a'>

In [18]:
# Bracket-style access yields a Column expression for column 'a'.
df['a']

Column<'a'>

In [22]:
# Project the DataFrame down to column `a` (passed as a Column object) and print it.
df.select(df.a).show()

+---+
|  a|
+---+
|  1|
|  2|
|  3|
+---+



In [23]:
from pyspark.sql.functions import upper

# Append `upper_c`, the upper-cased copy of column `c`, and print the result.
(
    df
    .withColumn('upper_c', upper(df.c))
    .show()
)

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|  2|string1|2023-03-03|2024-03-03 01:01:01|STRING1|
|  2|  3|string2|2023-03-04|2024-03-03 01:01:01|STRING2|
|  3|  4|string3|2023-03-05|2024-03-03 01:01:01|STRING3|
+---+---+-------+----------+-------------------+-------+



In [25]:
# Keep only the rows whose `a` value is at most 2, then print them.
df.filter(df.a <= 2).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|  2|string1|2023-03-03|2024-03-03 01:01:01|
|  2|  3|string2|2023-03-04|2024-03-03 01:01:01|
+---+---+-------+----------+-------------------+



In [27]:
from pyspark.sql.functions import pandas_udf

@pandas_udf('long')
def plus_one(series: pd.Series) -> pd.Series:
    """Return ``series`` with 1 added to every element.

    Declared as a pandas UDF whose return type is ``'long'``.
    """
    return series.add(1)

# Apply the pandas UDF to column `a`, store the result as `a_plus`, and print.
(
    df
    .withColumn('a_plus', plus_one(df['a']))
    .show()
)

+---+---+-------+----------+-------------------+------+
|  a|  b|      c|         d|                  e|a_plus|
+---+---+-------+----------+-------------------+------+
|  1|  2|string1|2023-03-03|2024-03-03 01:01:01|     2|
|  2|  3|string2|2023-03-04|2024-03-03 01:01:01|     3|
|  3|  4|string3|2023-03-05|2024-03-03 01:01:01|     4|
+---+---+-------+----------+-------------------+------+

