<p><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/1e/UNAL_Logosimbolo.svg/583px-UNAL_Logosimbolo.svg.png" alt="" width="1280" height="300" /></p>



# DATA TYPES AND FUNCTIONS

In PySpark, some operations such as `.isNull()` or `.substr()` are methods available directly on column objects (for example `col("name").substr(1, 3)`), while many others such as `upper()`, `lower()`, `length()`, `when()`, `concat()`, and similar are not methods of the column object, but functions of the `pyspark.sql.functions` module.

This is because:

Some complex operations (such as conditional expressions, regex replacements, concatenations, etc.) require logic that is not encapsulated in the column object.

The functions in pyspark.sql.functions are optimized and allow you to build complex SQL expressions in a clear and flexible way.

**There are approximately 300 functions under `pyspark.sql.functions`. At a higher level they can be grouped into a few categories.**


In [0]:
from pyspark.sql.functions import (
    col, upper, lower, length, ltrim, rtrim, trim, initcap, reverse,
    concat, concat_ws, regexp_replace, regexp_extract, instr, sha2, md5,
    when, format_number, format_string, round, bround, abs, log10, exp,
    sqrt, pow, cast, current_date, current_timestamp, date_format, year,
    month, dayofmonth, dayofweek, dayofyear, weekofyear, date_add,
    date_sub, datediff, monotonically_increasing_id, rand, randn, lit,
    row_number, substring, split, lpad, rpad, count, countDistinct, sum,
    avg, min, max, last_day, trunc, date_trunc, months_between, add_months, next_day,
    unix_timestamp, from_unixtime, to_utc_timestamp, coalesce, isnan, isnull,
    from_utc_timestamp, to_utc_timestamp, first, last, variance, stddev, approx_count_distinct,
    collect_list, collect_set
)

# import pyspark.sql.functions as fc

In [0]:
# Sample employee records used by the examples below. The second row repeats
# id 1 / "July" with a None salary, so the data contains both a repeated
# name and a null value.
elements = [
    {"id": 1, "name": "July", "age": 34, "salary": 550, "role": "admin"},
    {"id": 1, "name": "July", "age": 34, "salary": None, "role": "admin"},
    {"id": 2, "name": "Gabriel", "age": 29, "salary": 720, "role": "developer"},
    {"id": 3, "name": "Luis", "age": 42, "salary": 610, "role": "developer"},
    {"id": 4, "name": "John", "age": 51, "salary": 890, "role": "manager"},
    {"id": 5, "name": "Daniel", "age": 27, "salary": 480, "role": "developer"},
]

# Build the DataFrame from the list of dicts; no explicit schema is passed.
df = spark.createDataFrame(elements)
df.display()

## TRANSVERSAL

### FIRST

In [0]:
df.select(first("name")).display()

### LAST

In [0]:
df.select(last("name")).display()

### ISNAN

In [0]:
df.select(isnan(col("salary"))).display()

### ISNULL

In [0]:
df.select(isnull(col("salary"))).display()

### LIT

In [0]:
df.select(
    col("name"),
    col("salary"),
    lit("training").alias("company"),
    lit(1000).alias("bonus")
).display()

### COALESCE

In [0]:
df.select(coalesce(col("salary"), lit(-999))).display()

### WHEN

##### SIMPLE

In [0]:
df.select(when(col("salary") > 300, "High").otherwise("Low")).display()

##### MULTI

In [0]:
# Translate each role into a department label; roles that match none of the
# branches fall through to "Other".
df.select(
    col("role"),
    (
        when(col("role") == "admin", "Administration")
        .when(col("role") == "developer", "Engineering")
        .when(col("role") == "tester", "Quality Assurance")
        .otherwise("Other")
        .alias("role_type")
    ),
).display()

### MONOTONICALLY INCREASING ID
Generates a unique ID per row that increases monotonically, but is not guaranteed to be sequential or contiguous.

In [0]:
df.select(monotonically_increasing_id()).display()

### ROW NUMBER

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Number the rows following the window ordering and expose that number as a
# surrogate key column.
window_spec = Window.orderBy("name")  # or any other column
df_with_key = df.withColumn("surrogate_key", row_number().over(window_spec))
df_with_key.display()

## STRINGS

### UPPER

In [0]:
df.select(upper(col("name"))).display()

### LOWER

In [0]:
df.select(lower(col("name"))).display()

### LENGTH

In [0]:
df.select(length(col("name"))).display()

### SUBSTRING

In [0]:
df.select(substring(col("name"), 1, 3).alias("substring_name")).display()

### SPLIT

In [0]:
df.select(split(col("name"), "a").alias("split_name")).display()

### LPAD

In [0]:
df.select(lpad(col("name"), 10, "*").alias("lpad_name")).display()

### RPAD

In [0]:
df.select(rpad(col("name"), 10, "*").alias("rpad_name")).display()

### LTRIM

In [0]:
df.select(ltrim(col("name"))).display()

### RTRIM

In [0]:
df.select(rtrim(col("name"))).display()

### TRIM

In [0]:
df.select(trim(col("name"))).display()

### INITCAP

In [0]:
df.select(initcap(col("name"))).display()

### REVERSE

In [0]:
df.select(reverse(col("name"))).display()

### CONCAT

In [0]:
df.select(concat(col("name"), col("role"))).display()

In [0]:
df.select(concat(col("name"), lit("@"), col("role"))).display()

### CONCAT_WS

In [0]:
df.select(concat_ws(" @ ", col("name"), col("role"))).display()

### REGEXP_REPLACE

In [0]:
df.select(regexp_replace(col("name"), "a", "@")).display()

### REGEXP_EXTRACT

In [0]:
df.select(regexp_extract(col("name"), "(\\w+)", 1)).display()


### INSTR
Returns the 1-based position of the first occurrence of a substring within the column value (0 if the substring is not found).

In [0]:
df.select(instr(col("name"), "a")).display()

### SHA2

In [0]:
df.select(sha2(col("name"), 256)).display()

### MD5

In [0]:
df.select(md5(col("name"))).display()

### FORMAT STRING

In [0]:
df.select(format_string("Salary: %d", col("salary"))).display()

## NUMBERS

### COUNT

In [0]:
df.select(count(col("salary"))).display()

### COUNT DISTINCT

In [0]:
df.select(countDistinct(col("salary"))).display()

### SUM

In [0]:
df.select(sum(col("salary"))).display()

### AVG

In [0]:
df.select(avg(col("salary"))).display()

### MIN

In [0]:
df.select(min(col("salary"))).display()

### MAX

In [0]:
df.select(max(col("salary"))).display()

### FORMAT NUMBER

In [0]:
df.select(format_number(col("salary"), 2)).display()

### ROUND

In [0]:
df.select(round(col("salary"), 0)).display()

### BROUND
Rounds numeric values using HALF_EVEN (banker's) rounding; `round` uses HALF_UP instead.

In [0]:
df.select(bround(col("salary"), 2)).display()

### ABS

In [0]:
df.select(abs(col("salary"))).display()

### LOG10

calculates the base 10 logarithm of the numeric value in a column.

In [0]:
df.select(log10(col("salary"))).display()

### EXP
calculates the exponential value (base e) of a numeric column.

In [0]:
df.select(exp(col("salary"))).display()

### SQRT

In [0]:
df.select(sqrt(col("salary"))).display()

### POW

In [0]:
df.select(pow(col("salary"), 2)).display()

### RAND
Generates random numbers between 0 and 1 with a uniform distribution.

In [0]:
df.select(rand()).display()

### RANDN
generates random numbers with a normal (Gaussian) distribution, with a mean of 0 and a standard deviation of 1.

In [0]:
df.select(randn()).display()

### APPROX COUNT DISTINCT

In [0]:
df.select(approx_count_distinct(col("salary"))).display()

### VARIANCE

The **variance** measures **how much** the values are spread out from the mean.
Formula:

$$
\text{Variance} = \frac{1}{n - 1} \sum_{i=1}^{n} (x_i - \bar{x})^2
$$

This is called the **sample variance**, which is what PySpark calculates by default.

In [0]:
df.select(variance(col("salary"))).display()

### STDDEV
Indicates how much the values deviate from the mean on average.
It is the square root of the sample variance:
$$
s = \sqrt{ \frac{1}{n - 1} \sum_{i=1}^{n} (x_i - \bar{x})^2 }
$$

In [0]:
df.select(stddev(col("salary"))).display()

## DATES

### CURRENT DATE

In [0]:
df.select(current_date()).display()

### CURRENT TIMESTAMP

In [0]:
df.select(current_timestamp()).display()

### UNIX TIMESTAMP
A Unix timestamp is a number that represents time as the number of seconds (or milliseconds) that have passed since January 1, 1970 UTC (known as the "Unix epoch"). `unix_timestamp()` returns it in seconds.

In [0]:
df.select(current_timestamp(), unix_timestamp(current_timestamp())).display()

### FROM UNIX TIME

Converts a Unix timestamp (seconds since the epoch) back into a human-readable date-time string.

In [0]:
df.select(from_unixtime(lit(1746023247))).display()

### FROM UTC TIMESTAMP

In [0]:
df.select(
    current_timestamp(), 
    from_utc_timestamp(current_timestamp(), "America/Bogota")).display()

### TO UTC TIMESTAMP

In [0]:
df.select(
    current_timestamp(), 
    to_utc_timestamp(lit('2025-04-30T09:31:57.970+00:00'), "America/Bogota")).display()

### DATE FORMAT

In [0]:
df.select(date_format(current_date(), "dd/MM/yyyy")).display()

### YEAR

In [0]:
df.select(year(current_date())).display()

### MONTH

In [0]:
df.select(month(current_date())).display()

### DAY OF MONTH

In [0]:
df.select(dayofmonth(current_date())).display()

### DAY OF WEEK

In [0]:
df.select(dayofweek(current_date())).display()

### DAY OF YEAR

In [0]:
df.select(dayofyear(current_date())).display()

### DAY NAME

In [0]:
df.select(
    date_format(current_date(), "EEEE").alias("day_name")
).display()

### WEEK OF YEAR

In [0]:
df.select(weekofyear(current_date())).display()

### DATE ADD

In [0]:
df.select(date_add(current_date(), 7)).display()

### DATE SUB

In [0]:
df.select(date_sub(current_date(), 7)).display()

### DATE DIFF

In [0]:
df.select(datediff(current_date(),date_sub(current_date(), 7))).display()

### LAST DAY

In [0]:
df.select(last_day(current_date()).alias("last_day_of_month")).display()

### TRUNC

Truncate the date to a specified unit of time


#### YEAR

`'YEAR', 'YYYY', 'YY'` - truncate to the first date of the year that the date falls in.

In [0]:
# Truncating to "YEAR" yields the first day of the current year, so the
# output column is labelled accordingly.
df.select(trunc(current_date(), "YEAR").alias("first_day_of_year")).display()

#### QUARTER

truncate to the first date of the quarter that the date falls in.

In [0]:
# Truncating to "QUARTER" yields the first day of the current quarter, so the
# output column is labelled accordingly.
df.select(trunc(current_date(), "QUARTER").alias("first_day_of_quarter")).display()

#### MONTH

`'MONTH', 'MM', 'MON'` - truncate to the first date of the month that the date falls in.


In [0]:
df.select(trunc(current_date(), "MONTH").alias("first_day_of_month")).display()

#### WEEK

'WEEK' - truncate to the Monday of the week that the date falls in.

In [0]:
# Truncating to "WEEK" yields the Monday of the current week, so the output
# column is labelled accordingly.
df.select(trunc(current_date(), "WEEK").alias("first_day_of_week")).display()

### DATE TRUNC



#### YEAR

`'YEAR', 'YYYY', 'YY'` - truncate to the first date of the year that the date falls in.

In [0]:
# date_trunc keeps the timestamp type; truncating to "YEAR" gives the first
# instant of the current year, so the column is labelled accordingly.
df.select(date_trunc("YEAR", current_timestamp()).alias("first_day_year_ts")).display()

#### QUARTER

'QUARTER' - truncate to the first date of the quarter that the date falls in.

In [0]:
df.select(date_trunc("QUARTER", current_timestamp())).display()

#### MONTH

'MONTH', 'MM', 'MON' - truncate to the first date of the month that the date falls in.


In [0]:

df.select(date_trunc("MONTH", current_timestamp())).display()

#### WEEK

'WEEK' - truncate to the Monday of the week that the date falls in.

In [0]:
df.select(date_trunc("WEEK", current_timestamp())).display()

### MONTHS BETWEEN

In [0]:
df.select(months_between(current_date(), lit("2024-01-01").cast("date")).alias("months_between")).display()

### ADD MONTHS

In [0]:
df.select(add_months(current_date(), 2).alias("plus_2_months")).display()

### ADD YEARS

In [0]:
# add_months only takes a month count, so three years are expressed as
# 12 * 3 months; the alias reflects the actual shift.
df.select(add_months(current_date(), 12 * 3).alias("plus_3_years")).display()

###  NEXT DAY

In [0]:
df.select(next_day(current_date(), "Sunday").alias("next_sunday")).display()

## COMPLEX TYPES

In [0]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, MapType
from pyspark.sql.functions import col, size, array_contains, explode, posexplode, sort_array, reverse, array_distinct, array_position, array_remove, array_repeat, arrays_overlap, concat, slice, element_at

# Explicit schema for the complex-type examples: an integer id plus one array
# column, one struct column, one array-of-structs column and one map column.
schema = StructType([
    StructField("id", IntegerType(), True),
    # Array of strings.
    StructField("letters", ArrayType(StringType()), True),
    # Struct holding an integer `version` and a nested string array `tags`.
    StructField("metadata", StructType([
        StructField("version", IntegerType(), True),
        StructField("tags", ArrayType(StringType()), True)
    ])),
    # Array whose elements are structs with `user_id` and `action`.
    StructField("log", ArrayType(StructType([
        StructField("user_id", IntegerType(), True),
        StructField("action", StringType(), True)
    ]))),
    # Map with string keys and string values.
    StructField("info", MapType(StringType(), StringType()), True)
])

data = [
    {
        "id": 1,
        "letters": ["a", "e", "i", "o", "u"],
        "metadata": {"version": 1, "tags": ["spark", "databricks", "certification"]},
        "log": [
            {"user_id": 1, "action": "login"},
            {"user_id": 2, "action": "logout"}
        ],
        "info": {"name": "Bob", "city": "Paris", "country": "France"}
    },
    {
        "id": 2,
        "letters": ["b", "c", "d"],
        "metadata": {"version": 2, "tags": ["python", "etl"]},
        "log": [
            {"user_id": 3, "action": "login"},
            {"user_id": 4, "action": "update"}
        ],
        "info": {"name": "Alice", "city": "New York", "country": "USA"}
    },
    {
        "id": 3,
        "letters": ["x", "y", "z"],
        "metadata": {"version": 3, "tags": ["dataframe", "json", "pyspark"]},
        "log": [
            {"user_id": 5, "action": "delete"},
            {"user_id": 6, "action": "logout"}
        ],
        "info": {"name": "Carlos", "city": "Bogotá", "country": "Colombia"}
    },
    {
        "id": 4,
        "letters": ["l", "m", "n", "ñ"],
        "metadata": {"version": 4, "tags": ["analysis", "pipeline"]},
        "log": [
            {"user_id": 7, "action": "insert"},
            {"user_id": 8, "action": "login"}
        ],
        "info": {"name": "Lucho", "city": "Quebeq", "country": "Canada"}
    }
]

dfc = spark.createDataFrame(data,schema=schema)
dfc.printSchema()

In [0]:
dfc.display()

### LISTS

#### GET VALUE

In [0]:
# Show the `letters` array next to its element at index 0, reached two ways:
# indexing a col() reference and indexing the DataFrame attribute. The last
# entry selects the same array column again, this time by plain name.
dfc.select(
    col("letters"),
    col("letters")[0],
    dfc.letters[0],
    "letters"
).display()

#### SIZE

In [0]:
dfc.select(size(col("letters")).alias("size_letters")).display()

#### ARRAY CONTAINS

In [0]:
dfc.select(array_contains(col("letters"), "e").alias("contains_e")).display()


#### EXPLODE

In [0]:
dfc.select(explode(col("letters")).alias("explode_letters")).display()


In [0]:
dfc.select(col("id"), explode(col("letters")).alias("explode_letters")).display()


#### POS EXPLODE

In [0]:
dfc.select(posexplode(col("letters")).alias("pos_letter", "letter_value")).display()

#### SORT ARRAY

In [0]:
dfc.select(sort_array(col("letters")).alias("sorted_letters")).display()


#### REVERSE

In [0]:
dfc.select(reverse(col("letters")).alias("reversed_letters")).display()


#### ARRAY DISTINCT

In [0]:
dfc.select(array_distinct(col("letters")).alias("distinct_letters")).display()

#### ARRAY POSITION

In [0]:
dfc.select(array_position(col("letters"), "i").alias("position_i")).display()

#### ARRAY REMOVE

In [0]:
dfc.select(array_remove(col("letters"), "o").alias("removed_o")).display()


#### ARRAY REPEAT

In [0]:
dfc.select(array_repeat(col("letters"), 2).alias("repeated_letters")).display()


#### SLICE

In [0]:
dfc.select(slice(col("letters"), 2, 3).alias("sliced_letters")).display()

#### ELEMENT AT

In [0]:
dfc.select(element_at(col("letters"), 3).alias("third_element")).display()

#### ARRAYS OVERLAP

In [0]:
dfc.select(arrays_overlap(col("letters"), col("metadata.tags")).alias("overlap_letters_tags")).display()

#### CONCAT

In [0]:
dfc.select(concat(col("letters"), col("metadata.tags")).alias("concatenated")).display()


#### COLLECT LIST

In [0]:
df.select(collect_list(col("name"))).display()

#### COLLECT SET

In [0]:
df.select(collect_set(col("name"))).display()

### STRUCTS

#### GET VALUES

In [0]:
# Show the `metadata` struct next to its `version` field, accessed through
# several syntaxes: bracket indexing on col() and on the DataFrame attribute,
# attribute access, getItem(), and a dotted column path.
dfc.select(
    col("metadata"),
    col("metadata")["version"],
    dfc.metadata["version"],
    dfc.metadata.version,
    col("metadata").getItem("version"),
    "metadata.version"
).display()

### MAPS

#### GET VALUES

In [0]:

from pyspark.sql.functions import map_keys, map_values, size, explode, explode_outer, map_values

In [0]:
# Show the `info` map next to the value stored under its "name" key, read
# through four syntaxes: bracket indexing on col() and on the DataFrame
# attribute, getItem(), and a dotted column path. The dotted path must not
# contain stray whitespace, otherwise it is taken as part of the key.
dfc.select(
    col("info"),
    col("info")["name"].alias("form_1"),
    dfc.info["name"].alias("form_2"),
    col("info").getItem("name").alias("form_3"),
    col("info.name").alias("form_4"),
).display()

#### MAP VALUES

#### CREATE MAP

In [0]:
dfc.select(col("info")).display()

In [0]:
dfc.select(map_values(col("info")).alias("map_values")).display()

#### MAP KEYS

In [0]:
dfc.select(map_keys(col("info")).alias("map_keys")).display()

#### EXPLODE


Skip rows where the value to explode is null or empty.

In [0]:
dfc.select(explode(col("info")).alias("key", "value")).display()

#### EXPLODE OUTER
Preserves rows with null, setting null to exploded fields.

In [0]:
dfc.select(explode_outer(col("info")).alias("key", "value")).display()