In [0]:
from pyspark.sql import functions
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import IntegerType
import pandas as pd
import numpy as np

def process_latitude(latitude):
    """Cast a digits-only latitude column to an integer column.

    Args:
        latitude: Spark ``Column`` to validate (presumably string-typed,
            since it is matched against a regular expression).

    Returns:
        A ``Column`` holding ``latitude`` cast to ``IntegerType`` for values
        matching ``^[0-9]+$`` (one or more digits), and null otherwise.
    """
    # `when` is reached through the `functions` module imported at the top of
    # this cell, so the helper does not rely on a bare `when` being in scope.
    # NOTE(review): the pattern rejects signed or fractional values such as
    # "-12.5", which therefore become null -- confirm this is intended.
    is_unsigned_integer = latitude.rlike(r"^[0-9]+$")
    return functions.when(
        is_unsigned_integer, latitude.cast(IntegerType())
    ).otherwise(None)

def process_longitude(longitude):
    """Cast a digits-only longitude column to an integer column.

    Args:
        longitude: Spark ``Column`` to validate (presumably string-typed,
            since it is matched against a regular expression).

    Returns:
        A ``Column`` holding ``longitude`` cast to ``IntegerType`` for values
        matching ``^[0-9]+$`` (one or more digits), and null otherwise.
    """
    # `when` is reached through the `functions` module imported at the top of
    # this cell, so the helper does not rely on a bare `when` being in scope.
    # NOTE(review): the pattern rejects signed or fractional values such as
    # "-73.9", which therefore become null -- confirm this is intended.
    is_unsigned_integer = longitude.rlike(r"^[0-9]+$")
    return functions.when(
        is_unsigned_integer, longitude.cast(IntegerType())
    ).otherwise(None)

# Rebuild the nested "data" struct with a typed timestamp and an added
# "coordinates" field; the other fields are carried over unchanged.
# Spark helpers are accessed through the `functions` module imported above so
# this statement only depends on names this cell brings into scope.
df_geo = df_geo.withColumn(
    "data",
    functions.struct(
        functions.col("data.ind").alias("ind"),
        # 1. Convert the timestamp field from a string to a timestamp data
        #    type. The type-name string "timestamp" is equivalent to passing
        #    TimestampType() and needs no extra import.
        functions.col("data.timestamp").cast("timestamp").alias("timestamp"),
        functions.col("data.latitude").alias("latitude"),
        functions.col("data.longitude").alias("longitude"),
        functions.col("data.country").alias("country"),
        # 2. Create a new field 'coordinates' holding an array built from the
        #    latitude and longitude fields, in that order.
        functions.array(
            functions.col("data.latitude"),
            functions.col("data.longitude"),
        ).alias("coordinates"),
    ),
)

# 3. Reorder the DataFrame columns: promote the listed "data" fields to
#    top-level columns in this exact order. Latitude and longitude are not
#    selected individually; their values remain available via "coordinates".
#    `col` is accessed through the `functions` module imported above.
df_geo = df_geo.select(
    [
        functions.col(f"data.{field_name}").alias(field_name)
        for field_name in ("ind", "country", "coordinates", "timestamp")
    ]
)

# Render the result; `display` is presumably supplied by the notebook runtime.
display(df_geo)