#! https://zhuanlan.zhihu.com/p/450636026

![Image](https://pic4.zhimg.com/80/v2-5db1a82996ec388725185ae900a58008.jpg)

# PySpark 时间处理

In [1]:
from pyspark.sql import functions as F

## 示例数据

In [2]:
# Sample rows: an id plus an ISO-formatted date, both kept as strings.
data = [
    ["1", "2020-02-01"],
    ["2", "2019-03-01"],
    ["3", "2021-03-01"],
]
df = spark.createDataFrame(data, schema=["id", "time"])
df.show()

+---+----------+
| id|      time|
+---+----------+
|  1|2020-02-01|
|  2|2019-03-01|
|  3|2021-03-01|
+---+----------+



## 日期

### 当前日期 `current_date()`

- 获取当前系统日期。默认情况下，数据将以`yyyy-MM-dd`格式返回。

In [3]:
# Project today's date as a single column and print only the first row.
today = F.current_date().alias("current_date")
df.select(today).show(n=1)

+------------+
|current_date|
+------------+
|  2022-01-10|
+------------+
only showing top 1 row



### 日期格式 `date_format()`

- 解析日期并将`yyyy-MM-dd`格式转换为`MM-dd-yyyy`格式。

In [4]:
# Show the original string next to the same date rendered as MM-dd-yyyy.
reformatted = F.date_format("time", "MM-dd-yyyy").alias("date_format")
df.select("time", reformatted).show()

+----------+-----------+
|      time|date_format|
+----------+-----------+
|2020-02-01| 02-01-2020|
|2019-03-01| 03-01-2019|
|2021-03-01| 03-01-2021|
+----------+-----------+



### 使用`to_date()`将日期格式字符串`yyyy-MM-dd`转换为`DateType yyyy-MM-dd`

In [5]:
# Parse the string column into a DateType column. The pattern is spelled
# "yyyy-MM-dd" (four-letter year) so that it matches both the sample data
# and the format named in the section heading.
df.select(
    F.col("time"),
    F.to_date(F.col("time"), "yyyy-MM-dd").alias("to_date"),
).show()

+----------+----------+
|      time|   to_date|
+----------+----------+
|2020-02-01|2020-02-01|
|2019-03-01|2019-03-01|
|2021-03-01|2021-03-01|
+----------+----------+



### 两个日期之间的日差`datediff()`

In [6]:
# Number of days from each row's date up to the current date.
days_elapsed = F.datediff(F.current_date(), "time").alias("datediff")
df.select("time", days_elapsed).show()

+----------+--------+
|      time|datediff|
+----------+--------+
|2020-02-01|     709|
|2019-03-01|    1046|
|2021-03-01|     315|
+----------+--------+



### 两个日期之间的月份`months_between()`

In [7]:
# Months (possibly fractional) from each row's date up to the current date.
months_elapsed = F.months_between(F.current_date(), "time").alias("months_between")
df.select("time", months_elapsed).show()

+----------+--------------+
|      time|months_between|
+----------+--------------+
|2020-02-01|   23.29032258|
|2019-03-01|   34.29032258|
|2021-03-01|   10.29032258|
+----------+--------------+



### 截断指定单位的日期`trunc()`

In [8]:
df.select(F.col("time"), 
    F.trunc(F.col("time"),"Month").alias("Month_Trunc"), 
    F.trunc(F.col("time"),"Year").alias("Month_Year"), 
    F.trunc(F.col("time"),"Month").alias("Month_Trunc")).show()

+----------+-----------+----------+-----------+
|      time|Month_Trunc|Month_Year|Month_Trunc|
+----------+-----------+----------+-----------+
|2020-02-01| 2020-02-01|2020-01-01| 2020-02-01|
|2019-03-01| 2019-03-01|2019-01-01| 2019-03-01|
|2021-03-01| 2021-03-01|2021-01-01| 2021-03-01|
+----------+-----------+----------+-----------+



### 月、日加减法

In [9]:
# Shift each date by +/-3 months and by +/-4 days.
time_col = F.col("time")
shifted = [
    F.add_months(time_col, 3).alias("add_months"),
    F.add_months(time_col, -3).alias("sub_months"),
    F.date_add(time_col, 4).alias("date_add"),
    F.date_sub(time_col, 4).alias("date_sub"),
]
df.select(time_col, *shifted).show()

+----------+----------+----------+----------+----------+
|      time|add_months|sub_months|  date_add|  date_sub|
+----------+----------+----------+----------+----------+
|2020-02-01|2020-05-01|2019-11-01|2020-02-05|2020-01-28|
|2019-03-01|2019-06-01|2018-12-01|2019-03-05|2019-02-25|
|2021-03-01|2021-06-01|2020-12-01|2021-03-05|2021-02-25|
+----------+----------+----------+----------+----------+



### 年、月、下一天、一年中第几个星期、一个月的最后一天

- `year`: 返回日期年份
- `month`: 返回日期月份
- `next_day`: 返回给定日期之后第一个指定星期几（示例中为 `Sunday`）的日期
- `weekofyear`: 返回日期是一年中的第几个星期
- `last_day`: 返回日期当月的最后一天日期

In [10]:
# Calendar fields derived from each date.
date_parts = [
    F.year("time").alias("year"),
    F.month("time").alias("month"),
    # First Sunday after the given date.
    F.next_day("time", "Sunday").alias("next_day"),
    F.weekofyear("time").alias("weekofyear"),
    F.last_day("time").alias("last_day"),
]
df.select("time", *date_parts).show()

+----------+----+-----+----------+----------+----------+
|      time|year|month|  next_day|weekofyear|  last_day|
+----------+----+-----+----------+----------+----------+
|2020-02-01|2020|    2|2020-02-02|         5|2020-02-29|
|2019-03-01|2019|    3|2019-03-03|         9|2019-03-31|
|2021-03-01|2021|    3|2021-03-07|         9|2021-03-31|
+----------+----+-----+----------+----------+----------+



### 星期几、月日、年日、季度
- 查询星期几
- 一个月中的第几天
- 一年中的第几天
- 该日期属于哪一个季度

In [11]:
# Position of each date within its week, month, year, and quarter.
time_col = F.col("time")
positions = [
    F.dayofweek(time_col).alias("dayofweek"),
    F.dayofmonth(time_col).alias("dayofmonth"),
    F.dayofyear(time_col).alias("dayofyear"),
    F.quarter(time_col).alias("quarter"),
]
df.select(time_col, *positions).show()

+----------+---------+----------+---------+-------+
|      time|dayofweek|dayofmonth|dayofyear|quarter|
+----------+---------+----------+---------+-------+
|2020-02-01|        7|         1|       32|      1|
|2019-03-01|        6|         1|       60|      1|
|2021-03-01|        2|         1|       60|      1|
+----------+---------+----------+---------+-------+



## 时间

### 创建一个测试数据

In [12]:
# Timestamps stored as strings laid out as "MM-dd-yyyy HH mm ss SSS".
data = [
    ["1", "02-01-2020 11 01 19 06"],
    ["2", "03-01-2019 12 01 19 406"],
    ["3", "03-01-2021 12 01 19 406"],
]
df2 = spark.createDataFrame(data, schema=["id", "time"])
df2.show(truncate=False)

+---+-----------------------+
|id |time                   |
+---+-----------------------+
|1  |02-01-2020 11 01 19 06 |
|2  |03-01-2019 12 01 19 406|
|3  |03-01-2021 12 01 19 406|
+---+-----------------------+



### 以 spark 默认格式`yyyy-MM-dd HH:mm:ss`返回当前时间戳

In [13]:
# One current-timestamp value is printed per row of df2.
now = F.current_timestamp().alias("current_timestamp")
df2.select(now).show()

+--------------------+
|   current_timestamp|
+--------------------+
|2022-01-10 21:07:...|
|2022-01-10 21:07:...|
|2022-01-10 21:07:...|
+--------------------+



### 将字符串时间戳转换为时间戳类型格式 `to_timestamp()`

In [14]:
# Parse the space-separated string into a timestamp using an explicit pattern.
parsed = F.to_timestamp("time", "MM-dd-yyyy HH mm ss SSS").alias("to_timestamp")
df2.select("time", parsed).show(truncate=False)

+-----------------------+-----------------------+
|time                   |to_timestamp           |
+-----------------------+-----------------------+
|02-01-2020 11 01 19 06 |2020-02-01 11:01:19.06 |
|03-01-2019 12 01 19 406|2019-03-01 12:01:19.406|
|03-01-2021 12 01 19 406|2021-03-01 12:01:19.406|
+-----------------------+-----------------------+



### `小时`、`分钟`、`秒`

In [15]:
# Sample data: timestamps as strings in the "yyyy-MM-dd HH:mm:ss.SSS" layout.
data = [
    ["1", "2020-02-01 11:01:19.06"],
    ["2", "2019-03-01 12:01:19.406"],
    ["3", "2021-03-01 12:01:19.406"],
]
df3 = spark.createDataFrame(data, schema=["id", "time"])

# Extract the hour, minute, and second components.
clock_parts = [
    F.hour("time").alias("hour"),
    F.minute("time").alias("minute"),
    F.second("time").alias("second"),
]
df3.select("time", *clock_parts).show(truncate=False)

+-----------------------+----+------+------+
|time                   |hour|minute|second|
+-----------------------+----+------+------+
|2020-02-01 11:01:19.06 |11  |1     |19    |
|2019-03-01 12:01:19.406|12  |1     |19    |
|2021-03-01 12:01:19.406|12  |1     |19    |
+-----------------------+----+------+------+



## 时间戳

### 将时间戳转换为日期时间

In [16]:
# Convert epoch seconds (counted from 1970) into a date-time string column.
time_df = spark.createDataFrame(
    [(1428476400,)], schema=["unix_time"]
).withColumn("ts", F.from_unixtime("unix_time"))
time_df.show()

+----------+-------------------+
| unix_time|                 ts|
+----------+-------------------+
|1428476400|2015-04-08 15:00:00|
+----------+-------------------+



### `unix_timestamp`把日期 `String` 转换成 `timestamp` 秒数，是上面操作的反操作

In [17]:
# Parse the date-time string back into epoch seconds.
epoch_seconds = F.unix_timestamp("ts", "yyyy-MM-dd HH:mm:ss")
time_df = time_df.withColumn("unix_timestamp", epoch_seconds)
time_df.show()

+----------+-------------------+--------------+
| unix_time|                 ts|unix_timestamp|
+----------+-------------------+--------------+
|1428476400|2015-04-08 15:00:00|    1428476400|
+----------+-------------------+--------------+



---