### Convert Data types

In [1]:
# Load the lakehouse table "MtoMActual" into a Spark DataFrame.
df = spark.read.table(tableName="MtoMActual")

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, aaebc34e-a6b5-4276-9602-9b514b448dda)

In [3]:
# Keep the three columns and convert Actual to an integer.
# A cast of a plain column keeps that column's name, so the result is still called "Actual".
df = df.select(
    "Country",
    "Location",
    df["Actual"].cast("int"),
)
display(df)

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, dc5b3b2b-1676-41a9-bc8b-63a1155ba947)

In [2]:
# Summary statistics on the raw table, where Actual is still stored as text.
# min/max are therefore compared as strings (character by character), which is why
# "11000" is reported as the minimum and "7000" as the maximum.
#
# The table is read directly instead of reusing `df`, so the cell shows the same
# result no matter whether the cast cell has already run.
# show() prints the table and returns None, so it is not wrapped in display().
spark.read.table("MtoMActual").describe(["Country", "Actual"]).show()

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 4, Finished, Available, Finished)

+-------+-------+------------------+
|summary|Country|            Actual|
+-------+-------+------------------+
|  count|      6|                 6|
|   mean|   NULL| 7166.666666666667|
| stddev|   NULL|4020.7793606049395|
|    min|England|             11000|
|    max|  Italy|              7000|
+-------+-------+------------------+



In [4]:
%%sql
-- Without an alias the cast column would be named "CAST(Actual AS INT)";
-- aliasing it keeps the same column name as the DataFrame version of this query.
SELECT
    Country,
    Location,
    CAST(Actual AS INT) AS Actual
FROM MtoMActual

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 6, Finished, Available, Finished)

<Spark SQL result set with 6 rows and 3 fields>

### Importing data using an explicit data structure
explicit（明确的）：这里指在读取文件时直接声明每一列的名称和数据类型，而不是让 Spark 自动推断。

In [9]:
from pyspark.sql.types import *
# Declare the schema up front so Actual is read as an integer instead of text.
# StructField(name, dataType, nullable): all three columns allow nulls.
schemaTarget = StructType([
    StructField("Country", StringType(), True),
    StructField("Location", StringType(), True),
    StructField("Actual", IntegerType(), True),
])

df = (
    spark.read.format("csv")
    .option("header", "true")  # the first line of the file holds the column names
    .schema(schemaTarget)
    .load("Files/MtoMActual2.csv")
)

# A bare `df.schema` in the middle of a cell is evaluated and thrown away;
# printSchema() actually prints the column names and types.
df.printSchema()
display(df)

# The default save mode raises an error when the table already exists, which makes the
# cell fail on every re-run. Overwriting keeps the cell repeatable.
df.write.format("delta").mode("overwrite").saveAsTable("mtomactualstruct")

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 24283786-66a8-490d-a319-0aafc26c3d4e)

In [11]:
# Read back the table that was saved with the explicit schema and render it.
df = spark.read.table(tableName="MtoMActualStruct")
display(df)

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c949874d-3814-48b4-a225-3120d1aa2080)

### Formatting dates as String

这里的 "d MMM yyyy" 是一种日期格式字符串，用来定义日期怎么显示或转换成字符串。下面的代码实际使用的是 "EEEE d MMM yyyy"，其中开头的 `EEEE` 表示星期几的全称。

具体含义如下：
| 格式符号   | 说明            | 示例（2025-08-12） |
| ------ | ------------- | -------------- |
| `EEEE` | 星期几的全称        | `Tuesday`      |
| `d`    | 一个月中的天数（不补0）  | `12`（8月5日则显示 `5`） |
| `dd`   | 一个月中的天数（补0）   | `12`（8月5日则显示 `05`） |
| `MMM`  | 月份的英文缩写（三个字母） | `Aug`          |
| `MMMM` | 月份的全称         | `August`       |
| `yyyy` | 四位数年份         | `2025`         |
| `yy`   | 两位数年份         | `25`           |



In [26]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
df = spark.read.table("mtomactualwithdates")
df = df.select(
    df.Country,
    df.Location,
    df.Actual.cast("int"),
    # alias() has to wrap the whole concat(...) expression. Placed on the inner
    # date_format(...) it only names an intermediate value, and the output column
    # ends up with an auto-generated name instead of "FormattedDate".
    F.concat(
        F.lit("The date is: "),
        F.date_format(df.ColDate, "EEEE d MMM yyyy"),
    ).alias("FormattedDate"),
)
display(df)

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 28, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 72a764af-a9dc-415e-b3c0-e9f6cf67f881)

In [27]:
%%sql
-- Single quotes are used for the string literals: double-quoted text is treated as an
-- identifier when ANSI double-quoted identifiers are enabled.
-- The cast is aliased so the column is named Actual rather than "CAST(Actual AS INT)".
SELECT
    Country,
    Location,
    CAST(Actual AS INT) AS Actual,
    concat('The date is: ', date_format(ColDate, 'EEEE d MMM yyyy')) AS FormattedDate
FROM mtomactualwithdates

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 29, Finished, Available, Finished)

<Spark SQL result set with 6 rows and 4 fields>

### Aggregating and re-filtering data

In [43]:
df = spark.read.table("mtomactualstruct")

# Variant 1: GroupedData.sum() names its result "sum(Actual)", so rename it afterwards.
# The filter is written as a SQL string because `df` itself never changes and has no
# ActualTotal column, so df.ActualTotal cannot be used here.
display(
    df.groupBy("Country")
    .sum("Actual")
    .withColumnRenamed("sum(Actual)", "ActualTotal")
    .where("ActualTotal > 10000")
)

# Variant 2: aggregate and rename in one step with agg(...).alias(...).
# F.sum is referenced through the module alias so it is never confused with
# Python's built-in sum(), which a wildcard import silently shadows.
display(
    df.groupBy("Country")
    .agg(F.sum("Actual").alias("ActualTotal"))
    .where("ActualTotal > 10000")
)

# Not valid: GroupedData.sum() takes column names as strings, not Column objects.
# display(df.groupBy(df.Country).sum(df.Actual))

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 45, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 4f38f15e-38cd-41e6-a5fe-918e8409e6b9)

SynapseWidget(Synapse.DataFrame, 58826678-0a63-48eb-b858-2f4a3c9c7457)

In [65]:
%%sql
-- Total Actual per country, keeping only the countries whose total exceeds 10000.
SELECT
    Country,
    SUM(Actual) AS ActualTotal
FROM mtomactualstruct
GROUP BY Country
HAVING ActualTotal > 10000
-- The same filter can be written against the aggregate instead of its alias:
-- HAVING SUM(Actual) > 10000

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 67, Finished, Available, Finished)

<Spark SQL result set with 2 rows and 2 fields>

### Sorting the results

In [61]:
# Each statement below shows a different way to spell a sort; the code is kept as written
# because the syntax itself is what is being demonstrated.
# Single column, ascending (ascending is the default direction).
display(df.orderBy("Location"))
# Country descending, then Location ascending, passing Column objects.
display(df.orderBy(desc(df.Country), df.Location))
# sort() and orderBy() do the same thing and can be used interchangeably.
display(df.sort(desc("Country"), "Location"))
display(df.sort(desc(df.Country), df.Location))
display(df.sort(df.Country.desc(), df.Location.asc())) # asc() is optional (ascending is the default); spelling it out is clearer
display(df.sort(df.Country, ascending=False)) # a single bool applies to every listed column; pass a list of bools (one per column) for per-column directions

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 63, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 37ea0497-c763-4c7c-a096-5de618e0b7b3)

SynapseWidget(Synapse.DataFrame, 6fe89844-3f5e-427c-a083-f93813ae93fa)

SynapseWidget(Synapse.DataFrame, 24b990ea-2c8c-4b0f-8799-59df1b99ec6b)

SynapseWidget(Synapse.DataFrame, 7b466d28-79ba-4c03-ae6e-a49ccea385b5)

SynapseWidget(Synapse.DataFrame, 962b85b6-5784-4fa4-b068-fd5edf596f8b)

SynapseWidget(Synapse.DataFrame, 8d2e3543-436c-4241-8d9c-2d0eba97764f)

In [56]:
%%sql
-- ASC is the default direction; it is written out for Location only to make the intent explicit.
SELECT *
FROM mtomactualstruct
ORDER BY
    Country DESC,
    Location ASC

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 58, Finished, Available, Finished)

<Spark SQL result set with 6 rows and 3 fields>

### Using all 6 SQL Clauses

In [70]:
# DataFrame counterpart of SELECT / FROM / WHERE / GROUP BY / HAVING / ORDER BY.
display(
    df.select("Country", "Actual")
    .filter("Actual > 4000")  # WHERE: row-level filter applied before aggregation
    .groupBy("Country")
    .sum("Actual")
    .withColumnRenamed("sum(Actual)", "ActualTotal")
    .filter("ActualTotal > 10000")  # HAVING: filter applied to the aggregated totals
    .sort(desc("ActualTotal"), "Country")
)

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 72, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 415a0cc6-ecdc-456f-b26c-44177dd16fc0)

In [84]:
%%sql
-- An earlier version of this query failed because it grouped by Country while also
-- selecting the raw Actual column; only the aggregated SUM(Actual) may be selected.
SELECT
    Country,
    SUM(Actual) AS ActualTotal
FROM mtomactualstruct
WHERE Actual > 4000
GROUP BY Country
HAVING SUM(Actual) > 10000
ORDER BY
    ActualTotal DESC,
    Country

StatementMeta(, 470b7f56-e58a-438f-a0c2-6f81432ac399, 86, Finished, Available, Finished)

<Spark SQL result set with 2 rows and 2 fields>