### Import Data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# 唯一需要修改的部分
BASE_PATH = 'path'


def dict_to_schema(col_type_dict):
    return StructType([
        StructField(col, typ, True) for col, typ in col_type_dict.items()
    ])



# Create a Spark session
spark = SparkSession.builder \
    .appName("Simple Example") \
    .getOrCreate()

# define the schema for each DataFrame if needed
member_schema_dict = {
    "ShopId": IntegerType(),
    "ShopMemberId": StringType(),
    "RegisterSourceTypeDef": StringType(),
    "RegisterDateTime": StringType(),  # Use StringType for datetime, then convert if needed
    "Gender": StringType(),
    "Birthday": StringType(),
    "APPRefereeId": IntegerType(),
    "APPRefereeLocationId": IntegerType(),
    "IsAppInstalled": StringType(),  # Boolean (True/False as strings in CSV)
    "IsEnableEmail": StringType(),
    "IsEnablePushNotification": StringType(),
    "IsEnableShortMessage": StringType(),
    "FirstAppOpenDateTime": StringType(),
    "LastAppOpenDateTime": StringType(),
    "MemberCardLevel": IntegerType(),
    "CountryAliasCode": StringType(),
}

order_tg_schema_dict = {
    "ShopId": IntegerType(),
    "ShopMemberId": StringType(),
    "TradesGroupCode": StringType(),
    "OrderDateTime": StringType(),
    "ChannelType": StringType(),
    "ChannelDetail": StringType(),
    "PaymentType": StringType(),
    "ShippingType": StringType(),
    "TsCount": IntegerType(),
    "Qty": IntegerType(),
    "TotalSalesAmount": DoubleType(),
    "TotalPrice": DoubleType(),
    "TotalDiscount": DoubleType(),
    "TotalPromotionDiscount": DoubleType(),
    "TotalCouponDiscount": DoubleType(),
    "TotalLoyaltyPointDiscount": DoubleType(),
    "StatusDef": StringType(),
}

order_ts_schema_dict = {
    "ShopId": IntegerType(),
    "ShopMemberId": StringType(),
    "TradesGroupCode": StringType(),
    "TradesSlaveCode": StringType(),
    "OrderDateTime": StringType(),
    "OrderFinishDateTime": StringType(),
    "ChannelType": StringType(),
    "ChannelDetail": StringType(),
    "PaymentType": StringType(),
    "ShippingType": StringType(),
    "OuterProductSkuCode": StringType(),
    "ProductSkuCode": StringType(),
    "SalePageId": StringType(),  # NOTE: StringType here!
    "Qty": IntegerType(),
    "UnitPrice": DoubleType(),
    "SubtotalPrice": DoubleType(),
    "SubtotalSalesAmount": DoubleType(),
    "SubtotalPromotionDiscount": DoubleType(),
    "SubtotalCouponDiscount": DoubleType(),
    "SubtotalLoyaltyPointDiscount": DoubleType(),
    "StatusDef": StringType(),
}

sale_page_schema_dict = {
    "ShopId": IntegerType(),
    "SalePageId": StringType(),  # Even if described as integer, better as String for joins
    "SalePageTitle": StringType(),
    "SaleProductDescShortContent": StringType(),
}

segment_schema_dict = {
    "ShopId": IntegerType(),
    "ShopMemberId": StringType(),
    "DataSourceDate": StringType(),  # Use StringType for date, convert later if needed
    "CategorySegment": StringType(),
}

schema_map = {
    'df_member': dict_to_schema(member_schema_dict),
    'df_order_tg': dict_to_schema(order_tg_schema_dict),
    'df_order_ts': dict_to_schema(order_ts_schema_dict),
    'df_sale_page': dict_to_schema(sale_page_schema_dict),
    'df_segment': dict_to_schema(segment_schema_dict),
}

file_map = {
    'df_member': 'Member.csv',
    'df_order_tg': 'Order_TG.csv',
    'df_order_ts': 'Order_TS.csv',
    'df_sale_page': 'SalePage.csv',
    'df_segment': 'Segment.csv',
}

# Read CSV files with specified schemas
for df_name, file_name in file_map.items():
    file_path = f"{BASE_PATH}/{file_name}"
    schema = schema_map[df_name]
    df = spark.read.csv(file_path, header=True, schema=schema)
    globals()[df_name] = df  # Store DataFrame in a variable with the name df_name

# Show the loaded data
for df_name in file_map.keys():
    print(f"{df_name}:")
    globals()[df_name].show(5)
    globals()[df_name].printSchema()

df_member:
+------+--------------------+---------------------+--------------------+------+----------+------------+--------------------+--------------+-------------+------------------------+--------------------+--------------------+--------------------+---------------+----------------+
|ShopId|        ShopMemberId|RegisterSourceTypeDef|    RegisterDateTime|Gender|  Birthday|APPRefereeId|APPRefereeLocationId|IsAppInstalled|IsEnableEmail|IsEnablePushNotification|IsEnableShortMessage|FirstAppOpenDateTime| LastAppOpenDateTime|MemberCardLevel|CountryAliasCode|
+------+--------------------+---------------------+--------------------+------+----------+------------+--------------------+--------------+-------------+------------------------+--------------------+--------------------+--------------------+---------------+----------------+
|  NULL|SGE559ZPZi96JMssg...|                Store|2018-12-30 00:00:...|Female|1966-08-15|           0|                   0|          True|         True|           

25/05/24 21:46:30 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: ShopId, ShopMemberId, TradesGroupCode, TradesSlaveCode, OrderDateTime, OrderFinishDateTime, ChannelType, ChannelDetail, PaymentType, ShippingType, OuterProductSkuCode, ProductSkuCode, SalePageId, Qty, UnitPrice, SubtotalSalesAmount, SubtotalPrice, SubtotalPromotionDiscount, SubtotalCouponDiscount, SubtotalLoyaltyPointDiscount, StatusDef
 Schema: ShopId, ShopMemberId, TradesGroupCode, TradesSlaveCode, OrderDateTime, OrderFinishDateTime, ChannelType, ChannelDetail, PaymentType, ShippingType, OuterProductSkuCode, ProductSkuCode, SalePageId, Qty, UnitPrice, SubtotalPrice, SubtotalSalesAmount, SubtotalPromotionDiscount, SubtotalCouponDiscount, SubtotalLoyaltyPointDiscount, StatusDef
Expected: SubtotalPrice but found: SubtotalSalesAmount
CSV file: file:///Users/bryant_lue/Downloads/91APP_Dataset(會員&主單&子單&商品頁&標籤)/Order_TS.csv


### 確認 SalePageId 是否全部都是缺失值

In [23]:
# Reassign from globals to local variables
df_member = globals()['df_member']
df_order_tg = globals()['df_order_tg']
df_order_ts = globals()['df_order_ts']
df_sale_page = globals()['df_sale_page']
df_segment = globals()['df_segment']


null_count = df_order_ts.filter(df_order_ts['SalePageId'].isNull()).count()
not_null_count = df_order_ts.filter(df_order_ts['SalePageId'].isNotNull()).count()
print(f"Null SalePageId count: {null_count}")
print(f"Not Null SalePageId count: {not_null_count}")



Null SalePageId count: 33963994
Not Null SalePageId count: 12560835


                                                                                

In [None]:
# Stop the Spark session
# spark.stop()