In [3]:
spark.sql('DROP TABLE txkt.dim_video_df')

DataFrame[]

In [5]:
spark.sql(
    '''
        CREATE TABLE IF NOT EXISTS txkt.dim_video_df
         (
          `video_id`             STRING COMMENT '视频ID',
          `video_title`          STRING COMMENT '视频标题',
          `video_type1`          STRING COMMENT '视频大类',
          `video_type2`          STRING COMMENT '视频中类',
          `video_type3`          STRING COMMENT '视频小类',
          `video_url`            STRING COMMENT '视频url',
          `organ_name`           STRING COMMENT '视频所属机构名称',
          `video_sections_num`   BIGINT COMMENT '视频节数',
          `video_index_page`     BIGINT COMMENT '视频所在页数',
          `price`                BIGINT COMMENT '收费信息(单位：分)',
          `person_num`           BIGINT COMMENT '购买/报名人数',
          `recently_study_num`   BIGINT COMMENT '最近在学',
          `sign_up_num`          BIGINT COMMENT '累计报名',
          `video_praise_degree`  DOUBLE COMMENT '好评度',
          `etl_time`             STRING COMMENT '数据加工时间'
        ) 
        PARTITIONED BY (`dt` STRING) 
        STORED AS PARQUET
    '''
)


DataFrame[]

In [6]:
spark.sql(
    'INSERT OVERWRITE TABLE txkt.dim_video_df PARTITION (dt)' + 
    '''
        SELECT SPLIT(video_url, '/')[4] AS video_id,
               video_title, 
               video_type1,        
               video_type2,         
               video_type3,                 
               video_url,           
               organ_name, 
               CAST(CASE WHEN video_sections_num IS NOT NULL THEN video_sections_num ELSE 0 END AS BIGINT) AS video_sections_num,  
               video_index_page,  
               CAST(CASE WHEN price              IS NOT NULL THEN price              ELSE 0 END AS BIGINT) AS price,  
               CAST(CASE WHEN person_num         IS NOT NULL THEN person_num         ELSE 0 END AS BIGINT) AS person_num,  
               CAST(CASE WHEN recently_study_num IS NOT NULL THEN recently_study_num ELSE 0 END AS BIGINT) AS recently_study_num,  
               CAST(CASE WHEN sign_up_num IS NOT NULL THEN REPLACE(recently_study_num, '万', '0000') ELSE 0 END AS BIGINT) AS sign_up_num,  
               CASE WHEN video_praise_degree > '0' THEN CAST(REPLACE(video_praise_degree, '%', '') AS INT)/100 ELSE 0 END  AS video_praise_degree,            
               FROM_UNIXTIME(CAST(NOW() AS BIGINT), 'yyyy-MM-dd HH:mm:ss') AS elt_time,
               DATE_FORMAT(TO_TIMESTAMP(grab_time, 'yyyy-MM-dd HH:mm:ss'), 'yyyy-MM-dd') AS dt 
          FROM ( 
                 SELECT video_title,  
                        video_type1,       
                        video_type2,         
                        video_type3,                 
                        video_url,           
                        organ_name, 
                        video_sections_num,  
                        video_index_page,            
                        price,               
                        person_num,          
                        recently_study_num,  
                        sign_up_num,         
                        video_praise_degree,
                        grab_time,
                        ROW_NUMBER() OVER(PARTITION BY video_url ORDER BY grab_time DESC) AS rk
                   FROM txkt.ods_tencent_study_video
                  WHERE dt='{dt}' 
               ) AS a
         WHERE rk=1
    '''.format(dt='2022-03-27')
)

DataFrame[]

In [7]:
spark.sql("SELECT * FROM txkt.dim_video_df WHERE dt='2022-03-27' LIMIT 10;").toPandas()

Unnamed: 0,video_id,video_title,video_type1,video_type2,video_type3,video_url,organ_name,video_sections_num,video_index_page,price,person_num,recently_study_num,sign_up_num,video_praise_degree,etl_time,dt
0,1044635,福建教招语文第三课：过山车式的文言文，你别倒过来，行吗？,考试·考证,公考教资,教资教招,https://ke.qq.com/course/1044635,闽试教育,1,19,0,3,24,24,0.0,2022-03-27 13:24:50,2022-03-27
1,1182147,二建建筑实物记忆,考试·考证,建造工程,二级建造师,https://ke.qq.com/course/1182147,鹏程记忆,3,8,0,1,2,2,0.0,2022-03-27 13:24:50,2022-03-27
2,1182203,2分钟带你认识移动直播SDK,IT·互联网,前沿技术,云计算,https://ke.qq.com/course/1182203,腾讯云,1,17,0,55,1,1,0.0,2022-03-27 13:24:50,2022-03-27
3,121040,亿启教育股票（金融）占星交易,兴趣·生活,投资理财,股票,https://ke.qq.com/course/121040,亿启教育（证券投资教育）,15,16,18800,1,0,0,0.9,2022-03-27 13:24:50,2022-03-27
4,1237432,专业歌手演唱技巧训练班（直播课）,兴趣·生活,音乐乐器,唱歌发声,https://ke.qq.com/course/1237432,玄博SO-E音乐课堂,38,4,490000,1,0,0,1.0,2022-03-27 13:24:50,2022-03-27
5,1294765,无用编程之—Scratch入门—S01,IT·互联网,后台开发,其他,https://ke.qq.com/course/1294765,景洪景乐网络科技有限公司,1,17,0,1,2,2,0.0,2022-03-27 13:24:50,2022-03-27
6,1294967,英语国际贸易词汇精读与精讲02,语言·留学,英语,词汇语法,https://ke.qq.com/course/1294967,深圳市佳域通科技实业有限公司,1,12,768,3,0,0,0.0,2022-03-27 13:24:50,2022-03-27
7,1406155,自考 线性代数（经管类）,考试·考证,大学学历,自考,https://ke.qq.com/course/1406155,方子春,6,34,23500,2,0,0,0.0,2022-03-27 13:24:50,2022-03-27
8,1466474,腾讯云服务器产品快速配置Windows实例概念解析,IT·互联网,前沿技术,云计算,https://ke.qq.com/course/1466474,腾讯云,1,12,0,2,5,5,0.0,2022-03-27 13:24:50,2022-03-27
9,1498676,ps动画gif动图闪图设计制作淘宝美工视频教程,设计·创作,工业产品设计,产品设计,https://ke.qq.com/course/1498676,南通实战王淘宝美工运营培训,2,17,0,11,44,44,0.0,2022-03-27 13:24:50,2022-03-27


In [57]:
spark.sql(
    '''
    SELECT video_id, COUNT(*) AS cnt
      FROM txkt.dim_video_df
     WHERE dt='2022-03-24'
  GROUP BY video_id
    HAVING cnt>1
    '''
).show()

+--------+---+
|video_id|cnt|
+--------+---+
+--------+---+



In [21]:
spark.sql(
    '''
    SELECT '' > '0', '1%' > '0'
    '''
).show()

+------+--------+
|( > 0)|(1% > 0)|
+------+--------+
| false|    true|
+------+--------+



In [58]:
spark.sql(
    '''
    SELECT SPLIT(video_url, '/')[4], video_url
      FROM txkt.dim_video_df
     WHERE dt='2022-03-24'
    '''
).show(2, False)

+--------------------------+--------------------------------+
|split(video_url, /, -1)[0]|video_url                       |
+--------------------------+--------------------------------+
|https:                    |https://ke.qq.com/course/1103217|
|https:                    |https://ke.qq.com/course/1129012|
+--------------------------+--------------------------------+
only showing top 2 rows

