In [102]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as func
from pyspark.sql import Window
from pyspark.sql.types import *

In [103]:
# Build (or reuse) the session through the builder so that re-running this cell
# in a live kernel does not fail with "Cannot run multiple SparkContexts at once",
# which is what constructing SparkContext(...) directly does when one already exists.
spark = SparkSession.builder.appName("step2").getOrCreate()
# Keep the `sc` handle: the last cell of the notebook stops the application through it.
sc = spark.sparkContext

In [104]:
# Directory holding the CSV input that is loaded into `df` below.
input_path = "./step2_input/"

In [105]:
# Load the CSV data under input_path; the first line of each file is the header
# and column types are inferred from the values.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(input_path)
)
df.show()

+-------------------+-------+--------+-------------+--------------+--------------+------------+------+-----+-------------------+------------------+-------------------+-------------------+-----------------+-------------+-------------------+------------------+-------------------+-------------------+-----------------+-------------+--------------+-----------------+-----------------+-------------+------------+---------------+---------------+-----------+---------+----------+-------------------+-------------------+----------------+--------------+
|               date| Owners|discount|total_reviews|total_positive|total_negative|review_score|app_id|index|          prev_date|prev_total_reviews|prev_total_positive|prev_total_negative|prev_review_score|prev_discount|          next_date|next_total_reviews|next_total_positive|next_total_negative|next_review_score|next_discount|total_increase|positive_increase|negative_increase|days_increase|total_normal|positive_normal|negative_normal|days_normal|ra

In [106]:
# Total number of rows loaded.
df.count()

96339

In [107]:
# Rows left after discarding every row that contains a null; if this equals the
# full row count, the input has no missing values.
df.dropna().count()

96339

In [108]:
# List the column names available in the loaded frame.
df.columns

['date',
 'Owners',
 'discount',
 'total_reviews',
 'total_positive',
 'total_negative',
 'review_score',
 'app_id',
 'index',
 'prev_date',
 'prev_total_reviews',
 'prev_total_positive',
 'prev_total_negative',
 'prev_review_score',
 'prev_discount',
 'next_date',
 'next_total_reviews',
 'next_total_positive',
 'next_total_negative',
 'next_review_score',
 'next_discount',
 'total_increase',
 'positive_increase',
 'negative_increase',
 'days_increase',
 'total_normal',
 'positive_normal',
 'negative_normal',
 'days_normal',
 'raw_price',
 'sale_price',
 'total_increase_rate',
 'total_normal_rate',
 'sale_price_scale',
 'historical_low']

In [109]:
# (source column, output name) pairs, in output order. Columns whose two names
# match are carried over unchanged; the others are renamed on the way through.
feature_columns = [
    ('app_id', 'app_id'),
    ('index', 'index'),
    ('prev_date', 'date'),
    ('prev_total_reviews', 'popularity'),
    ('prev_review_score', 'review_score'),
    ('discount', 'discount'),
    ('historical_low', 'historical_low'),
    ('sale_price_scale', 'sale_price_scale'),
    ('days_increase', 'days'),
    ('total_increase_rate', 'sale_increase_rate'),
    ('total_normal_rate', 'normal_increase_rate'),
]
res = df.select([func.col(source).alias(target) for source, target in feature_columns])
res.show()

+------+-----+-------------------+----------+------------+--------+--------------+----------------+----+------------------+--------------------+
|app_id|index|               date|popularity|review_score|discount|historical_low|sale_price_scale|days|sale_increase_rate|normal_increase_rate|
+------+-----+-------------------+----------+------------+--------+--------------+----------------+----+------------------+--------------------+
|252450|    2|2015-05-14 00:00:00|       640|           6|      17|             1|               2|   4|               7.5|  3.2916666666666665|
|252450|    5|2015-06-17 00:00:00|       767|           5|      14|             1|               1|   2|               5.0|  3.6666666666666665|
|252450|    6|2015-06-19 00:00:00|       777|           5|     -17|             0|               2|   3|3.6666666666666665|  0.8888888888888888|
|252450|    8|2015-10-08 00:00:00|       884|           5|      50|             1|               1|   8|               1.5|  1.142

In [110]:
# How many rows carry a negative discount value.
res.where(res.discount < 0).count()

1753

In [111]:
# Inspect the rows whose discount is negative.
res.where(res.discount < 0).show()

+------+-----+-------------------+----------+------------+--------+--------------+----------------+----+------------------+--------------------+
|app_id|index|               date|popularity|review_score|discount|historical_low|sale_price_scale|days|sale_increase_rate|normal_increase_rate|
+------+-----+-------------------+----------+------------+--------+--------------+----------------+----+------------------+--------------------+
|252450|    6|2015-06-19 00:00:00|       777|           5|     -17|             0|               2|   3|3.6666666666666665|  0.8888888888888888|
|252450|  132|2021-02-13 00:00:00|      1910|           5|    -335|             0|               0|   1|               3.0|                 1.0|
|203160|    6|2015-06-22 00:00:00|     33165|           9|    -401|             0|               1|  30|37.266666666666666|                20.0|
|203160|  168|2021-07-08 00:00:00|    191329|           9|    -101|             0|               0|   1|             139.0|       

In [112]:
# Keep only rows with a strictly positive discount. The comparison is strict,
# so zero-discount rows are removed together with the negative ones.
res = res.filter(func.col('discount') > 0)
res.count()

94534

In [113]:
# Re-number the surviving rows 1..n within each app so the index is gap-free
# after filtering. The existing `index` column is the sort key, so the new
# numbering keeps the rows' previous relative order. The former sort key,
# monotonically_increasing_id(), depends on how the data is partitioned and is
# therefore not a reproducible ordering.
# NOTE(review): assumes the incoming `index` is unique within an app_id; ties
# would be numbered in arbitrary order — confirm against the upstream step.
res = res.withColumn(
    "index",
    func.row_number().over(Window.partitionBy("app_id").orderBy("index")),
)

In [114]:
# Preview the first ten rows of the re-indexed frame.
res.show(10)

+------+-----+-------------------+----------+------------+--------+--------------+----------------+----+------------------+--------------------+
|app_id|index|               date|popularity|review_score|discount|historical_low|sale_price_scale|days|sale_increase_rate|normal_increase_rate|
+------+-----+-------------------+----------+------------+--------+--------------+----------------+----+------------------+--------------------+
|   300|    1|2015-06-11 00:00:00|      4394|           8|      75|             1|               0|  11|10.454545454545455|  3.6025641025641026|
|   300|    2|2015-11-25 00:00:00|      5071|           8|      75|             0|               0|   6|               7.5|   4.857142857142857|
|   300|    3|2015-12-22 00:00:00|      5218|           8|      75|             0|               0|  13|13.307692307692308|               5.375|
|   300|    4|2016-02-05 00:00:00|      5563|           8|      75|             0|               0|   7| 8.714285714285714|   3.84

In [115]:
# effect_min = increase rate during the sale minus the normal increase rate.
sale_rate = func.col('sale_increase_rate')
normal_rate = func.col('normal_increase_rate')
res = res.withColumn('effect_min', sale_rate - normal_rate)
res.show(10)

+------+-----+-------------------+----------+------------+--------+--------------+----------------+----+------------------+--------------------+------------------+
|app_id|index|               date|popularity|review_score|discount|historical_low|sale_price_scale|days|sale_increase_rate|normal_increase_rate|        effect_min|
+------+-----+-------------------+----------+------------+--------+--------------+----------------+----+------------------+--------------------+------------------+
|   300|    1|2015-06-11 00:00:00|      4394|           8|      75|             1|               0|  11|10.454545454545455|  3.6025641025641026| 6.851981351981353|
|   300|    2|2015-11-25 00:00:00|      5071|           8|      75|             0|               0|   6|               7.5|   4.857142857142857|2.6428571428571432|
|   300|    3|2015-12-22 00:00:00|      5218|           8|      75|             0|               0|  13|13.307692307692308|               5.375| 7.932692307692308|
|   300|    4|20

In [116]:
# Turn the timestamp column into a plain calendar year: rename it first, then
# overwrite it in place so the column keeps its position in the frame.
res = (
    res.withColumnRenamed('date', 'year')
    .withColumn('year', func.year(func.col('year')))
)
res.show(20)

+------+-----+----+----------+------------+--------+--------------+----------------+----+------------------+--------------------+------------------+
|app_id|index|year|popularity|review_score|discount|historical_low|sale_price_scale|days|sale_increase_rate|normal_increase_rate|        effect_min|
+------+-----+----+----------+------------+--------+--------------+----------------+----+------------------+--------------------+------------------+
|   300|    1|2015|      4394|           8|      75|             1|               0|  11|10.454545454545455|  3.6025641025641026| 6.851981351981353|
|   300|    2|2015|      5071|           8|      75|             0|               0|   6|               7.5|   4.857142857142857|2.6428571428571432|
|   300|    3|2015|      5218|           8|      75|             0|               0|  13|13.307692307692308|               5.375| 7.932692307692308|
|   300|    4|2016|      5563|           8|      75|             0|               0|   7| 8.71428571428571

In [117]:
# effect_plus = increase rate during the sale plus the normal increase rate.
rate_sum = func.col('sale_increase_rate') + func.col('normal_increase_rate')
res = res.withColumn('effect_plus', rate_sum)
res.show(10)

+------+-----+----+----------+------------+--------+--------------+----------------+----+------------------+--------------------+------------------+------------------+
|app_id|index|year|popularity|review_score|discount|historical_low|sale_price_scale|days|sale_increase_rate|normal_increase_rate|        effect_min|       effect_plus|
+------+-----+----+----------+------------+--------+--------------+----------------+----+------------------+--------------------+------------------+------------------+
|   300|    1|2015|      4394|           8|      75|             1|               0|  11|10.454545454545455|  3.6025641025641026| 6.851981351981353|14.057109557109557|
|   300|    2|2015|      5071|           8|      75|             0|               0|   6|               7.5|   4.857142857142857|2.6428571428571432|12.357142857142858|
|   300|    3|2015|      5218|           8|      75|             0|               0|  13|13.307692307692308|               5.375| 7.932692307692308|18.682692307

In [118]:
# Per-app tag table: one row per app id with comma-separated category and
# genre id lists.
tags_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./tags_input/joint_category_genre.csv")
)
tags_df.show(5)

+------+--------------------+----------+
|    id|            category|     genre|
+------+--------------------+----------+
|578080|    1,49,36,15,41,42|1,25,37,29|
|   550|2,1,49,36,9,38,22...|         1|
|218620|2,1,9,38,22,28,29...|       1,3|
|  4000|2,1,49,36,47,9,38...|     23,28|
|   240|  1,27,22,23,8,15,16|         1|
+------+--------------------+----------+
only showing top 5 rows



In [119]:
# Convert both comma-separated id strings into arrays of id strings.
for list_column in ('category', 'genre'):
    tags_df = tags_df.withColumn(list_column, func.split(func.col(list_column), ','))
tags_df.show(5)

+------+--------------------+---------------+
|    id|            category|          genre|
+------+--------------------+---------------+
|578080|[1, 49, 36, 15, 4...|[1, 25, 37, 29]|
|   550|[2, 1, 49, 36, 9,...|            [1]|
|218620|[2, 1, 9, 38, 22,...|         [1, 3]|
|  4000|[2, 1, 49, 36, 47...|       [23, 28]|
|   240|[1, 27, 22, 23, 8...|            [1]|
+------+--------------------+---------------+
only showing top 5 rows



In [120]:
# Attach the tag arrays to every row of its app. The join is inner, so rows
# whose app has no entry in the tag table are dropped; the duplicate key
# column coming from the tag table is removed afterwards.
same_app = res.app_id == tags_df.id
joint_df = res.join(tags_df, on=same_app, how='inner').drop('id')
joint_df.show(5)

+------+-----+----+----------+------------+--------+--------------+----------------+----+------------------+--------------------+------------------+------------------+--------------------+-----+
|app_id|index|year|popularity|review_score|discount|historical_low|sale_price_scale|days|sale_increase_rate|normal_increase_rate|        effect_min|       effect_plus|            category|genre|
+------+-----+----+----------+------------+--------+--------------+----------------+----+------------------+--------------------+------------------+------------------+--------------------+-----+
|   300|    1|2015|      4394|           8|      75|             1|               0|  11|10.454545454545455|  3.6025641025641026| 6.851981351981353|14.057109557109557|[1, 27, 22, 29, 8...|  [1]|
|   300|    2|2015|      5071|           8|      75|             0|               0|   6|               7.5|   4.857142857142857|2.6428571428571432|12.357142857142858|[1, 27, 22, 29, 8...|  [1]|
|   300|    3|2015|      

In [121]:
# Row count after the inner join with the tag table.
joint_df.count()

94159

In [122]:
# Give every row a surrogate key. `uid` is used below to join this frame with
# pivot tables derived from it; Spark evaluates lazily and may recompute the
# plan for each side of such a join, so the key must come from a reproducible
# ordering. (app_id, index) identifies a row, whereas the former sort key,
# monotonically_increasing_id(), can differ between evaluations.
# NOTE(review): assumes each app_id occurs once in the tag table, so that
# (app_id, index) has no ties — confirm.
# A window without partitionBy processes all rows in a single partition.
joint_df = joint_df.withColumn(
    "uid",
    func.row_number().over(Window.orderBy("app_id", "index")),
)

In [123]:
# Long format: one (uid, genre_id) row per element of each row's genre array.
df1 = joint_df.select(
    'uid',
    func.explode(func.col('genre')).alias('genre_id'),
)

In [124]:
# One-hot encode: one column per distinct genre id, holding 1 where the row has
# that genre; the nulls left by the pivot for absent genres become 0.
df2 = (
    df1.groupBy('uid')
    .pivot('genre_id')
    .agg(func.lit(1))
    .na.fill(0)
)

In [125]:
import pandas as pd

# Known genre ids, read from the index file and kept as strings so they can be
# compared with the pivoted column names.
genre_index = pd.read_csv("./tags_input/genre_index.csv")
genre_id = [str(raw_id) for raw_id in genre_index['id'].to_list()]
print(genre_id)

['1', '25', '37', '29', '3', '23', '28', '2', '4', '51', '53', '55', '57', '70', '9', '18', '73', '74', '58', '71', '72', '54', '56', '60', '59']


In [126]:
# Hard-coded copy of the genre id list; it holds the same values, in the same
# order, as the list printed from genre_index.csv in the previous cell.
# NOTE(review): this overrides the list read from the file, so later changes to
# genre_index.csv would be ignored — confirm whether this cell is still needed.
genre_id = ['1', '25', '37', '29', '3', '23', '28', '2', '4', '51', '53', '55', '57', '70', '9', '18', '73', '74', '58', '71', '72', '54', '56', '60', '59']

In [127]:
# Pivot columns that correspond to a known genre id, in df2's column order,
# plus the 'gen_'-prefixed name each of them will be renamed to.
genre_col_id = [column for column in df2.columns if column in genre_id]
genre_col_name = ['gen_' + column for column in genre_col_id]
print(genre_col_id, genre_col_name, sep='\n')

['1', '18', '2', '23', '25', '28', '29', '3', '37', '4', '51', '53', '54', '56', '57', '58', '60', '70', '71', '72', '73', '74', '9']
['gen_1', 'gen_18', 'gen_2', 'gen_23', 'gen_25', 'gen_28', 'gen_29', 'gen_3', 'gen_37', 'gen_4', 'gen_51', 'gen_53', 'gen_54', 'gen_56', 'gen_57', 'gen_58', 'gen_60', 'gen_70', 'gen_71', 'gen_72', 'gen_73', 'gen_74', 'gen_9']


In [128]:
# Prefix the genre indicator columns so they cannot clash with the category
# indicator columns, which are named after ids from an overlapping range.
for old_name, new_name in zip(genre_col_id, genre_col_name):
    df2 = df2.withColumnRenamed(old_name, new_name)
df2.show(5)

+---+-----+------+-----+------+------+------+------+-----+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+-----+
|uid|gen_1|gen_18|gen_2|gen_23|gen_25|gen_28|gen_29|gen_3|gen_37|gen_4|gen_51|gen_53|gen_54|gen_56|gen_57|gen_58|gen_60|gen_70|gen_71|gen_72|gen_73|gen_74|gen_9|
+---+-----+------+-----+------+------+------+------+-----+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+-----+
|  1|    0|     0|    1|     1|     0|     0|     0|    0|     0|    0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|    0|
|  2|    0|     0|    1|     1|     0|     0|     0|    0|     0|    0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|    0|
|  3|    0|     0|    1|     1|     0|     0|     0|    0|     0|    0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|    0|
|  4|    0|     0|    1|    

In [129]:
# Attach the indicator columns in df2 to each row through the shared uid key.
# No join type is given, so this is an inner join: a uid missing from df2
# (e.g. a row whose array exploded to nothing) would be dropped here.
joint_df = joint_df.join(df2, on='uid')

In [130]:
# Spot-check: show the genre indicator columns for the rows of app 578080.
joint_df.where(joint_df.app_id == 578080).select(genre_col_name).show()

+-----+------+-----+------+------+------+------+-----+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+-----+
|gen_1|gen_18|gen_2|gen_23|gen_25|gen_28|gen_29|gen_3|gen_37|gen_4|gen_51|gen_53|gen_54|gen_56|gen_57|gen_58|gen_60|gen_70|gen_71|gen_72|gen_73|gen_74|gen_9|
+-----+------+-----+------+------+------+------+-----+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+-----+
|    1|     0|    0|     0|     1|     0|     1|    0|     1|    0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|    0|
|    1|     0|    0|     0|     1|     0|     1|    0|     1|    0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|    0|
|    1|     0|    0|     0|     1|     0|     1|    0|     1|    0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|     0|    0|
|    1|     0|    0|     0|     1|     0|     1|    

In [131]:
# Row count after joining the genre indicators; compare with the count before
# the join to see whether any rows were lost.
joint_df.count()

94159

In [132]:
# Same one-hot procedure as for genres, applied to the category array:
# explode to (uid, category_id) rows, then pivot into 0/1 indicator columns.
df1 = joint_df.select(
    'uid',
    func.explode(func.col('category')).alias('category_id'),
)
df2 = (
    df1.groupBy('uid')
    .pivot('category_id')
    .agg(func.lit(1))
    .na.fill(0)
)

# Known category ids from the index file, as strings for comparison with the
# pivoted column names.
category_index = pd.read_csv("./tags_input/category_index.csv")
cate_id = [str(raw_id) for raw_id in category_index['id'].to_list()]
print(cate_id)

['1', '49', '36', '15', '41', '42', '2', '9', '38', '22', '28', '29', '13', '30', '23', '8', '16', '14', '43', '44', '35', '47', '48', '27', '17', '18', '39', '24', '51', '20', '25', '37', '32', '31', '40']


In [135]:
# Hard-coded category id list; it contains the same ids as the list printed
# from category_index.csv above, in a different order.
# NOTE(review): the execution count of this cell is higher than that of the
# cells that follow it, i.e. it was run out of document order, and it overrides
# the list read from the file — confirm whether it is still needed.
cate_id = ['1', '13', '14', '15', '16', '17', '18', '2', '20', '22', '23', '24', '25', '27', '28', '29', '30', '31', '32', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '47', '48', '49', '51', '8', '9']

In [133]:
# Pivot columns that correspond to a known category id, in df2's column order,
# plus the 'cate_'-prefixed name each of them will be renamed to.
cate_col_id = [column for column in df2.columns if column in cate_id]
cate_col_name = ['cate_' + column for column in cate_col_id]
print(cate_col_id, cate_col_name, sep='\n')

['1', '13', '14', '15', '16', '17', '18', '2', '20', '22', '23', '24', '25', '27', '28', '29', '30', '31', '32', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '47', '48', '49', '51', '8', '9']
['cate_1', 'cate_13', 'cate_14', 'cate_15', 'cate_16', 'cate_17', 'cate_18', 'cate_2', 'cate_20', 'cate_22', 'cate_23', 'cate_24', 'cate_25', 'cate_27', 'cate_28', 'cate_29', 'cate_30', 'cate_31', 'cate_32', 'cate_35', 'cate_36', 'cate_37', 'cate_38', 'cate_39', 'cate_40', 'cate_41', 'cate_42', 'cate_43', 'cate_44', 'cate_47', 'cate_48', 'cate_49', 'cate_51', 'cate_8', 'cate_9']


In [134]:
# Number of category indicator columns that will be renamed.
len(cate_col_name)

35

In [136]:
# Give every category indicator column its 'cate_'-prefixed name.
for old_name, new_name in zip(cate_col_id, cate_col_name):
    df2 = df2.withColumnRenamed(old_name, new_name)
df2.show(5)

+---+------+-------+-------+-------+-------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+
|uid|cate_1|cate_13|cate_14|cate_15|cate_16|cate_17|cate_18|cate_2|cate_20|cate_22|cate_23|cate_24|cate_25|cate_27|cate_28|cate_29|cate_30|cate_31|cate_32|cate_35|cate_36|cate_37|cate_38|cate_39|cate_40|cate_41|cate_42|cate_43|cate_44|cate_47|cate_48|cate_49|cate_51|cate_8|cate_9|
+---+------+-------+-------+-------+-------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+
|  1|     0|      0|      0|      0|      0|      0|      0|     1|      0|      1|      0|      0|      0|      0|      0|      1|      0|      0|      0

In [137]:
# Attach the indicator columns in df2 to each row through the shared uid key.
# No join type is given, so this is an inner join: a uid missing from df2
# would be dropped here.
joint_df = joint_df.join(df2, on='uid')

In [141]:
# Spot-check: show the category indicator columns for the rows of app 578080.
joint_df.where(joint_df.app_id == 578080).select(cate_col_name).show()

+------+-------+-------+-------+-------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+
|cate_1|cate_13|cate_14|cate_15|cate_16|cate_17|cate_18|cate_2|cate_20|cate_22|cate_23|cate_24|cate_25|cate_27|cate_28|cate_29|cate_30|cate_31|cate_32|cate_35|cate_36|cate_37|cate_38|cate_39|cate_40|cate_41|cate_42|cate_43|cate_44|cate_47|cate_48|cate_49|cate_51|cate_8|cate_9|
+------+-------+-------+-------+-------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+
|     1|      0|      0|      1|      0|      0|      0|     0|      0|      0|      0|      0|      0|      0|      0|      0|      0|      0|      0|      0|      1

In [142]:
# Final column list of the joined frame, before the array columns are dropped for export.
joint_df.columns

['uid',
 'app_id',
 'index',
 'year',
 'popularity',
 'review_score',
 'discount',
 'historical_low',
 'sale_price_scale',
 'days',
 'sale_increase_rate',
 'normal_increase_rate',
 'effect_min',
 'effect_plus',
 'category',
 'genre',
 'gen_1',
 'gen_18',
 'gen_2',
 'gen_23',
 'gen_25',
 'gen_28',
 'gen_29',
 'gen_3',
 'gen_37',
 'gen_4',
 'gen_51',
 'gen_53',
 'gen_54',
 'gen_56',
 'gen_57',
 'gen_58',
 'gen_60',
 'gen_70',
 'gen_71',
 'gen_72',
 'gen_73',
 'gen_74',
 'gen_9',
 'cate_1',
 'cate_13',
 'cate_14',
 'cate_15',
 'cate_16',
 'cate_17',
 'cate_18',
 'cate_2',
 'cate_20',
 'cate_22',
 'cate_23',
 'cate_24',
 'cate_25',
 'cate_27',
 'cate_28',
 'cate_29',
 'cate_30',
 'cate_31',
 'cate_32',
 'cate_35',
 'cate_36',
 'cate_37',
 'cate_38',
 'cate_39',
 'cate_40',
 'cate_41',
 'cate_42',
 'cate_43',
 'cate_44',
 'cate_47',
 'cate_48',
 'cate_49',
 'cate_51',
 'cate_8',
 'cate_9']

In [144]:
# Export the feature table as CSV with a header row. The raw array columns are
# dropped before writing, and coalesce(1) collapses the data to a single
# partition first.
# NOTE(review): no save mode is set, so a re-run will presumably fail if
# ./analysis_input/ already exists — confirm the intended behaviour.
(
    joint_df.drop('category', 'genre')
    .coalesce(1)
    .write
    .option('header', 'True')
    .option('delimiter', ',')
    .csv("./analysis_input/")
)

In [145]:
# Stop the Spark context now that the output has been written.
sc.stop()