In [1]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType,StructField, StringType, LongType, DateType

import os

In [2]:
spark = SparkSession.builder.config("spark.driver.memory", "8g").getOrCreate()

In [3]:
path = r'C:\Users\ADMIN\PycharmProjects\Bigdata-1\Data\Dataset\Clean_data'
list_file = os.listdir(path)

In [5]:
emptyRDD = spark.sparkContext.emptyRDD()

schema = StructType([
    StructField('Contract', StringType(), True), 
    StructField('Type', StringType(), False), 
    StructField('Giải Trí', LongType(), True), 
    StructField('Phim Truyện', LongType(), True), 
    StructField('Thiếu Nhi', LongType(), True), 
    StructField('Thể Thao', LongType(), True), 
    StructField('Truyền Hình', LongType(), True), 
    StructField('Date', DateType(), False)])

monthly_df = spark.createDataFrame(emptyRDD,schema)
monthly_df.printSchema()

root
 |-- Contract: string (nullable = true)
 |-- Type: string (nullable = false)
 |-- Giải Trí: long (nullable = true)
 |-- Phim Truyện: long (nullable = true)
 |-- Thiếu Nhi: long (nullable = true)
 |-- Thể Thao: long (nullable = true)
 |-- Truyền Hình: long (nullable = true)
 |-- Date: date (nullable = false)



In [6]:
for i in range(len(list_file)):
    df = spark.read.csv(path + '\\' +  list_file[i], header= True)
    monthly_df = monthly_df.union(df)

In [7]:
monthly_df.show(10)

+---------+-----------+--------+-----------+---------+--------+-----------+----------+
| Contract|       Type|Giải Trí|Phim Truyện|Thiếu Nhi|Thể Thao|Truyền Hình|      Date|
+---------+-----------+--------+-----------+---------+--------+-----------+----------+
|DNH014998|Phim Truyện|    null|     3365.0|     null|    null|       null|2022-04-01|
|HND486882|Phim Truyện|    null|     5545.0|     null|    null|       null|2022-04-01|
|HUFD07189|Truyền Hình|    null|       null|     null|    null|     2264.0|2022-04-01|
|HDFD36288|Truyền Hình|    null|       null|     null|    null|    11904.0|2022-04-01|
|CTFD04401|Truyền Hình|    null|       null|     null|    null|    55881.0|2022-04-01|
|HNH954607|Phim Truyện|    null|    13115.0|     null|    null|       null|2022-04-01|
|HNH855959|Truyền Hình|    null|       null|     null|    null|      327.0|2022-04-01|
|SGH034683|Truyền Hình|    null|       null|     null|    null|    82195.0|2022-04-01|
|NTFD35330|Truyền Hình|    null|       null

In [12]:
month_summary = monthly_df.groupBy('Contract').agg(sum('Giải Trí').alias('Giai_Tri'),
                                   sum('Phim Truyện').alias('Phim_Truyen'),
                                   sum('Thiếu Nhi').alias('Thieu_Nhi'),
                                   sum('Thể Thao').alias('The_Thao'),
                                   sum('Truyền Hình').alias('Truyen_Hinh'),
                                   max('Date').alias('Recent'),
                                   countDistinct('Date').alias('Frequency'))

In [14]:
month_summary.count()

1920545

In [10]:
month_summary.show(10)

+--------------+--------+-----------+---------+--------+-----------+----------+---------+
|      Contract|Giai_Tri|Phim_Truyen|Thieu_Nhi|The_Thao|Truyen_Hinh|    Recent|Frequency|
+--------------+--------+-----------+---------+--------+-----------+----------+---------+
|113.182.209.48|    89.0|       null|     null|    null|       63.0|2022-04-01|        1|
|14.182.110.125|    92.0|       null|     null|    null|      404.0|2022-04-10|        1|
|     AGAAA0338|    null|       null|     null|    null|   278633.0|2022-04-30|       30|
|     AGAAA0342|   204.0|       null|     null|    null|   117788.0|2022-04-30|       12|
|     AGAAA0346|    null|       null|     null|    null|  2056249.0|2022-04-30|       30|
|     AGAAA0353|    null|     1665.0|     null|    null|    25982.0|2022-04-30|       29|
|     AGAAA0372|    null|       null|     null|    null|    13123.0|2022-04-30|       27|
|     AGAAA0391|   373.0|      129.0|     null|    null|   158931.0|2022-04-30|       11|
|     AGAA

In [11]:
month_summary.printSchema()

root
 |-- Contract: string (nullable = true)
 |-- Giai_Tri: double (nullable = true)
 |-- Phim_Truyen: double (nullable = true)
 |-- Thieu_Nhi: double (nullable = true)
 |-- The_Thao: double (nullable = true)
 |-- Truyen_Hinh: double (nullable = true)
 |-- Recent: string (nullable = true)
 |-- Frequency: long (nullable = false)



In [15]:
month_summary = month_summary.withColumn('Recent', to_date(col('Recent'), 'yyyy-MM-dd'))

In [16]:
month_summary = month_summary.withColumn('Diff_Day', datediff(to_date(lit('2022-05-01')), month_summary['Recent']))

In [17]:
month_summary = month_summary.withColumn('Recent_Score', when(col('Diff_Day') <= 3, 4)
                                        .when((col('Diff_Day') >= 4) & (col('Diff_Day') <= 7), 3)
                                        .when((col('Diff_Day') >= 7) & (col('Diff_Day') <= 14), 2)
                                        .otherwise(4))

In [18]:
month_summary = month_summary.withColumn('Frequency_Score', when(col('Frequency') <= 4, 1)
                                        .when((col('Frequency') >= 5) & (col('Frequency') <= 12), 2)
                                        .when((col('Frequency') >= 12) & (col('Frequency') <= 20), 3)
                                        .otherwise(4))

In [19]:
month_summary = month_summary.withColumn('RF_Score', concat_ws("", month_summary['Recent_Score'], month_summary['Frequency_Score']))

In [21]:
customer_segment = month_summary.groupBy('RF_Score').count()

In [22]:
customer_segment.repartition(1).write.csv(r"C:\\Users\\ADMIN\\PycharmProjects\\Bigdata-1\\Data\Dataset\\Customer_segment_data\\", header=True)

In [49]:
from pyspark.sql.window import Window
customer_segment = customer_segment.withColumn('total', sum('count').over(Window.partitionBy())).withColumn('percent', (col('count') / col('total')) * 100).orderBy('percent', ascending= False)

In [50]:
customer_segment.show(20)

+--------+-------+-------+-------------------+
|RF_Score|  count|  total|            percent|
+--------+-------+-------+-------------------+
|      44|1171794|1920545|   61.0136185301568|
|      43| 204675|1920545| 10.657131178910154|
|      42| 162209|1920545|  8.445987987784717|
|      41| 142813|1920545|  7.436066324923394|
|      32|  55286|1920545| 2.8786620464503567|
|      21|  53195|1920545| 2.7697867011707613|
|      31|  41733|1920545| 2.1729769414411013|
|      22|  33113|1920545| 1.7241460106376054|
|      33|  28436|1920545| 1.4806213861169615|
|      34|  14218|1920545| 0.7403106930584807|
|      23|  11167|1920545| 0.5814495364597028|
|      24|   1906|1920545|0.09924266288996093|
+--------+-------+-------+-------------------+



In [8]:
df = monthly_df.groupBy('Contract').agg(sum('Giải Trí').alias('Giai_Tri'),
                                   sum('Phim Truyện').alias('Phim_Truyen'),
                                   sum('Thiếu Nhi').alias('Thieu_Nhi'),
                                   sum('Thể Thao').alias('The_Thao'),
                                   sum('Truyền Hình').alias('Truyen_Hinh'))

In [9]:
df = df.na.fill(0)

In [10]:
df.show(10)

+---------+--------+-----------+---------+--------+-----------+
| Contract|Giai_Tri|Phim_Truyen|Thieu_Nhi|The_Thao|Truyen_Hinh|
+---------+--------+-----------+---------+--------+-----------+
|HTFD11598|     0.0|    15551.0|      0.0|     0.0|    42919.0|
|HPFD48556|    69.0|        0.0|      0.0|     0.0|  1468328.0|
|NBFD10014|     0.0|        0.0|      0.0|     0.0|  1596494.0|
|HNH619088|     0.0|    77275.0|  11361.0|     0.0|   917930.0|
|HNH036174|     0.0|    62674.0|      0.0|     0.0|   354879.0|
|DNH067877|     0.0|        0.0|      0.0|     0.0|   181308.0|
|SGH806190|     0.0|        0.0|      0.0|     0.0|   217779.0|
|HNH582022|     0.0|        0.0|      0.0|     0.0|  2209949.0|
|HNH795510|     0.0|    30197.0|    265.0|     0.0|  1196936.0|
|DNFD91557|     0.0|        0.0|      0.0|     0.0|    95567.0|
+---------+--------+-----------+---------+--------+-----------+
only showing top 10 rows



In [11]:
save_path = r"C:\\Users\\ADMIN\\PycharmProjects\\Bigdata-1\\Data\Dataset\\Clean_data_1\\"
df.repartition(1).write.csv(save_path, header=True)

In [9]:
from pyspark.ml.feature import VectorAssembler

assemble=VectorAssembler(inputCols=['Giai_Tri', 'Phim_Truyen', 'Thieu_Nhi', 'The_Thao', 'Truyen_Hinh'], outputCol='features')

In [10]:
assembled_data = assemble.transform(df)

In [69]:
assembled_data.show(10)

+---------+--------+-----------+---------+--------+-----------+--------------------+
| Contract|Giai_Tri|Phim_Truyen|Thieu_Nhi|The_Thao|Truyen_Hinh|            features|
+---------+--------+-----------+---------+--------+-----------+--------------------+
|HTFD11598|     0.0|    15551.0|      0.0|     0.0|    42919.0|(5,[1,4],[15551.0...|
|HPFD48556|    69.0|        0.0|      0.0|     0.0|  1468328.0|(5,[0,4],[69.0,14...|
|NBFD10014|     0.0|        0.0|      0.0|     0.0|  1596494.0| (5,[4],[1596494.0])|
|HNH619088|     0.0|    77275.0|  11361.0|     0.0|   917930.0|[0.0,77275.0,1136...|
|HNH036174|     0.0|    62674.0|      0.0|     0.0|   354879.0|(5,[1,4],[62674.0...|
|DNH067877|     0.0|        0.0|      0.0|     0.0|   181308.0|  (5,[4],[181308.0])|
|SGH806190|     0.0|        0.0|      0.0|     0.0|   217779.0|  (5,[4],[217779.0])|
|HNH582022|     0.0|        0.0|      0.0|     0.0|  2209949.0| (5,[4],[2209949.0])|
|HNH795510|     0.0|    30197.0|    265.0|     0.0|  1196936.0|[0

In [11]:
from pyspark.ml.feature import StandardScaler
scale = StandardScaler(inputCol='features', outputCol='standardized')
data_scale = scale.fit(assembled_data)
data_scale_output = data_scale.transform(assembled_data)

In [71]:
data_scale_output.show(10)

+---------+--------+-----------+---------+--------+-----------+--------------------+--------------------+
| Contract|Giai_Tri|Phim_Truyen|Thieu_Nhi|The_Thao|Truyen_Hinh|            features|        standardized|
+---------+--------+-----------+---------+--------+-----------+--------------------+--------------------+
|HTFD11598|     0.0|    15551.0|      0.0|     0.0|    42919.0|(5,[1,4],[15551.0...|(5,[1,4],[0.01469...|
|HPFD48556|    69.0|        0.0|      0.0|     0.0|  1468328.0|(5,[0,4],[69.0,14...|(5,[0,4],[0.00809...|
|NBFD10014|     0.0|        0.0|      0.0|     0.0|  1596494.0| (5,[4],[1596494.0])|(5,[4],[3.1597689...|
|HNH619088|     0.0|    77275.0|  11361.0|     0.0|   917930.0|[0.0,77275.0,1136...|[0.0,0.0730445752...|
|HNH036174|     0.0|    62674.0|      0.0|     0.0|   354879.0|(5,[1,4],[62674.0...|(5,[1,4],[0.05924...|
|DNH067877|     0.0|        0.0|      0.0|     0.0|   181308.0|  (5,[4],[181308.0])|(5,[4],[0.3588434...|
|SGH806190|     0.0|        0.0|      0.0|    

In [73]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
silhouette_score=[]
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='standardized', metricName='silhouette', distanceMeasure='squaredEuclidean')

for i in range(2,10):    
    KMeans_algo = KMeans(featuresCol='standardized', k=i)
    KMeans_fit = KMeans_algo.fit(data_scale_output)
    output = KMeans_fit.transform(data_scale_output)
    score = evaluator.evaluate(output)
    silhouette_score.append(score)
    print("Silhouette Score:",score)

Silhouette Score: 0.9999904478750545
