In [3]:
from pyspark.sql import SparkSession
import os

In [4]:
# 현재 작업 디렉토리를 가져옴
current_directory = os.getcwd()

# 현재 경로 출력
print("현재 경로:", current_directory)

현재 경로: /work


In [5]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:3.3.4" pyspark-shell'

In [6]:
spark = SparkSession.builder \
    .appName("analysis") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4") \
    .getOrCreate()

/spark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-410ededd-1ab8-4264-9708-45fe97b0e29c;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 5203ms :: artifacts dl 503ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	--------------------------------------------

In [7]:
# comments_csv 디렉토리 경로
directory = "comments_csv"

# 디렉토리 내 모든 파일 경로를 가져옴
files = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith(".csv")]
print("파일 개수 : " , len(files))

In [23]:
# CSV 파일들을 읽어서 DataFrame으로 로드
df = spark.read.csv(files, header=True, inferSchema=True)

                                                                                

+------------------------------------+----------+------+-------+--------+-------------+----------+---------+-------------------+--------+------------+-------------------+
|                             comment|comment_id|epi_no|is_best|login_id|     nickname|recomm_cnt|reply_cnt|          save_date|title_id|unrecomm_cnt|         write_date|
+------------------------------------+----------+------+-------+--------+-------------+----------+---------+-------------------+--------+------------+-------------------+
|      넌 아직 건너오면 안된다... ...| 327625429|   335|   true|asd5****|케이지 사랑해|     12790|        2|1.686477496450047E9|  616239|          50|2017-07-10 14:17:32|
|  그림이랑 개 똑같이 생겼엌ㅋㅋㅋ...| 327623979|   335|   true|urmy****|     오오오오|      8858|        0|1.686477500056887E9|  616239|          55|2017-07-10 14:16:01|
|                 사지말고 입양하세요| 327626850|   335|   true|ghn0****|         스디|      7839|        0|1.686477502453198E9|  616239|         126|2017-07-10 14:19:05|
|  여러분 이번화 보다가 생각났는

In [24]:
# DataFrame 확인
df.show()

+------------------------------------+----------+------+-------+--------+-------------+----------+---------+-------------------+--------+------------+-------------------+
|                             comment|comment_id|epi_no|is_best|login_id|     nickname|recomm_cnt|reply_cnt|          save_date|title_id|unrecomm_cnt|         write_date|
+------------------------------------+----------+------+-------+--------+-------------+----------+---------+-------------------+--------+------------+-------------------+
|      넌 아직 건너오면 안된다... ...| 327625429|   335|   true|asd5****|케이지 사랑해|     12790|        2|1.686477496450047E9|  616239|          50|2017-07-10 14:17:32|
|  그림이랑 개 똑같이 생겼엌ㅋㅋㅋ...| 327623979|   335|   true|urmy****|     오오오오|      8858|        0|1.686477500056887E9|  616239|          55|2017-07-10 14:16:01|
|                 사지말고 입양하세요| 327626850|   335|   true|ghn0****|         스디|      7839|        0|1.686477502453198E9|  616239|         126|2017-07-10 14:19:05|
|  여러분 이번화 보다가 생각났는

In [25]:
df.printSchema()

root
 |-- comment: string (nullable = true)
 |-- comment_id: integer (nullable = true)
 |-- epi_no: integer (nullable = true)
 |-- is_best: boolean (nullable = true)
 |-- login_id: string (nullable = true)
 |-- nickname: string (nullable = true)
 |-- recomm_cnt: integer (nullable = true)
 |-- reply_cnt: string (nullable = true)
 |-- save_date: double (nullable = true)
 |-- title_id: integer (nullable = true)
 |-- unrecomm_cnt: integer (nullable = true)
 |-- write_date: timestamp (nullable = true)



In [26]:
row_count = df.count()
print("총 row 개수:", row_count)



총 row 개수: 563197


                                                                                

## 데이터 품질 검사

### 결측치 검사

In [45]:
from pyspark.sql.functions import isnan, when, count, col, isnull

df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).show()



+-------+----------+------+-------+--------+--------+----------+---------+---------+--------+------------+----------+
|comment|comment_id|epi_no|is_best|login_id|nickname|recomm_cnt|reply_cnt|save_date|title_id|unrecomm_cnt|write_date|
+-------+----------+------+-------+--------+--------+----------+---------+---------+--------+------------+----------+
|      0|         0|     0|      0|       0|       0|         0|        0|        0|       0|           0|         0|
+-------+----------+------+-------+--------+--------+----------+---------+---------+--------+------------+----------+



                                                                                

# 1. ✅ 분석: 답글(unrecomm_cnt) 수

## 데이터 검사

In [27]:
from pyspark.sql.functions import desc

result = df.groupBy('reply_cnt').count().orderBy(desc('reply_cnt'))

In [28]:
result.show()



+---------+-----+
|reply_cnt|count|
+---------+-----+
|     999+|   10|
|      997|    1|
|       99|   25|
|       98|   45|
|       97|   48|
|       96|   71|
|       95|   67|
|       94|   67|
|      935|    1|
|       93|   62|
|       92|   61|
|      915|    1|
|       91|   82|
|       90|   67|
|        9|10874|
|       89|   66|
|      882|    1|
|       88|   83|
|      878|    1|
|       87|   78|
+---------+-----+
only showing top 20 rows



                                                                                

##### 문제 : 999+가 있다
==> 평균, 중앙값이 의미가 없어짐

#### 답글 수가 '999+' row 모두 (10개 출력하기)

In [35]:
# 'reply_cntr'가 '999+'인 row만 필터링
filtered_result = df.filter(df.reply_cnt == '999+')

# 결과 출력
filtered_result.show()



+----------------------------------+----------+------+-------+--------+-------------+----------+---------+-------------------+--------+------------+-------------------+
|                           comment|comment_id|epi_no|is_best|login_id|     nickname|recomm_cnt|reply_cnt|          save_date|title_id|unrecomm_cnt|         write_date|
+----------------------------------+----------+------+-------+--------+-------------+----------+---------+-------------------+--------+------------+-------------------+
| 베댓되면 재입대함 ㅋㅋㅋ 차피 ...| 431614938|   391|   true|ddol****|         똘이|     30492|     999+|1.686220511149621E9|  570503|        6354|2022-01-12 14:01:39|
|     여기까지 지르신분 손! (1/1...| 435204933|   384|   true|sihu****|          sih|     16073|     999+|1.686333772584826E9|  602910|         135|2022-03-13 13:18:10|
|  찬양파는 시끄럽게 손을들어주세요| 428963723|   298|   true|alfk****|     alfk****|     84434|     999+|1.686592488559024E9|  654774|         149|2021-11-28 13:59:00|
|정보) 몇년도 9월이라 하지는 않았다| 41

                                                                                

 #### 사전 작업

In [18]:
# from pyspark.sql.functions import col, avg, expr, desc
     
# df = df.withColumn("reply_cnt", col("reply_cnt").cast("integer"))

## 1. 답글 수의 평균/중간값 => 삭제

### 1) 전체 답글 수의 평균/중간값

#### 평균

In [115]:
# avg_reply_cnt = df.select(
#     avg("reply_cnt").alias("avg_reply_cnt")
# ).selectExpr(
#     "avg_reply_cnt"
# )

In [116]:
# avg_reply_cnt.show()

#### 중간값

In [117]:
# from pyspark.sql.functions import expr

# # 'reply_cnt'을 기준으로 데이터프레임을 정렬
# sorted_df = df.orderBy("reply_cnt")

# # 중간 위치의 값을 계산
# n = sorted_df.count()
# middle_index = (n - 1) // 2

# # 중간 위치의 값을 가져옴
# median_reply_cnt = sorted_df.select("reply_cnt").limit(1).offset(middle_index).first()[0]

# # 결과 출력
# print("중간값:", median_reply_cnt)

###  2) 답글 수의 평균이 높은 웹툰 Top10

In [118]:
# top10_reply_by_toon= df.groupBy("title_id").agg(
#                             avg("reply_cnt").alias("avg_reply_cnt")
#                         ).orderBy(desc("avg_reply_cnt")) \
#                         .limit(10)
     

In [119]:
# top10_reply_by_toon.show()

# 2. ✅ 분석: '댓글'의 좋아요/싫어요 수

## 데이터 검사

> 댓글 좋아요/싫어요 수에 따른 집계와 카운트

In [46]:
from pyspark.sql.functions import desc

recomm_df = df.groupBy('recomm_cnt').count().orderBy(desc('recomm_cnt'))
unrecomm_df = df.groupBy('unrecomm_cnt').count().orderBy(desc('unrecomm_cnt'))

In [47]:
recomm_df.show()



+----------+-----+
|recomm_cnt|count|
+----------+-----+
|    359592|    1|
|    355467|    1|
|    266166|    1|
|    255071|    1|
|    251580|    1|
|    248670|    1|
|    246699|    1|
|    246316|    1|
|    243911|    1|
|    234963|    1|
|    234250|    1|
|    228812|    1|
|    224747|    1|
|    223687|    1|
|    223413|    1|
|    221281|    1|
|    219820|    1|
|    216406|    1|
|    215100|    1|
|    214994|    1|
+----------+-----+
only showing top 20 rows



[Stage 56:>                                                         (0 + 1) / 1]                                                                                

In [48]:
unrecomm_df.show()



+------------+-----+
|unrecomm_cnt|count|
+------------+-----+
|       28473|    1|
|       26629|    1|
|       26368|    1|
|       25472|    1|
|       23427|    1|
|       22134|    1|
|       21094|    1|
|       20904|    1|
|       20549|    1|
|       19620|    1|
|       18914|    1|
|       18719|    1|
|       18510|    1|
|       18411|    1|
|       18260|    1|
|       17921|    1|
|       17768|    1|
|       17688|    1|
|       17528|    1|
|       17474|    1|
+------------+-----+
only showing top 20 rows



                                                                                

### 사전 작업 (cast)

 > 모든 칼럼이 integer 형인지 확인하고 cast연산 진행함

In [49]:
from pyspark.sql.functions import col

# 'recomm_cnt' 칼럼이 모두 integer인지 확인
all_integer_recomm = df.select(col("recomm_cnt").cast("string").rlike("^[0-9]+$").alias("is_integer")).agg({"is_integer": "min"}).collect()[0][0]

# 결과 출력
if all_integer_recomm:
    print("recomm_cnt 칼럼은 모두 integer입니다.")
else:
    print("recomm_cnt 칼럼에 integer가 아닌 값이 포함되어 있습니다.")



recomm_cnt 칼럼은 모두 integer입니다.


                                                                                

In [50]:
from pyspark.sql.functions import col

# 'unrecomm_cnt' 칼럼이 모두 integer인지 확인
all_integer_unrecomm = df.select(col("unrecomm_cnt").cast("string").rlike("^[0-9]+$").alias("is_integer")).agg({"is_integer": "min"}).collect()[0][0]

# 결과 출력
if all_integer_unrecomm:
    print("unrecomm_cnt 칼럼은 모두 integer입니다.")
else:
    print("unrecomm_cnt 칼럼에 integer가 아닌 값이 포함되어 있습니다.")



unrecomm_cnt 칼럼은 모두 integer입니다.


                                                                                

 > cast 연산 진행 

In [51]:
df = df.withColumn("recomm_cnt", col("recomm_cnt").cast("integer"))
df = df.withColumn("unrecomm_cnt", col("unrecomm_cnt").cast("integer"))

## 1) 좋아요/싫어요 수 Top 10

In [52]:
top_10_recomm = df.orderBy(col("recomm_cnt").desc()).limit(10)
top_10_unrecomm = df.orderBy(col("unrecomm_cnt").desc()).limit(10)

In [53]:
top_10_recomm.show()

                                                                                

+----------------------------------+----------+------+-------+--------+--------+----------+---------+-------------------+--------+------------+-------------------+
|                           comment|comment_id|epi_no|is_best|login_id|nickname|recomm_cnt|reply_cnt|          save_date|title_id|unrecomm_cnt|         write_date|
+----------------------------------+----------+------+-------+--------+--------+----------+---------+-------------------+--------+------------+-------------------+
|                           귀여워♡| 233835400|    62|   true|jayo****|  아리엘|    359592|       24|1.684985739476716E9|  641253|        8440|2016-01-21 14:19:43|
|       좋아요 2번 누르면 모양 바뀜| 338536170|   204|   true|bowl****|  김승언|    355467|       56|1.686201781727034E9|  570503|       25472|2017-12-27 14:28:17|
|                모두 외쳐라 경우갓| 323789708|   172|   true|qkra****|      AA|    266166|      158|1.686198701711273E9|  570503|        1476|2017-05-17 14:15:37|
|재열이 이거 게이끼 슬슬 의심해야댐| 223508780|    47|   tr

In [54]:
top_10_unrecomm.show()



+----------------------------------+----------+------+-------+--------+-----------+----------+---------+-------------------+--------+------------+-------------------+
|                           comment|comment_id|epi_no|is_best|login_id|   nickname|recomm_cnt|reply_cnt|          save_date|title_id|unrecomm_cnt|         write_date|
+----------------------------------+----------+------+-------+--------+-----------+----------+---------+-------------------+--------+------------+-------------------+
| 요새 박태준 작가님 만화 볼때면...| 329351085|   142|   true|dare****|   dare****|    138013|      144|1.684993771860395E9|  641253|       28473|2017-08-03 14:12:35|
| 경우 빨리 헤어지는거 찬성하는 ...| 217351120|    97|   true|namy****|게으른 연수|    221281|      138|1.686191405600801E9|  570503|       26629|2015-08-12 14:25:41|
|찍먹파들은 조용히 좋아요을 찍도록.| 209661007|    87|   true|syup****|   syup****|    246699|       70|1.686190439263785E9|  570503|       26368|2015-06-03 14:28:56|
|       좋아요 2번 누르면 모양 바뀜| 338536170|   204| 

                                                                                

## 1) 좋아요/싫어요 수 평균

In [56]:
avg_recomm_cnt = df.select(
    avg("recomm_cnt").alias("avg_recomm_cnt")
).selectExpr(
    "avg_recomm_cnt"
)

avg_unrecomm_cnt = df.select(
    avg("unrecomm_cnt").alias("avg_unrecomm_cnt")
).selectExpr(
    "avg_unrecomm_cnt"
)

In [57]:
avg_recomm_cnt.show()



+-----------------+
|   avg_recomm_cnt|
+-----------------+
|4281.791833053088|
+-----------------+



                                                                                

In [58]:
avg_unrecomm_cnt.show()



+-----------------+
| avg_unrecomm_cnt|
+-----------------+
|73.53287570778964|
+-----------------+



                                                                                

## 2) 좋아요/싫어요 수의 평균이 높은 웹툰 Top10

In [59]:
top10_recomm_by_toon= df.groupBy("title_id").agg(
                            avg("recomm_cnt").alias("avg_recomm_cnt")
                        ).orderBy(desc("avg_recomm_cnt")) \
                        .limit(10)

top10_unrecomm_by_toon= df.groupBy("title_id").agg(
                            avg("unrecomm_cnt").alias("avg_unrecomm_cnt")
                        ).orderBy(desc("avg_unrecomm_cnt")) \
                        .limit(10)

In [60]:
top10_recomm_by_toon.show()



+--------+------------------+
|title_id|    avg_recomm_cnt|
+--------+------------------+
|  570503|32024.725303030304|
|  641253|23554.643445692884|
|  654774|19117.803224624404|
|  703846| 17314.54865900383|
|  738487| 16173.73950617284|
|  735661| 15402.57932446264|
|  597447|14628.861843932242|
|  183559| 13753.60058685446|
|  761722|13734.302970297029|
|  796152|13533.060453400503|
+--------+------------------+



                                                                                

In [61]:
top10_unrecomm_by_toon.show()



+--------+------------------+
|title_id|  avg_unrecomm_cnt|
+--------+------------------+
|  570503| 853.3406060606061|
|  641253| 727.1459176029963|
|  703846| 524.3412515964241|
|  597447|  458.673701749514|
|  183559| 395.6681924882629|
|  648419|304.67463203463205|
|  654774|246.05844631733237|
|  602910|229.19415316174135|
|  736277|203.65136612021857|
|  552960| 179.0732075471698|
+--------+------------------+



                                                                                

# ️3. ✅ 분석: 댓글의 길이

## 0. 전체 댓글의 길이의 평균

In [62]:
from pyspark.sql.functions import avg, length

avg_comment_length = df.select(avg(length("comment")).alias("avg_comment_length"))

In [63]:
avg_comment_length.show()



+------------------+
|avg_comment_length|
+------------------+
|58.274910910391924|
+------------------+



                                                                                

## 1. 가장 짧은 댓글 or 가장 긴 댓글 

In [68]:
desc_comm_len = df.orderBy(length(col("comment")).desc()).limit(10)
asc_comm_len = df.orderBy(length(col("comment")).asc()).limit(10)

In [71]:
desc_comm_len.show(truncate=False)



+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [72]:
asc_comm_len.show(truncate=False)



+-------+----------+------+-------+--------+----------+----------+---------+-------------------+--------+------------+-------------------+
|comment|comment_id|epi_no|is_best|login_id|nickname  |recomm_cnt|reply_cnt|save_date          |title_id|unrecomm_cnt|write_date         |
+-------+----------+------+-------+--------+----------+----------+---------+-------------------+--------+------------+-------------------+
|?      |236629978 |5     |true   |hong****|BEST홍삼  |1248      |1        |1.68304517958E9    |671421  |113         |2016-02-16 22:23:58|
|ㅋ     |452744535 |15    |true   |kwc0****|kwc       |50        |2        |1.684080182192587E9|805691  |4           |2023-04-01 13:57:07|
|네     |451569706 |13    |true   |lee1****|노래쟁이  |6         |0        |1.686383389653226E9|804989  |0           |2023-03-01 22:06:03|
|식     |443573437 |2     |true   |sg24****|온리원스타|70        |10       |1.685298620191948E9|799250  |12          |2022-08-29 13:59:34|
|ㅠ     |449402642 |3     |true   |lee1***

                                                                                

# 4. ✅ 분석: 댓글 상세 분석 (전체 베댓 기준)

## 사전 확인

### ㅎ 혹은 ㅋ 사용하는 댓글 뽑아보기

In [73]:
filtered_h= df.filter(col("comment").contains('ㅎ'))

In [75]:
filtered_h.show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+------+-------+--------+-----------+----------+---------+-------------------+--------+-----

In [77]:
filtered_k= df.filter(col("comment").contains('ㅋ'))

In [78]:
filtered_k.show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+------+-------+--------+------------+----------+---------+-------------------+--------+----

In [85]:
# test: ㅎ 가 없는 댓글 추출해보기 => ㅎ 없는 것만 나옴. 필터링 조건 잘 맞게 건거 확인함 
filtered_no_h = df.filter(~col("comment").contains('ㅎ'))

# 결과 출력
filtered_no_h.show()


+------------------------------------+----------+------+-------+--------+-------------+----------+---------+-------------------+--------+------------+-------------------+
|                             comment|comment_id|epi_no|is_best|login_id|     nickname|recomm_cnt|reply_cnt|          save_date|title_id|unrecomm_cnt|         write_date|
+------------------------------------+----------+------+-------+--------+-------------+----------+---------+-------------------+--------+------------+-------------------+
|      넌 아직 건너오면 안된다... ...| 327625429|   335|   true|asd5****|케이지 사랑해|     12790|        2|1.686477496450047E9|  616239|          50|2017-07-10 14:17:32|
|  그림이랑 개 똑같이 생겼엌ㅋㅋㅋ...| 327623979|   335|   true|urmy****|     오오오오|      8858|        0|1.686477500056887E9|  616239|          55|2017-07-10 14:16:01|
|                 사지말고 입양하세요| 327626850|   335|   true|ghn0****|         스디|      7839|        0|1.686477502453198E9|  616239|         126|2017-07-10 14:19:05|
|  여러분 이번화 보다가 생각났는

## 1) ㅎ 혹은 ㅋ 사용 빈도

### 1-1) ㅎ/ㅋ 사용하는 정도 알아보기

In [79]:
count_with_h = df.filter(col("comment").contains('ㅎ')).count()
count_with_k = df.filter(col("comment").contains('ㅋ')).count()

                                                                                

In [86]:
print(f"댓글에 'ㅎ'가 들어있는 데이터 개수 (개수/전체): {count_with_h}/ {row_count} 비율 :  {(count_with_h/row_count)*100} " )
print(f"댓글에 'ㅋ'가 들어있는 데이터 개수 (개수/전체): {count_with_k}/ {row_count} 비율 :  {(count_with_k/row_count)*100} " )

댓글에 'ㅎ'가 들어있는 데이터 개수 (개수/전체): 8750/ 563197 비율 :  1.5536304348212082 
댓글에 'ㅋ'가 들어있는 데이터 개수 (개수/전체): 95817/ 563197 비율 :  17.01305227123014 


### 1-2) ㅎ/ㅋ 몇 개씩 쓸까

> ㅎ'의 개수를 집계

In [101]:
from pyspark.sql.functions import length, col, desc

# 'ㅎ'의 개수를 집계하는 칼럼 추가
df_with_h_count = df.withColumn('h_count', length(df.comment) - length(regexp_replace(df.comment, 'ㅎ', '')))
# 'ㅋ'의 개수를 집계하는 칼럼 추가
df_with_k_count = df.withColumn('k_count', length(df.comment) - length(regexp_replace(df.comment, 'ㅋ', '')))

#### 상세 결과 출력

In [100]:
# 상세 결과 출력
df_with_h_count.orderBy(desc('h_count')).show(truncate=False)



+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [102]:
# 상세 결과 출력
df_with_k_count.orderBy(desc('k_count')).show(truncate=False)



+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

#### 최대 개수 
- 'ㅎ'를 사용한 개수에 따른 집계
- 'ㅋ'를 사용한 개수에 따른 집계 

In [109]:
gr_df_with_h_count = df_with_h_count.groupBy('h_count').count()
gr_df_with_k_count = df_with_k_count.groupBy('k_count').count()

In [110]:
gr_df_with_h_count.orderBy(desc('h_count')).show()



+-------+-----+
|h_count|count|
+-------+-----+
|    500|    1|
|    432|    1|
|    103|    1|
|     92|    1|
|     57|    1|
|     56|    1|
|     51|    1|
|     47|    2|
|     40|    1|
|     39|    1|
|     36|    1|
|     35|    2|
|     33|    1|
|     32|    1|
|     31|    2|
|     30|    1|
|     25|    2|
|     24|    2|
|     23|    1|
|     22|    2|
+-------+-----+
only showing top 20 rows



                                                                                

In [112]:
gr_df_with_h_count.orderBy('h_count').show()



+-------+------+
|h_count| count|
+-------+------+
|      0|554447|
|      1|  2532|
|      2|  4236|
|      3|   873|
|      4|   519|
|      5|   225|
|      6|   135|
|      7|    57|
|      8|    48|
|      9|    21|
|     10|    18|
|     11|    15|
|     12|    12|
|     13|    11|
|     14|     5|
|     15|     2|
|     16|     4|
|     17|     2|
|     18|     4|
|     19|     4|
+-------+------+
only showing top 20 rows



                                                                                

In [114]:
gr_df_with_k_count.orderBy(desc('k_count')).show()



+-------+-----+
|k_count|count|
+-------+-----+
|    500|   20|
|    499|    2|
|    497|    1|
|    496|    1|
|    494|    1|
|    493|    1|
|    491|    2|
|    490|    1|
|    489|    1|
|    488|    1|
|    483|    1|
|    482|    1|
|    481|    1|
|    480|    2|
|    476|    1|
|    474|    1|
|    464|    1|
|    458|    1|
|    444|    1|
|    436|    1|
+-------+-----+
only showing top 20 rows



                                                                                

In [113]:
gr_df_with_k_count.orderBy('k_count').show()



+-------+------+
|k_count| count|
+-------+------+
|      0|467380|
|      1|  2709|
|      2| 15768|
|      3| 14331|
|      4| 12691|
|      5|  9706|
|      6|  7725|
|      7|  5762|
|      8|  4692|
|      9|  3656|
|     10|  3022|
|     11|  2373|
|     12|  1953|
|     13|  1551|
|     14|  1351|
|     15|  1093|
|     16|   928|
|     17|   764|
|     18|   615|
|     19|   551|
+-------+------+
only showing top 20 rows



                                                                                

## 2) 어떤 특수 문자가 제일 많이 사용될까

In [142]:
from pyspark.sql.functions import explode, split, regexp_replace, count, desc

# 특수 문자 추출
special_chars = df.select(explode(split(regexp_replace(df.comment, r"[a-zA-Z가-힣ㄱ-ㅎㅏ-ㅣ\s\n0-9]", ""), "")).alias("special_char"))

# 특수 문자별 사용 횟수 집계

# 특수 문자별 사용 횟수 집계
char_counts = special_chars.groupBy("special_char").agg(count("*").alias("count"))

# 가장 많이 사용된 특수 문자 확인
most_common_char = char_counts.orderBy(desc("count"))

# 결과 출력
most_common_char.show()




+------------+-------+
|special_char|  count|
+------------+-------+
|           .|1068309|
|            | 214241|
|           ?| 188650|
|           !| 136279|
|           ,|  92134|
|           ⠀|  83068|
|           ⣿|  66304|
|           )|  30507|
|           (|  29281|
|           ~|  28710|
|           :|  28598|
|           '|  26515|
|           ;|  24013|
|           "|  21431|
|           -|  15546|
|          　|  14571|
|           ^|  10131|
|           /|   7296|
|           ️|   5951|
|           ❤|   5615|
+------------+-------+
only showing top 20 rows



                                                                                

## 3) 이모티콘 사용한 댓글 출력 

In [144]:
from pyspark.sql.functions import col, expr, explode, count

# 이모티콘을 포함하는 댓글 개수
emoji_pattern = r"[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\U0001F910-\U0001F96B\U0001F980-\U0001F9E0\U0001FA70-\U0001FAFF]"
emoji_count = df.filter(expr(f"comment rlike '{emoji_pattern}'")).count()

# 이모티콘 별 댓글 사용 횟수 집계
# emoji_data = df.withColumn("emoji", explode(expr(f"split(comment, '[^{emoji_pattern}]')"))) \
#     .filter(expr(f"emoji rlike '{emoji_pattern}'")) \
#     .groupBy("emoji").agg(count("*").alias("count"))

                                                                                

In [145]:
print(f"이모티콘을 포함하는 댓글 개수 (개수/전체): {emoji_count}/ {row_count} 비율 :  {(emoji_count/row_count)*100} " )
# emoji_data.show(truncate=False)

이모티콘을 포함하는 댓글 개수 (개수/전체): 7284/ 563197 비율 :  1.2933307528271634 


# 5. ✅ 분석: 웹툰 장르별 댓글 상세 분석

> 추후 진행