## 좋아요 수 높은순 comment

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col
import matplotlib.pyplot as plt
from matplotlib import font_manager
from wordcloud import WordCloud

# Start (or reuse) the Spark session used by this notebook.
spark = (
    SparkSession.builder
    .appName("comment_like")
    .getOrCreate()
)

# Load the assembled comments CSV into a DataFrame. The file is decoded as
# EUC-KR (see the `encoding` option), the first row is used as the header,
# and column types are inferred by Spark.
df = spark.read.csv(
    "assembled_data.csv",
    header=True,
    inferSchema=True,
    encoding="EUC-KR",
)

# Split the rows into one DataFrame per source, based on the 'tag' column.
youtube_df, instagram_df, news_df = (
    df.filter(df["tag"] == source_tag)
    for source_tag in ("youtube", "instagram", "news")
)


In [40]:
from pyspark.sql.functions import desc
# For each source, keep the 20 comments with the highest like count
# (sorted by 'num_likes' in descending order).
_ranked_columns = ("comment", "num_likes", "tag", "date")

top_20_youtube_comments = (
    youtube_df.select(*_ranked_columns)
    .orderBy(desc("num_likes"))
    .limit(20)
)

top_20_instagram_comments = (
    instagram_df.select(*_ranked_columns)
    .orderBy(desc("num_likes"))
    .limit(20)
)

top_20_news_comments = (
    news_df.select(*_ranked_columns)
    .orderBy(desc("num_likes"))
    .limit(20)
)

In [23]:
# Print each source's top-20 table under its heading, without truncating
# long comment text. A blank line separates consecutive sections.
for heading, top_comments in (
    ("Top 20 YouTube Comments (by like count):", top_20_youtube_comments),
    ("Top 20 Instagram Comments (by like count):", top_20_instagram_comments),
    ("Top 20 News Comments (by like count):", top_20_news_comments),
):
    print(heading)
    top_comments.show(truncate=False)

Top 20 YouTube Comments (by like count):
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|comment                                                                                                                                                                                                                                                                                                                                                                                                                                          |num_likes|
+------------------------------------------------------------------

In [28]:
# Load the political word list: one word per line, UTF-8 encoded.
# Surrounding whitespace is stripped and blank lines are dropped: the words
# are later joined with "|" into a regex, where an empty entry would become
# an empty alternative that matches every comment.
with open("political_words.txt", "r", encoding="utf-8") as file:
    political_words = [
        word
        for word in (line.strip() for line in file.read().splitlines())
        if word
    ]

In [33]:
# For each source, keep the 20 most-liked comments that contain at least one
# of the political words.
import re

# Build the alternation pattern once instead of once per source. Each entry
# is treated as a literal word: it is stripped, blank entries are skipped
# (an empty alternative would match every comment), and regex metacharacters
# are escaped so that e.g. "." or "(" in a word cannot change the match.
_political_terms = [word.strip() for word in political_words if word.strip()]
if not _political_terms:
    # An empty pattern would match every row and silently mislabel the
    # overall top 20 as "political"; fail loudly instead.
    raise ValueError("political_words contains no usable words")
_political_pattern = "|".join(re.escape(term) for term in _political_terms)

top_20_youtube_political_comments = (
    youtube_df.filter(col("comment").rlike(_political_pattern))
    .select("comment", "num_likes", "date", "tag")
    .orderBy(desc("num_likes"))
    .limit(20)
)

top_20_instagram_political_comments = (
    instagram_df.filter(col("comment").rlike(_political_pattern))
    .select("comment", "num_likes", "date", "tag")
    .orderBy(desc("num_likes"))
    .limit(20)
)

top_20_news_political_comments = (
    news_df.filter(col("comment").rlike(_political_pattern))
    .select("comment", "num_likes", "date", "tag")
    .orderBy(desc("num_likes"))
    .limit(20)
)

In [32]:
# Print each source's top-20 political-comment table under its heading,
# without truncating long comment text. A blank line separates sections.
for heading, top_comments in (
    (
        "Top 20 YouTube Political Comments (by like count):",
        top_20_youtube_political_comments,
    ),
    (
        "Top 20 Instagram Political Comments (by like count):",
        top_20_instagram_political_comments,
    ),
    (
        "Top 20 News Political Comments (by like count):",
        top_20_news_political_comments,
    ),
):
    print(heading)
    top_comments.show(truncate=False)

Top 10 YouTube Political Comments (by like count):
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|comment                                                                                                                                                                                                                                                        |num_likes|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|지금 감히 문재인 대통령님의 환경애 대한 선구안적인 시선을 무시하는거냐                                                                                                         

In [41]:
# Stack the three per-source top-20 tables into a single DataFrame:
# YouTube rows first, then Instagram, then news.
combined_results = top_20_youtube_comments.union(top_20_instagram_comments)
combined_results = combined_results.union(top_20_news_comments)

In [43]:
# Write the combined table as CSV under the "combined" path, with a header row.
# NOTE(review): no save mode is set, so Spark's default applies; presumably
# this fails if "combined" already exists (e.g. on a re-run) -- confirm.
combined_results.write.option("header", True).csv("combined")