In [1]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
import tomotopy as tp
import collections

In [2]:
# Read "<data_name>.csv" from the working directory into `data`, then show
# the number of null values in each column.
data_name = "2014_2023_ForeignMovie_summary_preprocess_top_bottom"
data = pd.read_csv(data_name+".csv")
data.isnull().sum()

순위         0
영화명        0
개봉일        0
매출액        0
매출액 점유율    0
관객수        0
스크린수       0
대표국적       0
국적         0
배급사        0
줄거리        0
장르         0
줄거리2       0
흥행여부       0
명사         0
dtype: int64

In [3]:
# Keep only these nine columns; every other column of the loaded frame is dropped.
data = data.loc[
    :,
    ["순위", "영화명", "개봉일", "대표국적", "줄거리", "장르", "줄거리2", "명사", "흥행여부"],
]

In [4]:
# Preview the first three rows of the frame.
data.head(3)

Unnamed: 0,순위,영화명,개봉일,대표국적,줄거리,장르,줄거리2,명사,흥행여부
0,1,겨울왕국,2014-01-16,미국,얼어붙은 세상을 녹일 자매가 온다! 서로가 최고의 친구였던 자매 ‘엘사’와 ‘안나...,"애니메이션,어드벤처,가족,코미디,뮤지컬,판타지",얼어붙은 세상을 녹일 자매가 온다 서로가 최고의 친구였던 자매 엘사와 안나 하지만...,"세상,자매,서로,최고,친구,자매,엘사,언니,엘사,동생,말,비밀,신비,힘,그것,엘사,...",1
1,2,인터스텔라,2014-11-06,미국,"“우린 답을 찾을 거야, 늘 그랬듯이” 세계 각국의 정부와 경제가 완전히 붕괴된 미...",SF,우린 답을 찾을 거야 늘 그랬듯이 세계 각국의 정부와 경제가 완전히 붕괴된 미래가 ...,"우리,답,세계,각국,정부,경제,붕괴,미래,세기,잘못,세계,식량,부족,해체,이때,시공...",1
2,3,트랜스포머: 사라진 시대,2014-06-25,미국,트랜스포머의 시대는 끝났다! 시카고에서 벌어진 오토봇과 디셉티콘의 전투로 인해 수...,"액션,SF,어드벤처",트랜스포머의 시대는 끝났다 시카고에서 벌어진 오토봇과 디셉티콘의 전투로 인해 수많...,"트랜스포머,시대,시카고,오토봇,디셉티콘,전투,사상자,발생,도시,파괴,정부,일부,오토...",1


#### Filtering Words

In [5]:
# Turn each row's comma-joined noun string into a list of tokens and drop
# tokens of length 1, in a single pass over the column.
# Rows that are already lists are filtered as-is, so re-running this cell
# no longer fails with "'list' object has no attribute 'split'".
data["명사"] = data["명사"].progress_map(
    lambda x: [s for s in (x.split(",") if isinstance(x, str) else x) if len(s) > 1]
)

100%|██████████| 6718/6718 [00:00<00:00, 142543.02it/s]


100%|██████████| 6718/6718 [00:00<00:00, 49856.03it/s]


In [6]:
# Flatten every row's token list into one corpus-wide list of tokens.
# A flat comprehension is used instead of Series.sum(): summing lists
# concatenates them pairwise and re-copies the growing result on every row.
total_words = [word for words in data["명사"] for word in words]

In [7]:
# Corpus-wide token frequencies as (word, count) pairs, ordered from the most
# to the least frequent; ties keep first-seen order. The cell displays the
# number of distinct tokens.
words_count = collections.Counter(total_words).most_common()
len(words_count)

21613

In [8]:
def filter_words(data, column, min_count, min_number, word_counts=None):
    """Drop rare words from each document, then drop documents that became too short.

    The input frame is left untouched; a filtered copy is returned.

    Args:
        data: DataFrame whose ``column`` holds one list of tokens per row.
        column: Name of the token-list column.
        min_count: Words whose corpus-wide count is ``<= min_count`` are removed
            from every document.
        min_number: Rows with fewer than ``min_number`` remaining tokens are dropped.
        word_counts: Iterable of ``(word, count)`` pairs. Defaults to the
            notebook-level ``words_count``.

    Returns:
        A new DataFrame with ``column`` filtered, a ``"단어개수"`` column holding
        the remaining token count per row, and only rows meeting ``min_number``.
    """
    if word_counts is None:
        word_counts = words_count  # notebook-level (word, count) pairs

    # A set keeps the per-token membership test constant-time.
    rare_words = {word for word, count in word_counts if count <= min_count}

    tokens = data[column]
    # progress_map is only registered after tqdm.pandas(); fall back to plain map.
    mapper = getattr(tokens, "progress_map", tokens.map)
    kept = mapper(lambda doc: [w for w in doc if w not in rare_words])

    # Work on a copy so the caller's frame keeps its original tokens and columns.
    filtered = data.copy()
    filtered[column] = kept
    filtered["단어개수"] = kept.map(len)

    return filtered[filtered["단어개수"] >= min_number]

#### Topic Modeling

In [9]:
def lda(k, iteration, text, word_remove=0):
    """Train a tomotopy LDA model on the given documents.

    Args:
        k: Number of topics.
        iteration: Total number of training iterations to run.
        text: Iterable of documents. A list/tuple is used as the token sequence
            as-is; any other value is converted with ``str`` and split on ",".
        word_remove: Passed to ``tp.LDAModel`` as ``rm_top``.

    Returns:
        Tuple ``(model, model.ll_per_word)``, read after training has finished.
    """
    model = tp.LDAModel(k=k, rm_top=word_remove, seed=42)

    for line in text:
        if isinstance(line, (list, tuple)):
            # Already tokenised. Passing a list through str() would split its
            # repr and produce tokens such as "['세상'" and " '자매'".
            tokens = list(line)
        else:
            tokens = str(line).split(",")
        model.add_doc(tokens)

    model.burn_in = 100
    # Zero-iteration call kept from the original; presumably it prepares the
    # model before sampling starts -- confirm against the tomotopy docs.
    model.train(0)

    # Train in chunks of 10, shortening the last chunk so the total equals
    # `iteration` even when it is not a multiple of 10.
    step = 10
    for done in range(0, iteration, step):
        model.train(min(step, iteration - done))

    return model, model.ll_per_word

In [10]:
# Candidate topic counts: every even number from 2 up to and including 30.
k_list = list(range(2, 31, 2))

In [14]:
def perplex_coherence_graph(data, column):
  """Sweep topic counts and word filters, then plot perplexity and coherence against k.

  For every ``k`` in the notebook-level ``k_list`` one LDA model is trained per
  ``(min_count, min_number)`` filter setting. One model per ``k`` is selected,
  saved as ``./tmp_model/topic-<k>_model.bin`` and scored with perplexity and
  u_mass coherence. The two scores are drawn as line plots.

  Args:
    data: DataFrame whose ``column`` holds one list of tokens per row. It is
      not modified; every filter setting starts again from this frame.
    column: Name of the token-list column.

  Returns:
    None. Progress is printed and the figure is shown.
  """
  model_dir = "./tmp_model"
  os.makedirs(model_dir, exist_ok=True)  # saving needs the folder to exist

  records = []
  for k in k_list:
    print(f"---training for k: {k}---")
    # "<min_count>_<min_number>" -> (ll_per_word, model path, word mean, word median)
    candidates = {}
    for min_count in range(5, 30, 5):  # min_count range
      for min_number in range(10, 30, 5):  # min_number range
        print(f"---training for min_count: {min_count}, min_number: {min_number}---")
        key_name = f"{min_count}_{min_number}"

        # Filter a fresh copy of the input every time. Reassigning `data` here
        # would make each setting (and each k) start from the previous,
        # already-shrunk frame.
        filtered = filter_words(data.copy(), column, min_count, min_number)
        if len(filtered) < 100:
          continue

        model, log_score = lda(k=k, iteration=1000, text=filtered[column])
        model_path = f"{model_dir}/model_{key_name}.bin"
        model.save(model_path)

        word_counts = np.array(filtered["단어개수"])
        candidates[key_name] = (log_score, model_path, word_counts.mean(), np.median(word_counts))

    if not candidates:
      # Every setting left fewer than 100 documents; nothing to score for this k.
      print(f"---no filter setting kept at least 100 documents for k: {k}, skipped---")
      continue

    # NOTE(review): this keeps the original rule of picking the LOWEST
    # log-likelihood per word. A higher log-likelihood means lower perplexity,
    # so `max` may be what was intended -- confirm before relying on it.
    min_key = min(candidates, key=lambda name: candidates[name][0])
    _, best_path, word_mean, word_median = candidates[min_key]

    print(f"word_Mean: {word_mean}, word_Median: {word_median}")
    print(f"---find minimum log-likelihood", end="")
    # LDAModel.load returns a new instance; it does not overwrite an existing one.
    model = tp.LDAModel.load(best_path)
    print(f": {model.ll_per_word}---")

    # Remove only this sweep's candidate files so the topic-<k> models saved
    # for earlier k values survive.
    for _, candidate_path, _, _ in candidates.values():
      os.remove(candidate_path)
    model.save(f"{model_dir}/topic-{k}_model.bin")  # only save optimal model for each number of topic

    coh = tp.coherence.Coherence(model, coherence="u_mass")
    records.append({"k": k, "perplexity": model.perplexity, "coherence": coh.get_score()})

  # Build the result table once instead of concatenating inside the loop.
  perco_df = pd.DataFrame(records, columns=["k", "perplexity", "coherence"])

  fig, ax = plt.subplots(1, 2)
  sns.lineplot(x="k", y="perplexity", data=perco_df, ax=ax[0])
  sns.lineplot(x="k", y="coherence", data=perco_df, ax=ax[1])
  ax[0].set_title("Perplexity by number of topics")
  ax[1].set_title("Coherence (u_mass) by number of topics")
  plt.show()

In [12]:
def topic_model(data, column, k):
  """Train a k-topic LDA model and print a per-topic summary.

  For each topic this prints how many rows have it as their highest-probability
  topic, that share of all rows, the topic's top-10 words, and up to five
  sampled "줄거리" values from those rows.

  Args:
    data: DataFrame with the token column ``column`` and a "줄거리" column.
    column: Name of the column passed to ``lda`` as the documents.
    k: Number of topics.

  Returns:
    None. Results are printed.

  Raises:
    ValueError: If the trained model holds a different number of documents
      than ``data`` has rows, because rows and distributions could not be
      paired by position.
  """
  # lda() returns (model, ll_per_word); only the model is needed here.
  model, _ = lda(k, 1000, text=data[column])

  topic_names = ["Topic" + str(i) for i in range(k)]

  # Top-10 words per topic: one "Topic<i>" / "probs<i>" column pair per topic.
  topic = pd.concat(
    [
      pd.DataFrame(model.get_topic_words(i, top_n=10), columns=["Topic" + str(i), "probs" + str(i)])
      for i in range(k)
    ],
    axis=1,
  )

  # One row of topic probabilities per model document, built in a single call
  # instead of concatenating frame by frame.
  new = pd.DataFrame([doc.get_topic_dist() for doc in model.docs], columns=topic_names)

  raw = data.reset_index(drop=True)
  if len(new) != len(raw):
    # Pairing by position below would attach distributions to the wrong rows.
    raise ValueError(
      f"model holds {len(new)} documents but data has {len(raw)} rows; "
      "cannot align topic distributions with rows"
    )
  data_df = raw.merge(new, left_index=True, right_index=True)
  data_df["Highest_Topic"] = data_df[topic_names].idxmax(axis=1)

  for i in range(k):
    print(f"--Topic{str(i)}--")
    in_topic = data_df[data_df["Highest_Topic"] == "Topic" + str(i)]
    count = len(in_topic)
    print(f"토픽 개수 : {count:8}, 전체 퍼센트 : {count/len(raw)*100:8.3f}")
    print(topic["Topic"+str(i)])
    print(in_topic["줄거리"].sample(n=min(5, count)).values)

In [None]:
# Run the full sweep on the "명사" column. For each of the 15 values in k_list
# the function trains one 1000-iteration LDA model per (min_count, min_number)
# setting (up to 20 per k), so this cell is long-running.
perplex_coherence_graph(data, "명사")