In [1]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
import tomotopy as tp
import collections

In [2]:
data_name = "2014_2023_ForeignMovie_summary_preprocess_top_bottom"
data = pd.read_csv(data_name+".csv")
data.isnull().sum()

순위         0
영화명        0
개봉일        0
매출액        0
매출액 점유율    0
관객수        0
스크린수       0
대표국적       0
국적         0
배급사        0
줄거리        0
장르         0
줄거리2       0
흥행여부       0
명사         0
dtype: int64

In [3]:
data = data[["순위", "영화명", "개봉일", "대표국적", "줄거리", "장르", "줄거리2", "명사", "흥행여부"]]

In [4]:
data.head(3)

Unnamed: 0,순위,영화명,개봉일,대표국적,줄거리,장르,줄거리2,명사,흥행여부
0,1,겨울왕국,2014-01-16,미국,얼어붙은 세상을 녹일 자매가 온다! 서로가 최고의 친구였던 자매 ‘엘사’와 ‘안나...,"애니메이션,어드벤처,가족,코미디,뮤지컬,판타지",얼어붙은 세상을 녹일 자매가 온다 서로가 최고의 친구였던 자매 엘사와 안나 하지만...,"세상,자매,서로,최고,친구,자매,엘사,언니,엘사,동생,말,비밀,신비,힘,그것,엘사,...",1
1,2,인터스텔라,2014-11-06,미국,"“우린 답을 찾을 거야, 늘 그랬듯이” 세계 각국의 정부와 경제가 완전히 붕괴된 미...",SF,우린 답을 찾을 거야 늘 그랬듯이 세계 각국의 정부와 경제가 완전히 붕괴된 미래가 ...,"우리,답,세계,각국,정부,경제,붕괴,미래,세기,잘못,세계,식량,부족,해체,이때,시공...",1
2,3,트랜스포머: 사라진 시대,2014-06-25,미국,트랜스포머의 시대는 끝났다! 시카고에서 벌어진 오토봇과 디셉티콘의 전투로 인해 수...,"액션,SF,어드벤처",트랜스포머의 시대는 끝났다 시카고에서 벌어진 오토봇과 디셉티콘의 전투로 인해 수많...,"트랜스포머,시대,시카고,오토봇,디셉티콘,전투,사상자,발생,도시,파괴,정부,일부,오토...",1


#### Filtering Words

In [5]:
data["명사"] = data["명사"].progress_map(lambda x:x.split(","))
data["명사"] = data["명사"].progress_map(lambda x:[s for s in x if len(s) > 1])

100%|██████████| 6718/6718 [00:00<00:00, 42440.47it/s]
100%|██████████| 6718/6718 [00:00<00:00, 19923.63it/s]


In [6]:
total_words = data["명사"].sum()

In [7]:
words_count = collections.Counter(total_words)
words_count = dict(words_count)
words_count = sorted(words_count.items(), key=lambda x:x[1], reverse=True)
len(words_count)

21613

In [8]:
def filter_words(data, column, min_count, min_number):
    stopwords = []
    for word, value in words_count:  # minimum of occuring for one word in all documents
        if value <= min_count:
            stopwords.append(word)
    
    data[column] = data[column].progress_map(lambda x:[w for w in x if w not in stopwords])
    data["단어개수"] = data[column].apply(lambda x:len(x))

    data = data[data["단어개수"] >= min_number]  # minimum of number of total words in one document
    return data

#### Topic Modeling

In [9]:
def lda(k, iteration, text, word_remove=0):
    model = tp.LDAModel(k=k, rm_top=word_remove, seed=42)
    
    for line in text:
        line = str(line).split(",")
        model.add_doc(line)
    
    model.burn_in = 100
    model.train(0)
    
    # print(f"토픽 개수: {k}, 문서 개수: {len(model.docs)}, 단어 개수:, {len(model.used_vocabs)}, 단어의 총수: {model.num_words}")
    # print(f"제거된 단어들: {model.removed_top_words}")
    
    # print("훈련 중...",flush=True)
    for i in range(0, iteration, 10):
        model.train(10)

    #model.summary()
    return(model, model.ll_per_word)

In [10]:
k_list = [x for x in range(4, 31, 2)]

In [11]:
def perplex_coherence_graph(data, column):
  perco_df = pd.DataFrame(columns=["k", "perplexity", "coherence"])
  for k in k_list[1:4]:
    print(f"---training for k: {k}---")
    min_dict = {}
    for min_count in range(5, 60, 5):  # min_count range
      for min_number in range(5, 20, 5):  # min_number range
        print(f"---training for min_count: {min_count}, min_number: {min_number}---")
        key_name = f"{min_count}_{min_number}"

        data = filter_words(data, column, min_count, min_number)
        if len(data) < 100:
          continue

        model, log_score = lda(k=k, iteration=1000, text=data[column])
        min_dict[key_name] = log_score
        model_name = f"topic-{k}_model_{min_count}_{min_number}"
        model.save(f"./tmp_model/{model_name}.bin")
    
    min_key = min(min_dict, key=min_dict.get)
    min_key = str(min_key)
    min_list = min_key.split("_")
    min_list = [x for x in min_list]
    
    print(f"word_Mean: {np.array(data['단어개수']).mean()}, word_Median: {np.median(np.array(data['단어개수']))}")
    print(f"---find minimum log-likelihood", end="")
    optimal_model_name = f"topic-{k}_model_{min_list[0]}_{min_list[1]}"
    model.load(f"./tmp_model/{optimal_model_name}.bin")
    print(f": {model.ll_per_word}---")

    for file in os.scandir("./tmp_model/"):  # remove all model files
      check_name = f"topic-{k}"
      if check_name in str(file.name):
        os.remove(file.path)
    model.save(f"./tmp_model/topic-{k}_{min_count}_{min_number}_model.bin")  # only save optimal model for each number of topic

  #   perplexity_score = model.perplexity
          
  #   coh = tp.coherence.Coherence(model, coherence="u_mass")
  #   coherence_score = coh.get_score()

  #   tmp = [[k, perplexity_score, coherence_score]]
  #   tmp_df = pd.DataFrame(tmp, columns=["k", "perplexity", "coherence"])
  #   perco_df = pd.concat([perco_df, tmp_df], ignore_index=True)
  # perco_df = perco_df.reset_index()
  # perco_df = perco_df.drop("index", axis=1)

  # fig, ax = plt.subplots(1, 2)
  # sns.lineplot(x="k", y="perplexity", data=perco_df, ax=ax[0])
  # sns.lineplot(x="k", y="coherence", data=perco_df, ax=ax[1])
  # plt.show()

In [12]:
def topic_model(data, column, k):
  model = lda(k, 1000, text=data[column])

  topic = pd.DataFrame()
  for i in range(k):
    temp = pd.DataFrame()
    temp = pd.DataFrame(model.get_topic_words(i, top_n=10))
    temp.columns = ["Topic"+str(i),"probs"+str(i)]
    temp = temp.reset_index()
    if (i==0):
        topic = pd.concat([topic, temp], ignore_index=True)
    else:
        topic = topic.merge(temp, left_on="index", right_on="index")

  new = pd.DataFrame()
  for line in model.docs:
      temp = pd.DataFrame(line.get_topic_dist()).T
      new = pd.concat([new, temp])

  new.columns = ["Topic"+ str(x) for x in range(k)]
  new = new.reset_index().drop(["index"], axis=1)
  raw = data.reset_index().drop(["index"], axis=1)
  data_df = raw.merge(new, left_index=True, right_index=True)
  data_df["Highest_Topic"]=data_df[["Topic"+ str(s) for s in range(k)]].idxmax(axis=1)

  for i in range(k):
    print(f"--Topic{str(i)}--")
    count = len(data_df[data_df["Highest_Topic"]=="Topic"+str(i)])
    print(f"토픽 개수 : {count:8}, 전체 퍼센트 : {count/len(raw)*100:8.3f}")
    print(topic["Topic"+str(i)])
    print(data_df[data_df["Highest_Topic"]=="Topic"+str(i)]["줄거리"].sample(n=min(5, count)).values)

In [13]:
perplex_coherence_graph(data, "명사")

---training for k: 6---
---training for min_count: 5, min_number: 5---


  0%|          | 0/6718 [00:00<?, ?it/s]

100%|██████████| 6718/6718 [11:19<00:00,  9.89it/s]
  model.train(0)
  model.train(10)


---training for min_count: 5, min_number: 10---


100%|██████████| 6647/6647 [06:12<00:00, 17.86it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column] = data[column].progress_map(lambda x:[w for w in x if w not in stopwords])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["단어개수"] = data[column].apply(lambda x:len(x))


---training for min_count: 5, min_number: 15---


100%|██████████| 6208/6208 [01:56<00:00, 53.42it/s] 


---training for min_count: 10, min_number: 5---


100%|██████████| 5728/5728 [01:37<00:00, 58.60it/s]


---training for min_count: 10, min_number: 10---


100%|██████████| 5728/5728 [07:28<00:00, 12.78it/s]


---training for min_count: 10, min_number: 15---


100%|██████████| 5724/5724 [04:33<00:00, 20.94it/s]


---training for min_count: 15, min_number: 5---


100%|██████████| 5535/5535 [04:21<00:00, 21.20it/s]


---training for min_count: 15, min_number: 10---


100%|██████████| 5535/5535 [04:16<00:00, 21.62it/s]


---training for min_count: 15, min_number: 15---


100%|██████████| 5534/5534 [04:40<00:00, 19.71it/s]


---training for min_count: 20, min_number: 5---


100%|██████████| 5357/5357 [04:57<00:00, 18.00it/s]


---training for min_count: 20, min_number: 10---


100%|██████████| 5357/5357 [04:29<00:00, 19.84it/s]


---training for min_count: 20, min_number: 15---


100%|██████████| 5357/5357 [02:50<00:00, 31.34it/s]


---training for min_count: 25, min_number: 5---


100%|██████████| 5200/5200 [02:42<00:00, 32.07it/s]


---training for min_count: 25, min_number: 10---


100%|██████████| 5200/5200 [02:32<00:00, 34.11it/s]


---training for min_count: 25, min_number: 15---


100%|██████████| 5200/5200 [02:41<00:00, 32.14it/s]


---training for min_count: 30, min_number: 5---


100%|██████████| 5052/5052 [02:28<00:00, 33.92it/s]


---training for min_count: 30, min_number: 10---


100%|██████████| 5052/5052 [02:32<00:00, 33.08it/s]


---training for min_count: 30, min_number: 15---


100%|██████████| 5052/5052 [02:32<00:00, 33.20it/s]


---training for min_count: 35, min_number: 5---


100%|██████████| 4873/4873 [02:24<00:00, 33.66it/s]


---training for min_count: 35, min_number: 10---


100%|██████████| 4873/4873 [02:23<00:00, 33.91it/s]


---training for min_count: 35, min_number: 15---


100%|██████████| 4873/4873 [02:23<00:00, 34.04it/s]


---training for min_count: 40, min_number: 5---


100%|██████████| 4707/4707 [02:14<00:00, 35.08it/s]


---training for min_count: 40, min_number: 10---


100%|██████████| 4707/4707 [02:16<00:00, 34.48it/s]


---training for min_count: 40, min_number: 15---


100%|██████████| 4707/4707 [02:14<00:00, 35.05it/s]


---training for min_count: 45, min_number: 5---


100%|██████████| 4566/4566 [02:05<00:00, 36.38it/s]


---training for min_count: 45, min_number: 10---


100%|██████████| 4566/4566 [02:05<00:00, 36.32it/s]


---training for min_count: 45, min_number: 15---


100%|██████████| 4566/4566 [02:13<00:00, 34.20it/s]


---training for min_count: 50, min_number: 5---


100%|██████████| 4384/4384 [02:23<00:00, 30.64it/s]


---training for min_count: 50, min_number: 10---


100%|██████████| 4384/4384 [01:59<00:00, 36.68it/s]


---training for min_count: 50, min_number: 15---


100%|██████████| 4384/4384 [02:08<00:00, 34.24it/s]


---training for min_count: 55, min_number: 5---


100%|██████████| 4280/4280 [01:59<00:00, 35.86it/s]


---training for min_count: 55, min_number: 10---


100%|██████████| 4280/4280 [02:12<00:00, 32.35it/s]


---training for min_count: 55, min_number: 15---


100%|██████████| 4279/4279 [01:57<00:00, 36.35it/s]


word_Mean: 23.92450073063809, word_Median: 22.0
---find minimum log-likelihood: -6.506247190855554---
---training for k: 8---
---training for min_count: 5, min_number: 5---


100%|██████████| 4106/4106 [00:57<00:00, 71.58it/s]


---training for min_count: 5, min_number: 10---


100%|██████████| 4106/4106 [01:12<00:00, 56.56it/s] 


---training for min_count: 5, min_number: 15---


100%|██████████| 4106/4106 [01:21<00:00, 50.08it/s] 


---training for min_count: 10, min_number: 5---


100%|██████████| 4106/4106 [01:55<00:00, 35.59it/s]


---training for min_count: 10, min_number: 10---


100%|██████████| 4106/4106 [01:38<00:00, 41.86it/s]


---training for min_count: 10, min_number: 15---


100%|██████████| 4106/4106 [02:41<00:00, 25.36it/s]


---training for min_count: 15, min_number: 5---


100%|██████████| 4106/4106 [03:45<00:00, 18.24it/s]


---training for min_count: 15, min_number: 10---


100%|██████████| 4106/4106 [01:34<00:00, 43.60it/s]


---training for min_count: 15, min_number: 15---


100%|██████████| 4106/4106 [01:33<00:00, 43.94it/s]


---training for min_count: 20, min_number: 5---


100%|██████████| 4106/4106 [01:48<00:00, 37.97it/s]


---training for min_count: 20, min_number: 10---


100%|██████████| 4106/4106 [01:25<00:00, 47.76it/s]


---training for min_count: 20, min_number: 15---


100%|██████████| 4106/4106 [01:34<00:00, 43.66it/s]


---training for min_count: 25, min_number: 5---


100%|██████████| 4106/4106 [01:37<00:00, 41.95it/s]


---training for min_count: 25, min_number: 10---


100%|██████████| 4106/4106 [01:47<00:00, 38.12it/s]


---training for min_count: 25, min_number: 15---


100%|██████████| 4106/4106 [01:43<00:00, 39.52it/s]


---training for min_count: 30, min_number: 5---


100%|██████████| 4106/4106 [01:59<00:00, 34.30it/s]


---training for min_count: 30, min_number: 10---


100%|██████████| 4106/4106 [01:43<00:00, 39.58it/s]


---training for min_count: 30, min_number: 15---


100%|██████████| 4106/4106 [02:36<00:00, 26.19it/s]


---training for min_count: 35, min_number: 5---


100%|██████████| 4106/4106 [02:59<00:00, 22.82it/s]


---training for min_count: 35, min_number: 10---


100%|██████████| 4106/4106 [01:29<00:00, 45.67it/s]


---training for min_count: 35, min_number: 15---


100%|██████████| 4106/4106 [02:11<00:00, 31.17it/s]


---training for min_count: 40, min_number: 5---


100%|██████████| 4106/4106 [01:23<00:00, 48.99it/s]


---training for min_count: 40, min_number: 10---


100%|██████████| 4106/4106 [01:23<00:00, 48.89it/s]


---training for min_count: 40, min_number: 15---


100%|██████████| 4106/4106 [02:26<00:00, 28.01it/s]


---training for min_count: 45, min_number: 5---


100%|██████████| 4106/4106 [02:18<00:00, 29.69it/s]


---training for min_count: 45, min_number: 10---


100%|██████████| 4106/4106 [01:20<00:00, 51.21it/s]


---training for min_count: 45, min_number: 15---


100%|██████████| 4106/4106 [01:13<00:00, 56.11it/s]


---training for min_count: 50, min_number: 5---


100%|██████████| 4106/4106 [01:13<00:00, 56.15it/s]


---training for min_count: 50, min_number: 10---


100%|██████████| 4106/4106 [01:10<00:00, 58.23it/s]


---training for min_count: 50, min_number: 15---


100%|██████████| 4106/4106 [01:03<00:00, 64.23it/s]


---training for min_count: 55, min_number: 5---


100%|██████████| 4106/4106 [01:04<00:00, 63.52it/s]


---training for min_count: 55, min_number: 10---


100%|██████████| 4106/4106 [01:04<00:00, 63.85it/s]


---training for min_count: 55, min_number: 15---


100%|██████████| 4106/4106 [01:04<00:00, 63.99it/s]


word_Mean: 23.92450073063809, word_Median: 22.0
---find minimum log-likelihood: -6.499244533792455---
---training for k: 10---
---training for min_count: 5, min_number: 5---


100%|██████████| 4106/4106 [00:52<00:00, 78.11it/s] 


---training for min_count: 5, min_number: 10---


100%|██████████| 4106/4106 [01:44<00:00, 39.20it/s] 


---training for min_count: 5, min_number: 15---


100%|██████████| 4106/4106 [01:46<00:00, 38.64it/s]


---training for min_count: 10, min_number: 5---


100%|██████████| 4106/4106 [01:40<00:00, 40.90it/s]


---training for min_count: 10, min_number: 10---


100%|██████████| 4106/4106 [02:00<00:00, 34.20it/s]


---training for min_count: 10, min_number: 15---


100%|██████████| 4106/4106 [01:43<00:00, 39.51it/s]


---training for min_count: 15, min_number: 5---


100%|██████████| 4106/4106 [01:48<00:00, 37.96it/s]


---training for min_count: 15, min_number: 10---


100%|██████████| 4106/4106 [01:49<00:00, 37.60it/s]


---training for min_count: 15, min_number: 15---


100%|██████████| 4106/4106 [02:10<00:00, 31.41it/s]


---training for min_count: 20, min_number: 5---


100%|██████████| 4106/4106 [01:57<00:00, 34.83it/s]


---training for min_count: 20, min_number: 10---


100%|██████████| 4106/4106 [02:02<00:00, 33.58it/s]


---training for min_count: 20, min_number: 15---


100%|██████████| 4106/4106 [01:58<00:00, 34.77it/s]


---training for min_count: 25, min_number: 5---


100%|██████████| 4106/4106 [02:01<00:00, 33.84it/s]


---training for min_count: 25, min_number: 10---


100%|██████████| 4106/4106 [01:53<00:00, 36.18it/s]


---training for min_count: 25, min_number: 15---


100%|██████████| 4106/4106 [01:53<00:00, 36.11it/s]


---training for min_count: 30, min_number: 5---


100%|██████████| 4106/4106 [01:56<00:00, 35.20it/s]


---training for min_count: 30, min_number: 10---


100%|██████████| 4106/4106 [01:57<00:00, 34.96it/s]


---training for min_count: 30, min_number: 15---


100%|██████████| 4106/4106 [01:56<00:00, 35.14it/s]


---training for min_count: 35, min_number: 5---


100%|██████████| 4106/4106 [02:02<00:00, 33.50it/s]


---training for min_count: 35, min_number: 10---


100%|██████████| 4106/4106 [02:13<00:00, 30.85it/s]


---training for min_count: 35, min_number: 15---


100%|██████████| 4106/4106 [06:56<00:00,  9.86it/s]


---training for min_count: 40, min_number: 5---


100%|██████████| 4106/4106 [09:07<00:00,  7.51it/s]


---training for min_count: 40, min_number: 10---


100%|██████████| 4106/4106 [09:09<00:00,  7.48it/s]


---training for min_count: 40, min_number: 15---


100%|██████████| 4106/4106 [09:14<00:00,  7.40it/s]


---training for min_count: 45, min_number: 5---


100%|██████████| 4106/4106 [08:10<00:00,  8.37it/s]


---training for min_count: 45, min_number: 10---


100%|██████████| 4106/4106 [08:43<00:00,  7.85it/s]


---training for min_count: 45, min_number: 15---


100%|██████████| 4106/4106 [02:46<00:00, 24.68it/s]


---training for min_count: 50, min_number: 5---


100%|██████████| 4106/4106 [01:07<00:00, 60.59it/s]


---training for min_count: 50, min_number: 10---


100%|██████████| 4106/4106 [01:08<00:00, 60.21it/s]


---training for min_count: 50, min_number: 15---


100%|██████████| 4106/4106 [01:08<00:00, 60.08it/s]


---training for min_count: 55, min_number: 5---


100%|██████████| 4106/4106 [01:07<00:00, 60.98it/s]


---training for min_count: 55, min_number: 10---


100%|██████████| 4106/4106 [01:10<00:00, 58.19it/s]


---training for min_count: 55, min_number: 15---


100%|██████████| 4106/4106 [01:03<00:00, 64.70it/s]


word_Mean: 23.92450073063809, word_Median: 22.0
---find minimum log-likelihood: -6.526841797780886---
