### 1. 사용한 모듈

In [1]:
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
from urllib.request import urlopen
import re
import os

### 2. PDF 파일 읽어오기

In [2]:
def read_pdf_file(pdfFile):
    pdfrm = PDFResourceManager()
    strio = StringIO()
    lapa = LAParams()
    device = TextConverter(pdfrm, strio, laparams = lapa)
    
    process_pdf(pdfrm, device, pdfFile)
    device.close()
    
    content = strio.getvalue()
    strio.close()
    return content

In [3]:
pdf_samsung = open("data/environment_S/LG생활건강.pdf", "rb")
a = read_pdf_file(pdf_samsung)
pdf_samsung.close() 

pdf_skens = open("data/environment_S/LG전자.pdf", "rb")
b = read_pdf_file(pdf_skens)
pdf_skens.close()

pdf_skens = open("data/environment_S/SK하이닉스.pdf", "rb")
c = read_pdf_file(pdf_skens)
pdf_skens.close()

pdf_skens = open("data/environment_S/삼성SDI.pdf", "rb")
d = read_pdf_file(pdf_skens)
pdf_skens.close()

pdf_skens = open("data/environment_S/삼성물산.pdf", "rb")
e = read_pdf_file(pdf_skens)
pdf_skens.close()



### 3. 문서 토큰화

In [4]:
from konlpy.tag import Mecab
mecab = Mecab()

tokens = []
def word_token (x) :
    for token in mecab.pos(x):
        tokens.append(token)
    return tokens

word_token(a)
word_token(b)
word_token(c)
word_token(d)
word_token(e)

[('2020', 'SN'),
 ('LG', 'SL'),
 ('생활', 'NNG'),
 ('건강', 'NNG'),
 ('ESG', 'SL'),
 ('보고서', 'NNG'),
 ('F', 'SL'),
 ('O', 'SL'),
 ('R', 'SL'),
 ('P', 'SL'),
 ('E', 'SL'),
 ('O', 'SL'),
 ('P', 'SL'),
 ('L', 'SL'),
 ('E', 'SL'),
 ('A', 'SL'),
 ('N', 'SL'),
 ('D', 'SL'),
 ('T', 'SL'),
 ('H', 'SL'),
 ('E', 'SL'),
 ('P', 'SL'),
 ('L', 'SL'),
 ('A', 'SL'),
 ('N', 'SL'),
 ('E', 'SL'),
 ('T', 'SL'),
 ('', 'SY'),
 ('', 'SY'),
 ('About', 'SL'),
 ('This', 'SL'),
 ('Report', 'SL'),
 ('보고서', 'NNG'),
 ('개요', 'NNG'),
 ('LG', 'SL'),
 ('생활', 'NNG'),
 ('건강', 'NNG'),
 ('은', 'JX'),
 ('2009', 'SN'),
 ('년', 'NNBC'),
 ('부터', 'JX'),
 ('매년', 'MAG'),
 ('ESG', 'SL'),
 ('보고서', 'NNG'),
 ('를', 'JKO'),
 ('발간', 'NNG'),
 ('하', 'XSV'),
 ('여', 'EC'),
 ('당사', 'NNG'),
 ('의', 'JKG'),
 ('지속', 'NNG'),
 ('가능', 'NNG'),
 ('경영', 'NNG'),
 ('을', 'JKO'),
 ('외', 'NNG'),
 ('부', 'NNG'),
 ('에', 'JKB'),
 ('공유', 'NNG'),
 ('하', 'XSV'),
 ('고', 'EC'),
 ('있', 'VX'),
 ('습니다', 'EF'),
 ('.', 'SF'),
 ('‘', 'SY'),
 ('2020', 'SN'),
 ('LG', 'SL'),
 ('생

### 4. 불용어 처리

In [5]:
from konlpy.tag import Mecab
mecab = Mecab()
stop = ["회사", "에서", "위해", "관련", "기준"]
# tokens = [token for token in mecab.pos(samsung_display)]

nodes = [t[0] for t in tokens]
vocab = [t[0] for t in tokens if t[0] not in stop if t[1] in ['NNG', 'NNP'] and len(t[0]) > 1]

print(nodes[:10])
print(vocab[:10])


['2020', 'LG', '생활', '건강', 'ESG', '보고서', 'F', 'O', 'R', 'P']
['생활', '건강', '보고서', '보고서', '개요', '생활', '건강', '보고서', '발간', '당사']


### 5. TF-IDF

In [9]:
vocab = list(set(vocab))

vocab2idx = {vocab[i]:i for i in range(len(vocab))}
idx2vocab = {i:vocab[i] for i in range(len(vocab))}

In [10]:
import numpy as np
import math

vocab_len = len(vocab2idx)

# 토큰별로 그래프 edge를 Matrix 형태로 생성
weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

# 각 토큰 노드별로 스코어 1로 초기화
score = np.ones((vocab_len),dtype=np.float32)

# coocurrence를 판단하기 위한 window 사이즈 설정
window_size = 4
covered_coocurrences = []

for window_start in range(len(nodes) - window_size + 1):
    window = nodes[window_start:window_start+window_size]
    for i in range(window_size):
        for j in range(i+1, window_size):
            if window[i] in vocab and window[j] in vocab:
                index_i = window_start + i
                index_j = window_start + j

                if (index_i, index_j) not in covered_coocurrences:
                    weighted_edge[vocab2idx[window[i]]][vocab2idx[window[j]]] = 1
                    weighted_edge[vocab2idx[window[j]]][vocab2idx[window[i]]] = 1
                    covered_coocurrences.append((index_i, index_j))

for i in range(vocab_len):
    row_sum = weighted_edge[i].sum()
    weighted_edge[i] = weighted_edge[i]/row_sum if row_sum > 0 else 0

MAX_ITERATIONS = 50
d=0.85
threshold = 0.0001 #convergence threshold

for iter in range(MAX_ITERATIONS):
    prev_score = np.copy(score)

    for i in range(vocab_len):
        summation = 0
        for j in range(vocab_len):
            if weighted_edge[j][i] != 0:
                summation += weighted_edge[j][i] * prev_score[j]

        score[i] = (1 - d) * d*summation

    if np.sum(np.fabs(prev_score -  score)) <= threshold:
        break


sorted_index = np.flip(np.argsort(score), 0)

n = 50


print("\n=== 핵심키워드 ===")
for i in range(0,n):
    print(str(idx2vocab[sorted_index[i]])+" : " + str(score[sorted_index[i]]))


=== 핵심키워드 ===
관리 : 3.4990844e-08
제품 : 3.251817e-08
환경 : 2.9501109e-08
협력 : 2.8964921e-08
사업 : 2.8506618e-08
활동 : 2.8314378e-08
고객 : 2.810443e-08
교육 : 2.791175e-08
안전 : 2.660436e-08
사회 : 2.643786e-08
운영 : 2.6219686e-08
지원 : 2.539117e-08
개선 : 2.4125612e-08
경영 : 2.3636774e-08
대상 : 2.3406127e-08
정보 : 2.280032e-08
평가 : 2.23878e-08
강화 : 2.2319396e-08
임직원 : 2.2299147e-08
전자 : 2.2285365e-08
가능 : 2.0647652e-08
기술 : 2.0190525e-08
지속 : 2.0133227e-08
사업장 : 2.011721e-08
기업 : 1.9961611e-08
부문 : 1.9881915e-08
사용 : 1.9752083e-08
한국 : 1.8912381e-08
개발 : 1.8834223e-08
건강 : 1.8710335e-08
다양 : 1.8353393e-08
생활 : 1.7318634e-08
주요 : 1.7007032e-08
활용 : 1.6942238e-08
현황 : 1.6879499e-08
확대 : 1.6829048e-08
지역 : 1.6790251e-08
글로벌 : 1.6782634e-08
에너지 : 1.5821024e-08
품질 : 1.5776575e-08
리스크 : 1.573659e-08
대응 : 1.5639374e-08
조직 : 1.5601254e-08
시스템 : 1.5142444e-08
공정 : 1.4890783e-08
구분 : 1.47747965e-08
물질 : 1.4724996e-08
해외 : 1.4536631e-08
점검 : 1.4458986e-08
업무 : 1.4301044e-08


### 6. DataFrame

In [34]:
w = []
v = []
for i in range(0, 100) :
    w.append(str(idx2vocab[sorted_index[i]]))
    v.append(str(score[sorted_index[i]]))

In [35]:
import pandas as pd

In [36]:
environment_df = pd.DataFrame({"keyword" : w,  "score" : v})

In [37]:
environment_df.to_csv("keyword.csv")

In [38]:
environment_df.astype({"score" : float})

Unnamed: 0,keyword,score
0,가출,2.521586e-08
1,극대,2.434415e-08
2,살균,2.283666e-08
3,일선,2.231143e-08
4,밀착력,2.137966e-08
...,...,...
95,사각지대,6.069865e-09
96,파티,6.060740e-09
97,태계,6.007694e-09
98,지정,5.920958e-09


### 7. Test Data

In [46]:
pdf_test = open("data/environment_S/삼성물산.pdf", "rb")
test = read_pdf_file(pdf_test)
pdf_test.close()



In [24]:
from konlpy.tag import Mecab
mecab = Mecab()

tokens_test = []
def word_token (x) :
    for token in mecab.pos(x):
        tokens_test.append(token)
    return tokens_test

word_token(test)

[('2020', 'SN'),
 ('SAMSUNG', 'SL'),
 ('C', 'SL'),
 ('&', 'SY'),
 ('T', 'SL'),
 ('CSR', 'SL'),
 ('REPORT', 'SL'),
 ('WILL', 'SL'),
 ('OFTOMORROW', 'SL'),
 ('HAPPINESS', 'SL'),
 ('2020', 'SN'),
 ('CSRREPORT', 'SL'),
 ('', 'SY'),
 ('', 'SY'),
 ('보고서', 'NNG'),
 ('개요', 'NNG'),
 ('삼성물산', 'NNP'),
 ('은', 'JX'),
 ('ESG', 'SL'),
 ('(', 'SSO'),
 ('Environment', 'SL'),
 ('·', 'SC'),
 ('Social', 'SL'),
 ('·', 'SC'),
 ('Governance', 'SL'),
 (')', 'SSC'),
 ('기반', 'NNG'),
 ('의', 'JKG'),
 ('비', 'XPN'),
 ('재무', 'NNG'),
 ('적', 'XSN'),
 ('성', 'NNG'),
 ('과', 'NNG'),
 ('와', 'JC'),
 ('활동', 'NNG'),
 ('을', 'JKO'),
 ('이해', 'NNG'),
 ('관계자', 'NNG'),
 ('들', 'XSN'),
 ('에게', 'JKB'),
 ('투명', 'NNG'),
 ('하', 'XSV'),
 ('게', 'EC'),
 ('공개', 'NNG'),
 ('하', 'XSV'),
 ('고', 'EC'),
 ('소통', 'NNG'),
 ('하', 'XSV'),
 ('고자', 'EC'),
 ('매년', 'MAG'),
 ('CSR', 'SL'),
 ('보', 'VV'),
 ('고서', 'EC'),
 ('를', 'JKO'),
 ('발간', 'NNG'),
 ('하', 'XSV'),
 ('고', 'EC'),
 ('있', 'VX'),
 ('으며', 'EC'),
 (',', 'SC'),
 ('2020', 'SN'),
 ('년', 'NNBC'),
 ('보고

In [25]:
from konlpy.tag import Mecab
mecab = Mecab()
stop = ["회사", "에서", "위해", "관련", "기준"]
# tokens = [token for token in mecab.pos(samsung_display)]

t_nodes = [t[0] for t in tokens_test]
t_vocab = [t[0] for t in tokens_test if t[0] not in stop if t[1] in ['NNG', 'NNP'] and len(t[0]) > 1]

print(t_nodes[:10])
print(t_vocab[:10])


['2020', 'SAMSUNG', 'C', '&', 'T', 'CSR', 'REPORT', 'WILL', 'OFTOMORROW', 'HAPPINESS']
['보고서', '개요', '삼성물산', '기반', '재무', '활동', '이해', '관계자', '투명', '공개']


In [29]:
t_vocab = list(set(t_vocab))

t_vocab2idx = {t_vocab[i]:i for i in range(len(t_vocab))}
t_idx2vocab = {i:t_vocab[i] for i in range(len(t_vocab))}

In [30]:
import numpy as np
import math

vocab_len = len(t_vocab2idx)

# 토큰별로 그래프 edge를 Matrix 형태로 생성
weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

# 각 토큰 노드별로 스코어 1로 초기화
score = np.ones((vocab_len),dtype=np.float32)

# coocurrence를 판단하기 위한 window 사이즈 설정
window_size = 4
covered_coocurrences = []

for window_start in range(len(t_nodes) - window_size + 1):
    window = t_nodes[window_start:window_start+window_size]
    for i in range(window_size):
        for j in range(i+1, window_size):
            if window[i] in t_vocab and window[j] in t_vocab:
                index_i = window_start + i
                index_j = window_start + j

                if (index_i, index_j) not in covered_coocurrences:
                    weighted_edge[t_vocab2idx[window[i]]][t_vocab2idx[window[j]]] = 1
                    weighted_edge[t_vocab2idx[window[j]]][t_vocab2idx[window[i]]] = 1
                    covered_coocurrences.append((index_i, index_j))

for i in range(vocab_len):
    row_sum = weighted_edge[i].sum()
    weighted_edge[i] = weighted_edge[i]/row_sum if row_sum > 0 else 0

MAX_ITERATIONS = 50
d=0.85
threshold = 0.0001 #convergence threshold

for iter in range(MAX_ITERATIONS):
    prev_score = np.copy(score)

    for i in range(vocab_len):
        summation = 0
        for j in range(vocab_len):
            if weighted_edge[j][i] != 0:
                summation += weighted_edge[j][i] * prev_score[j]

        score[i] = (1 - d) * d*summation

    if np.sum(np.fabs(prev_score -  score)) <= threshold:
        break


sorted_index = np.flip(np.argsort(score), 0)

n = 100


print("\n=== 핵심키워드 ===")
t_w = []
t_v = []
for i in range(0, n) :
    t_w.append(str(t_idx2vocab[sorted_index[i]]))
    t_v.append(str(score[sorted_index[i]]))


=== 핵심키워드 ===


In [31]:
t_w = []
t_v = []
for i in range(0, 100) :
    t_w.append(str(t_idx2vocab[sorted_index[i]]))
    t_v.append(str(score[sorted_index[i]]))

In [47]:
def JaccardSimilarity(list_inp1, list_inp2):

    mom = set(list_inp1).union(set(list_inp2))
    son = set(list_inp1).intersection(set(list_inp2))

    return len(son)/len(mom)

In [48]:
JaccardSimilarity(w, t_w)

0.005025125628140704