In [28]:
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
from urllib.request import urlopen
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import re
import os
import numpy as np
import math
import pandas as pd

In [2]:
def read_pdf_file(pdfFile):
    pdfrm = PDFResourceManager()
    strio = StringIO()
    lapa = LAParams()
    device = TextConverter(pdfrm, strio, laparams = lapa)
    
    process_pdf(pdfrm, device, pdfFile)
    device.close()
    
    content = strio.getvalue()
    strio.close()
    return content

In [3]:
pdf_a = open("data/ESG_Mid/더블유원.pdf", "rb")
a = read_pdf_file(pdf_a)
pdf_a.close() 

pdf_b = open("data/ESG_Mid/동림푸드.pdf", "rb")
b = read_pdf_file(pdf_b)
pdf_b.close()

pdf_c = open("data/ESG_Mid/선일금고.pdf", "rb")
c = read_pdf_file(pdf_c)
pdf_c.close()

pdf_d = open("data/ESG_Mid/세림비앤지.pdf", "rb")
d = read_pdf_file(pdf_d)
pdf_d.close()

pdf_e = open("data/ESG_Mid/지리산한지.pdf", "rb")
e = read_pdf_file(pdf_e)
pdf_e.close()

pdf_f = open("data/ESG_Mid/창명제어기술.pdf", "rb")
f = read_pdf_file(pdf_f)
pdf_f.close() 

pdf_g = open("data/ESG_Mid/티제이에스.pdf", "rb")
g = read_pdf_file(pdf_g)
pdf_g.close()

pdf_h = open("data/ESG_Mid/한아툴스.pdf", "rb")
h = read_pdf_file(pdf_h)
pdf_h.close()

pdf_i = open("data/ESG_Mid/현대아이티.pdf", "rb")
i = read_pdf_file(pdf_i)
pdf_i.close()



In [4]:
from konlpy.tag import Mecab
mecab = Mecab()

def word_token (x) :
    tokens = []
    for token in mecab.pos(x):
        tokens.append(token)
    return tokens

a = word_token(a)
b = word_token(b)
c = word_token(c)
d = word_token(d)
e = word_token(e)
f = word_token(f)
g = word_token(g)
h = word_token(h)
i = word_token(i)

In [5]:
from konlpy.tag import Mecab
mecab = Mecab()
stop = ["회사", "에서", "위해", "관련", "기준"]

def vocab_nodes (x):
    nodes = [t[0] for t in x]
    vocab = [t[0] for t in x if t[0] not in stop if t[1] in ['NNG', 'NNP'] and len(t[0]) > 1]
    
    vocab = list(set(vocab))

    vocab2idx = {vocab[i]:i for i in range(len(vocab))}
    idx2vocab = {i:vocab[i] for i in range(len(vocab))}
    
    vocab_len = len(vocab2idx)

    # 토큰별로 그래프 edge를 Matrix 형태로 생성
    weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

    # 각 토큰 노드별로 스코어 1로 초기화
    score = np.ones((vocab_len),dtype=np.float32)

    # coocurrence를 판단하기 위한 window 사이즈 설정
    window_size = 4
    covered_coocurrences = []

    for window_start in range(len(nodes) - window_size + 1):
        window = nodes[window_start:window_start+window_size]
        for i in range(window_size):
            for j in range(i+1, window_size):
                if window[i] in vocab and window[j] in vocab:
                    index_i = window_start + i
                    index_j = window_start + j

                    if (index_i, index_j) not in covered_coocurrences:
                        weighted_edge[vocab2idx[window[i]]][vocab2idx[window[j]]] = 1
                        weighted_edge[vocab2idx[window[j]]][vocab2idx[window[i]]] = 1
                        covered_coocurrences.append((index_i, index_j))

    for i in range(vocab_len):
        row_sum = weighted_edge[i].sum()
        weighted_edge[i] = weighted_edge[i]/row_sum if row_sum > 0 else 0

    MAX_ITERATIONS = 50
    d=0.85
    threshold = 0.0001 #convergence threshold

    for iter in range(MAX_ITERATIONS):
        prev_score = np.copy(score)

        for i in range(vocab_len):
            summation = 0
            for j in range(vocab_len):
                if weighted_edge[j][i] != 0:
                    summation += weighted_edge[j][i] * prev_score[j]

            score[i] = (1 - d) * d*summation

        if np.sum(np.fabs(prev_score -  score)) <= threshold:
            break


    sorted_index = np.flip(np.argsort(score), 0)

    w = []
    v = []
    for i in range(0, 100) :
        w.append(str(idx2vocab[sorted_index[i]]))
        v.append(str(score[sorted_index[i]]))
    return w, v

In [6]:
w0, v0 = vocab_nodes (a)
w1, v1 = vocab_nodes (b)
w2, v2 = vocab_nodes (c)
w3, v3 = vocab_nodes (d)
w4, v4 = vocab_nodes (e)
w5, v5 = vocab_nodes (f)
w6, v6 = vocab_nodes (g)
w7, v7 = vocab_nodes (h)
w8, v8 = vocab_nodes (i)

In [7]:
attr = []
for i in range(1, 101):
    attr.append(f"속성{i}")

In [8]:
w0 = pd.Series(w0, attr)
w1 = pd.Series(w1, attr)
w2 = pd.Series(w2, attr)
w3 = pd.Series(w3, attr)
w4 = pd.Series(w4, attr)
w5 = pd.Series(w5, attr)
w6 = pd.Series(w6, attr)
w7 = pd.Series(w7, attr)
w8 = pd.Series(w8, attr)

In [9]:
df = pd.DataFrame([w0, w1, w2, w3, w4, w5, w6, w7, w8], index=[1, 2, 3, 4, 5, 6, 7, 8, 9])

In [47]:
df_result = pd.DataFrame(index=range(0, 9), columns=["result"])

In [18]:
def label (x) :
    col_list = list(df[x])
    en = LabelEncoder()
    en.fit(df[x])
    x_list = en.transform(df[x])
    
    return x_list

In [19]:
a = []
for i in df:
    a.append(label(f"{i}"))

In [36]:
df_label = pd.DataFrame(data = a, columns=[x for x in range(1, 10)])

In [42]:
df_label = df_label.transpose()

In [37]:
df_standard = pd.read_csv("data/csv/label.csv")

In [38]:
df_train = df_standard.drop(["y", "Unnamed: 0"], axis=1, inplace = False)
df_target = df_standard["y"]

In [48]:
model1 = RandomForestRegressor()

model1.fit(df_train, df_target)


pred1 = model1.predict(df_label)

df_result['result'] = pred1

In [49]:
df_result

Unnamed: 0,result
0,93.478
1,93.469
2,93.1
3,93.075
4,93.5
5,93.19
6,93.025
7,93.499
8,93.22
