In [1]:
import joblib
import re
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from konlpy.tag import Okt

In [2]:
# Load the sample workbook with the openpyxl engine; row 0 is used as the
# header and the first column becomes the DataFrame index.
df = pd.read_excel("./df_sample06.xlsx", engine="openpyxl", header=0, index_col=0)

In [3]:
# Summarise the loaded frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76 entries, 0 to 75
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   내용           76 non-null     object
 1   학교           76 non-null     object
 2   school_code  76 non-null     int64 
 3   전공           76 non-null     object
 4   진로           76 non-null     object
 5   career_code  76 non-null     int64 
 6   인성           76 non-null     object
dtypes: int64(2), object(5)
memory usage: 4.8+ KB


In [4]:
# Keep only the '내용' column; it is the text that gets split into sentences below.
content = df['내용']

In [5]:
# Confirm the selected Series has no missing entries before splitting it.
content.info()

<class 'pandas.core.series.Series'>
Int64Index: 76 entries, 0 to 75
Series name: 내용
Non-Null Count  Dtype 
--------------  ----- 
76 non-null     object
dtypes: object(1)
memory usage: 1.2+ KB


In [6]:
# Placeholder for the sentence-level table that the next cell fills in.
context = pd.DataFrame()

In [7]:
tagger = Okt()

# Split every entry of `content` into sentence-like fragments and collect
# (cleaned_text, source_position) pairs. The rows are gathered in a plain list
# and turned into a DataFrame once at the end: concatenating inside the loop
# copies the whole frame on every iteration, and appending to the existing
# `context` would duplicate rows whenever this cell is re-run.
records = []
for doc_id, document in enumerate(content):
    # Remove the space after each period so that splitting on "." does not
    # leave fragments with a leading blank.
    for sentence in document.replace(". ", ".").split("."):
        # Fragments of five characters or fewer are skipped.
        if len(sentence) <= 5:
            continue
        # Keep only Hangul syllables, ASCII letters, digits and spaces.
        cleaned = re.sub(r"[^가-힣a-zA-Z0-9 ]", "", sentence)
        records.append((cleaned, doc_id))

# Column 0: cleaned fragment text; column 1: position of the source entry in `content`.
context = pd.DataFrame(records, columns=[0, 1])

In [8]:
# Check how many sentence fragments were produced and the dtypes of both columns.
context.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1875 entries, 0 to 1874
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1875 non-null   object
 1   1       1875 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 29.4+ KB


In [9]:
def get_morphs(text):
    """Return the morphemes of ``text`` that are longer than one character.

    The text is tokenised with the module-level ``tagger`` via its ``morphs``
    method; single-character tokens are discarded.
    """
    kept = []
    for morph in tagger.morphs(text):
        if len(morph) > 1:
            kept.append(morph)
    return kept

In [10]:
def get_nouns(text):
    """Return the nouns of ``text`` that are longer than one character.

    The text is analysed with the module-level ``tagger`` via its ``nouns``
    method; single-character results are discarded.
    """
    kept = []
    for noun in tagger.nouns(text):
        if len(noun) > 1:
            kept.append(noun)
    return kept

In [11]:
# Tokenise with the morpheme splitter defined above. A custom tokenizer
# replaces the regex-based one, so token_pattern is set to None to state
# that explicitly and to avoid scikit-learn's "token_pattern will not be
# used" warning.
tfidf = TfidfVectorizer(tokenizer=get_morphs, token_pattern=None)

In [12]:
# Learn the vocabulary from the cleaned fragments in column 0 and build the
# TF-IDF matrix (one row per fragment).
tdm = tfidf.fit_transform(context[0])

In [13]:
# Peek at the last ten entries of the learned vocabulary.
tfidf.get_feature_names_out()[-10:]

array(['힘들어하여', '힘들어해', '힘들었고', '힘들었다', '힘들었던', '힘들었지만', '힘써야', '힘쓰고',
       '힘쓰고자', '힘쓸'], dtype=object)

In [63]:
def get_similarity(idx, tdm):
    """Find the row of ``tdm`` most similar to row ``idx``, excluding itself.

    Parameters
    ----------
    idx : int
        Zero-based row position of the query; must satisfy
        ``0 <= idx < tdm.shape[0]``.
    tdm : 2-D matrix (sparse or dense)
        One row per text, one column per term.

    Returns
    -------
    sim_idx : int
        Row position of the best match. On ties the lowest position wins.
    similarity : float
        Cosine similarity between row ``idx`` and row ``sim_idx``.

    Raises
    ------
    ValueError
        If ``tdm`` has fewer than two rows, so there is nothing to compare with.
    IndexError
        If ``idx`` is outside the valid row range.
    """
    n_rows = tdm.shape[0]
    if n_rows < 2:
        raise ValueError("tdm must contain at least two rows to find a neighbour")
    if not 0 <= idx < n_rows:
        raise IndexError(f"idx {idx} is out of range for {n_rows} rows")

    # Slice instead of indexing so the query stays 2-D for dense arrays as
    # well as sparse matrices.
    sim = cosine_similarity(tdm[idx:idx + 1], tdm).ravel()
    # Mask the self-comparison so a row can never be returned as its own match.
    sim[idx] = -np.inf

    sim_idx = np.argmax(sim)
    return sim_idx, sim[sim_idx]

In [64]:
# Column 1 holds, for every fragment, the position of the entry in `content` it came from.
label = context[1]

In [65]:
# Look up the nearest neighbour of one sample fragment and show the matched
# position, both texts and the similarity score, separated by dashed rules.
idx = 412
sim_idx, similarity = get_similarity(idx, tdm)
print(
    sim_idx,
    "-"*20,
    context.loc[idx, 0],
    "-"*20,
    context.loc[sim_idx, 0],
    "-"*20,
    similarity,
    sep="\n",
)

505
--------------------
영어는 학교 수업 외에도 영미문화이해 동아리 등 다앙한 활동으로 의사소통 능력율 키웠다
--------------------
영어는 학교 공부 외에도 영어 토론과 모의유엔 영어 신문 동아리 활동에 참여해 자유로운 소통 능력 을 키웠다
--------------------
0.390797858105065


In [68]:
# Same inspection for another sample fragment: print the query text, its
# nearest neighbour and the similarity score, separated by dashed rules.
idx = 661
sim_idx, similarity = get_similarity(idx, tdm)
print(
    context.loc[idx, 0],
    "-"*20,
    context.loc[sim_idx, 0],
    "-"*20,
    similarity,
    sep="\n",
)

이외에도 장래 희망을 이루기 위해 노력해왔다
--------------------
나의 장래 희망은 중학교 영 어 교 사 다 
--------------------
0.5116894552917461


In [45]:
# Persist the TF-IDF matrix and the fragment table together under the keys
# 'tdm' and 'context'. The fitted vectorizer itself is not stored.
with open('Coverletter.pkl', 'wb') as f:
    joblib.dump({'tdm': tdm, 'context': context}, f)

In [46]:
# Export the fragment table to an Excel workbook for inspection outside the notebook.
context.to_excel("context.xlsx")

## LDA 시도

TODO: TDM을 문서 단위, 혹은 문단으로 나눈 단위로 만들어야 할 듯.
또한 형태소 분석을 명사 기준으로 진행하여 재시도할 예정.

In [None]:
# SECURITY: joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load a Coverletter.pkl produced by this notebook.
with open('Coverletter.pkl', 'rb') as f:
    data = joblib.load(f)

# Bind the stored objects explicitly instead of calling locals().update(data):
# the names this cell defines stay visible to readers and tooling, and the
# cell no longer relies on locals() being writable at module scope.
tdm = data['tdm']
context = data['context']

In [None]:
# Sanity check: number of fragments restored from the pickle.
len(context)

In [None]:
# Vocabulary terms in column order of the TF-IDF matrix.
# NOTE: `tfidf` is not part of Coverletter.pkl, so this cell only works when
# the vectorizer cell above has been run in the same kernel session.
words = tfidf.get_feature_names_out()

In [None]:
# Map each column position to its term; passed to LdaModel as id2word below.
word_dict = dict(enumerate(words))

In [None]:
# Vocabulary size.
len(word_dict)

In [None]:
from gensim.matutils import Sparse2Corpus

In [None]:
# Wrap the transposed TF-IDF matrix as a gensim corpus.
# NOTE(review): the transpose presumably puts documents in columns, which is
# what Sparse2Corpus appears to expect by default — confirm against gensim docs.
corpus = Sparse2Corpus(tdm.T)
corpus

In [None]:
from gensim.models import LdaModel

In [None]:
# Train a 20-topic LDA model on the fragment corpus. LDA training is
# stochastic, so the seed is fixed to make the topics reproducible when the
# notebook is re-run.
lda = LdaModel(corpus=corpus, num_topics=20, id2word=word_dict, random_state=42)

In [None]:
# Inspect the highest-weighted terms of topic 5.
lda.show_topic(5)