# 인공지능 HW1
## -문서별 TF-IDF가 높은 상위 5개 단어 출력하기-
#### 담당 교수님: 김학수 교수님
#### 201711719 응용통계학과 심은선
#### 제출일: 2020.09.10
####    

In [1]:
import re
import numpy as np
import collections
from typing import List

## 1. 파일 읽기 및 전처리(숫자, 특수문자 제외)

In [2]:
# Relative paths of the three input text documents.
files = ['./wk1.단어중요도구하기/문서1.txt', 
         './wk1.단어중요도구하기/문서2.txt',
        './wk1.단어중요도구하기/문서3.txt']

In [3]:
def read_files(file_paths: List) -> List[str]:
    """Read every file in ``file_paths`` and return the contents.

    Args:
        file_paths: Paths of the text files to read. Each file is opened
            in text mode and decoded as UTF-8.

    Returns:
        One string per path, in the same order as ``file_paths``; each
        string holds the complete text of the corresponding file.
    """
    texts = []
    for path in file_paths:
        with open(path, 'r', encoding='utf-8') as in_file:
            # read() yields the whole file at once, which is what the
            # former line-by-line concatenation was rebuilding by hand.
            texts.append(in_file.read())

    return texts


def preprocess(docs: List) -> List[str]:
    """Strip everything but ASCII letters from each document and lowercase it.

    Every character that is not in ``a-z`` or ``A-Z`` (digits, punctuation,
    whitespace such as newlines, non-ASCII characters) is replaced by a
    single space, so word boundaries are preserved for a later
    ``str.split()``.

    Args:
        docs: Raw document strings.

    Returns:
        The cleaned, lowercased documents, in the same order as ``docs``.
    """
    # The previous pattern '[^a-z^A-Z]' had a second '^' inside the class,
    # where it is a literal caret rather than a negation, so '^' characters
    # were left in the text instead of being removed.
    non_letters = re.compile(r'[^a-zA-Z]')
    return [non_letters.sub(' ', text).lower() for text in docs]

In [4]:
# Raw (unprocessed) documents
docs = read_files(files)
print(docs)

['Konkuk University is one of the comprehensive private universities located in Seoul and Chungju.\nThe Seoul campus is located in the southeastern part of Seoul, near the Han River, and is served by a metro station of the same name.\nThe Seoul campus has 13 undergraduate colleges and 13 graduate schools, whereas the Glocal Campus in Chungju is composed of 7 undergraduate colleges and 2 graduate schools.\nCurrently, 23,000 students are attending the 238,980m² Seoul campus and 15,500 students attending the 31,763m² Global Campus in Chungju, which is about an hour away from the Seoul Campus. ', 'Computer engineering (CpE) is a branch of engineering that integrates several fields of computer science and electronic engineering required to develop computer hardware and software.\nComputer engineers usually have training in electronic engineering (or electrical engineering), software design, and hardware-software integration instead of only software engineering or electronic engineering.\nCo

In [5]:
# Preprocessed documents
preprocessed_docs = preprocess(docs)
print(preprocessed_docs)

['konkuk university is one of the comprehensive private universities located in seoul and chungju  the seoul campus is located in the southeastern part of seoul  near the han river  and is served by a metro station of the same name  the seoul campus has    undergraduate colleges and    graduate schools  whereas the glocal campus in chungju is composed of   undergraduate colleges and   graduate schools  currently         students are attending the        m  seoul campus and        students attending the       m  global campus in chungju  which is about an hour away from the seoul campus  ', 'computer engineering  cpe  is a branch of engineering that integrates several fields of computer science and electronic engineering required to develop computer hardware and software  computer engineers usually have training in electronic engineering  or electrical engineering   software design  and hardware software integration instead of only software engineering or electronic engineering  compute

## 2. TF-IDF 계산하기

In [6]:
def get_TF(docs: List):
    """Compute the term frequencies of each document.

    A document is tokenized by splitting on whitespace, and every token is
    counted within that document only.

    Args:
        docs: Document strings.

    Returns:
        A list with one ``collections.Counter`` per document (same order as
        ``docs``) mapping each word to its number of occurrences.
    """
    return [collections.Counter(text.split()) for text in docs]


def get_DF(docs: List):
    """Compute the document frequency of every word in the corpus.

    Args:
        docs: Document strings; each is tokenized on whitespace.

    Returns:
        A ``collections.Counter`` mapping each word to the number of
        documents in which it appears at least once.
    """
    document_frequency = collections.Counter()
    for doc in docs:
        # A set collapses repeats so a word counts once per document.
        document_frequency.update(set(doc.split()))
    return document_frequency



def get_TFIDF(file_paths: List):
    """Compute TF-IDF scores for every word of every file.

    The files are read and preprocessed, then each word is scored as
    ``count * ln(N / df)``, where ``count`` is the word's frequency in the
    document, ``N`` is the number of documents and ``df`` is the number of
    documents containing the word.

    Args:
        file_paths: Paths of the documents to analyse.

    Returns:
        A list with one dict per document (same order as ``file_paths``)
        mapping each word of that document to its TF-IDF score.
    """
    raw_docs = read_files(file_paths)
    cleaned_docs = preprocess(raw_docs)
    term_counts = get_TF(cleaned_docs)
    doc_freq = get_DF(cleaned_docs)
    n_docs = len(raw_docs)

    return [
        {
            word: count * np.log(n_docs / doc_freq[word])
            for word, count in counts.items()
        }
        for counts in term_counts
    ]

In [7]:
# Per-document term frequencies (TF)
tf = get_TF(preprocessed_docs)
print(tf)

[Counter({'the': 10, 'seoul': 6, 'campus': 6, 'is': 5, 'and': 5, 'of': 4, 'in': 4, 'chungju': 3, 'located': 2, 'undergraduate': 2, 'colleges': 2, 'graduate': 2, 'schools': 2, 'students': 2, 'attending': 2, 'm': 2, 'konkuk': 1, 'university': 1, 'one': 1, 'comprehensive': 1, 'private': 1, 'universities': 1, 'southeastern': 1, 'part': 1, 'near': 1, 'han': 1, 'river': 1, 'served': 1, 'by': 1, 'a': 1, 'metro': 1, 'station': 1, 'same': 1, 'name': 1, 'has': 1, 'whereas': 1, 'glocal': 1, 'composed': 1, 'currently': 1, 'are': 1, 'global': 1, 'which': 1, 'about': 1, 'an': 1, 'hour': 1, 'away': 1, 'from': 1}), Counter({'engineering': 8, 'computer': 6, 'of': 6, 'and': 5, 'software': 5, 'electronic': 3, 'hardware': 3, 'design': 3, 'to': 2, 'engineers': 2, 'in': 2, 'or': 2, 'only': 2, 'the': 2, 'how': 2, 'cpe': 1, 'is': 1, 'a': 1, 'branch': 1, 'that': 1, 'integrates': 1, 'several': 1, 'fields': 1, 'science': 1, 'required': 1, 'develop': 1, 'usually': 1, 'have': 1, 'training': 1, 'electrical': 1, 'in

In [8]:
# Document frequencies (DF) across all documents
df = get_DF(preprocessed_docs)
print(df)

Counter({'the': 3, 'is': 3, 'and': 3, 'of': 3, 'are': 2, 'in': 2, 'from': 2, 'by': 2, 'a': 2, 'to': 2, 'computers': 2, 'field': 2, 'that': 2, 'or': 2, 'konkuk': 1, 'han': 1, 'name': 1, 'currently': 1, 'whereas': 1, 'students': 1, 'graduate': 1, 'away': 1, 'same': 1, 'which': 1, 'colleges': 1, 'composed': 1, 'attending': 1, 'part': 1, 'has': 1, 'private': 1, 'one': 1, 'comprehensive': 1, 'chungju': 1, 'an': 1, 'about': 1, 'station': 1, 'global': 1, 'glocal': 1, 'universities': 1, 'campus': 1, 'located': 1, 'undergraduate': 1, 'river': 1, 'served': 1, 'seoul': 1, 'schools': 1, 'near': 1, 'hour': 1, 'm': 1, 'southeastern': 1, 'metro': 1, 'university': 1, 'but': 1, 'work': 1, 'hardware': 1, 'required': 1, 'focuses': 1, 'this': 1, 'how': 1, 'electronic': 1, 'develop': 1, 'engineers': 1, 'electrical': 1, 'training': 1, 'not': 1, 'circuit': 1, 'picture': 1, 'also': 1, 'integration': 1, 'cpe': 1, 'involved': 1, 'only': 1, 'on': 1, 'design': 1, 'engineering': 1, 'supercomputers': 1, 'fields': 1

In [9]:
# Per-document TF-IDF scores
docs_tf_idf = get_TFIDF(files)
print(docs_tf_idf)

[{'konkuk': 1.0986122886681098, 'university': 1.0986122886681098, 'is': 0.0, 'one': 1.0986122886681098, 'of': 0.0, 'the': 0.0, 'comprehensive': 1.0986122886681098, 'private': 1.0986122886681098, 'universities': 1.0986122886681098, 'located': 2.1972245773362196, 'in': 1.6218604324326575, 'seoul': 6.591673732008658, 'and': 0.0, 'chungju': 3.295836866004329, 'campus': 6.591673732008658, 'southeastern': 1.0986122886681098, 'part': 1.0986122886681098, 'near': 1.0986122886681098, 'han': 1.0986122886681098, 'river': 1.0986122886681098, 'served': 1.0986122886681098, 'by': 0.4054651081081644, 'a': 0.4054651081081644, 'metro': 1.0986122886681098, 'station': 1.0986122886681098, 'same': 1.0986122886681098, 'name': 1.0986122886681098, 'has': 1.0986122886681098, 'undergraduate': 2.1972245773362196, 'colleges': 2.1972245773362196, 'graduate': 2.1972245773362196, 'schools': 2.1972245773362196, 'whereas': 1.0986122886681098, 'glocal': 1.0986122886681098, 'composed': 1.0986122886681098, 'currently': 1.0

## 3. 문서별 상위 5개 단어 출력

In [10]:
def print_TFIDF(docs_tf_idf: List[dict], k: int):
    """Print the ``k`` words with the highest TF-IDF score of each document.

    Args:
        docs_tf_idf: One ``{word: tf-idf score}`` mapping per document.
        k: Number of words to print per document. When a document has fewer
            than ``k`` distinct words, all of them are printed instead of
            raising ``IndexError``; a non-positive ``k`` prints no words.
    """
    for doc_no, doc_tf_idf in enumerate(docs_tf_idf, start=1):
        # sorted() is stable, so words with equal scores keep the order in
        # which they appear in the mapping.
        ranked = sorted(doc_tf_idf.items(), key=lambda item: item[1], reverse=True)

        # The header reports the requested k rather than a hard-coded 5.
        print('[' + str(doc_no) + ' 번째 문서의 TF-IDF가 높은 상위 ' + str(k) + '개 단어 리스트]')
        # Slicing (unlike indexing with range(k)) tolerates short documents.
        for word, score in ranked[:max(k, 0)]:
            print('단어: ' + word + ',\t' + 'TF-IDF값: ' + str(score))
        print('\n')

In [11]:
# Print the top 5 words of each document
print_TFIDF(get_TFIDF(files), 5)

[1 번째 문서의 TF-IDF가 높은 상위 5개 단어 리스트]
단어: seoul,	TF-IDF값: 6.591673732008658
단어: campus,	TF-IDF값: 6.591673732008658
단어: chungju,	TF-IDF값: 3.295836866004329
단어: located,	TF-IDF값: 2.1972245773362196
단어: undergraduate,	TF-IDF값: 2.1972245773362196


[2 번째 문서의 TF-IDF가 높은 상위 5개 단어 리스트]
단어: engineering,	TF-IDF값: 8.788898309344878
단어: computer,	TF-IDF값: 6.591673732008658
단어: software,	TF-IDF값: 5.493061443340549
단어: electronic,	TF-IDF값: 3.295836866004329
단어: hardware,	TF-IDF값: 3.295836866004329


[3 번째 문서의 TF-IDF가 높은 상위 5개 단어 리스트]
단어: intelligence,	TF-IDF값: 5.493061443340549
단어: its,	TF-IDF값: 3.295836866004329
단어: artificial,	TF-IDF값: 2.1972245773362196
단어: ai,	TF-IDF값: 2.1972245773362196
단어: machines,	TF-IDF값: 2.1972245773362196


