# 코퍼스에서 Window W로 동시발생행렬 구하기
- MapReduce Co-occurrence method
    - Pair기법
        - 윈도우 내 각 단어에 대해 가능한 모든 단어쌍-갯수를 나타내는 키-값 테이블 구성 
        - map: 입력(문서집합)에서 개별 단어로 분할, 단어쌍-1 키값 테이블 생성
        - reduce: 키 별 합산
    - Stripe기법
        - pair기법에서 key space가 매우 클 수 있음
        - 단어를 키로, 해당 단어와 함께 발생하는 다른 모든 단어의 빈도 해시맵을 값으로 하는 키-값 테이블 구성

In [1]:
from collections import defaultdict
import itertools

def build_cooccurrence_matrix_pair(corpus, window_size):
    """Count word co-occurrences by emitting one (word, neighbor) pair at a time.

    Args:
        corpus: Iterable of sentences; each sentence is split on whitespace.
        window_size: Number of positions to look at on each side of a word.

    Returns:
        A ``defaultdict`` mapping each word to a ``defaultdict(int)`` of
        neighbor -> count. A word only gets a row once it has at least one
        neighbor inside its window.
    """
    pair_counts = defaultdict(lambda: defaultdict(int))

    for sentence in corpus:
        tokens = sentence.split()
        token_total = len(tokens)
        for center, target in enumerate(tokens):
            left_edge = max(0, center - window_size)
            right_edge = min(token_total, center + window_size + 1)
            # Walk the left context, then the right context; the center
            # position itself is never visited.
            context_positions = itertools.chain(
                range(left_edge, center),
                range(center + 1, right_edge),
            )
            for position in context_positions:
                pair_counts[target][tokens[position]] += 1

    return pair_counts

# Example corpus: three short sentences that the builder splits on whitespace.
corpus = [
    "I love machine learning",
    "machine learning is great",
    "I love coding in Python"
]

# Look at up to two positions on each side of every word.
window_size = 2
cooccurrence_matrix_pair = build_cooccurrence_matrix_pair(corpus, window_size)

# Print each word with its neighbor counts; the inner defaultdict is converted
# to a plain dict so the output shows only the counts.
for word, neighbors in cooccurrence_matrix_pair.items():
    print(f"{word}: {dict(neighbors)}")

I: {'love': 2, 'machine': 1, 'coding': 1}
love: {'I': 2, 'machine': 1, 'learning': 1, 'coding': 1, 'in': 1}
machine: {'I': 1, 'love': 1, 'learning': 2, 'is': 1}
learning: {'love': 1, 'machine': 2, 'is': 1, 'great': 1}
is: {'machine': 1, 'learning': 1, 'great': 1}
great: {'learning': 1, 'is': 1}
coding: {'I': 1, 'love': 1, 'in': 1, 'Python': 1}
in: {'love': 1, 'coding': 1, 'Python': 1}
Python: {'coding': 1, 'in': 1}


In [2]:
from collections import defaultdict

def build_cooccurrence_matrix_stripe(corpus, window_size):
    """Count word co-occurrences by building one neighbor "stripe" per word occurrence.

    For every occurrence of a word, the neighbors inside its window are first
    tallied into a local map (the stripe), and the stripe is then added into
    that word's row of the result.

    Args:
        corpus: Iterable of sentences; each sentence is split on whitespace.
        window_size: Number of positions to look at on each side of a word.

    Returns:
        A ``defaultdict`` mapping each word to a ``defaultdict(int)`` of
        neighbor -> count. A word only gets a row once it has at least one
        neighbor inside its window.
    """
    merged = defaultdict(lambda: defaultdict(int))

    for sentence in corpus:
        tokens = sentence.split()
        token_total = len(tokens)
        for center, target in enumerate(tokens):
            left_edge = max(0, center - window_size)
            right_edge = min(token_total, center + window_size + 1)

            # Local tally for this single occurrence of `target`.
            local_stripe = {}
            for position in range(left_edge, right_edge):
                if position == center:
                    continue
                neighbor = tokens[position]
                local_stripe[neighbor] = local_stripe.get(neighbor, 0) + 1

            # Only touch the result row when there is something to merge, so
            # isolated words do not create empty rows.
            if local_stripe:
                row = merged[target]
                for neighbor, occurrences in local_stripe.items():
                    row[neighbor] += occurrences

    return merged

# Example corpus: three short sentences that the builder splits on whitespace.
corpus = [
    "I love machine learning",
    "machine learning is great",
    "I love coding in Python"
]

# Look at up to two positions on each side of every word.
window_size = 2
cooccurrence_matrix_stripe = build_cooccurrence_matrix_stripe(corpus, window_size)

# Print each word with its neighbor counts; the inner defaultdict is converted
# to a plain dict so the output shows only the counts.
for word, neighbors in cooccurrence_matrix_stripe.items():
    print(f"{word}: {dict(neighbors)}")

I: {'love': 2, 'machine': 1, 'coding': 1}
love: {'I': 2, 'machine': 1, 'learning': 1, 'coding': 1, 'in': 1}
machine: {'I': 1, 'love': 1, 'learning': 2, 'is': 1}
learning: {'love': 1, 'machine': 2, 'is': 1, 'great': 1}
is: {'machine': 1, 'learning': 1, 'great': 1}
great: {'learning': 1, 'is': 1}
coding: {'I': 1, 'love': 1, 'in': 1, 'Python': 1}
in: {'love': 1, 'coding': 1, 'Python': 1}
Python: {'coding': 1, 'in': 1}


# Corpus에서 TF-IDF 생성
- Term Frequency 생성
    - mapper 입력: 문서
    - mapper 출력: ((term, doc), 1)
    - reducer 출력: 합산 or 문서길이로 정규화
- Inverse document frequency 계산
    - mapper 입력: ((term, doc), tf)
    - mapper 출력: (term, 1)
    - reducer 출력
        - 합산하여 해당 단어가 포함된 문서의 총 개수
        - log(total_doc_num/해당 단어 포함 문서개수)
- TF, IDF 결합
    - mapper 입력: (term, (doc, tf, idf))
    - mapper 출력: ((term, doc), tf*idf)
        - 각 문서의 각 단어에 대한 tf-idf 점수 의미

In [3]:
import numpy as np
from collections import defaultdict
from math import log

def compute_tf(corpus):
    """Compute length-normalized term frequencies for every document.

    Args:
        corpus: Iterable of documents; each document is split on whitespace.

    Returns:
        A list with one dict per document, mapping each word to its count
        divided by the number of words in that document. A document with no
        words yields an empty dict.
    """

    def _relative_frequencies(document):
        # Tally raw counts first, then normalize by the document length.
        tokens = document.split()
        raw_counts = {}
        for token in tokens:
            raw_counts[token] = raw_counts.get(token, 0) + 1
        length = len(tokens)
        return {token: occurrences / length for token, occurrences in raw_counts.items()}

    return [_relative_frequencies(document) for document in corpus]

def compute_idf(corpus):
    """Compute the inverse document frequency of every word in the corpus.

    Args:
        corpus: Sized collection of documents; each document is split on
            whitespace.

    Returns:
        A dict mapping each word to ``log(N / df)``, where ``N`` is the number
        of documents and ``df`` is the number of documents containing the word.
        A word present in every document therefore scores 0.0.
    """
    # Taken before iterating, so an unsized corpus fails up front.
    document_total = len(corpus)

    containing_documents = defaultdict(int)
    for document in corpus:
        # A set ensures each word is counted at most once per document.
        for term in set(document.split()):
            containing_documents[term] += 1

    return {
        term: log(document_total / doc_freq)
        for term, doc_freq in containing_documents.items()
    }

def compute_tfidf(corpus):
    """Combine term frequency and inverse document frequency into TF-IDF scores.

    Args:
        corpus: Collection of documents, passed unchanged to ``compute_tf``
            and then to ``compute_idf``.

    Returns:
        A list with one dict per document, mapping each word in that document
        to its term frequency multiplied by the word's corpus-wide IDF.
    """
    per_document_tf = compute_tf(corpus)
    idf_by_term = compute_idf(corpus)
    return [
        {term: frequency * idf_by_term[term] for term, frequency in term_frequencies.items()}
        for term_frequencies in per_document_tf
    ]

# Example corpus: three short documents that the TF/IDF helpers split on whitespace.
corpus = [
    "I love machine learning",
    "machine learning is great",
    "I love coding in Python"
]

# One dict of word -> TF-IDF score per document, in corpus order.
tfidf = compute_tfidf(corpus)

# Print the scores per document, numbering documents from 1.
for i, document_tfidf in enumerate(tfidf):
    print(f"Document {i+1}: {document_tfidf}")

Document 1: {'I': 0.1013662770270411, 'love': 0.1013662770270411, 'machine': 0.1013662770270411, 'learning': 0.1013662770270411}
Document 2: {'machine': 0.1013662770270411, 'learning': 0.1013662770270411, 'is': 0.27465307216702745, 'great': 0.27465307216702745}
Document 3: {'I': 0.08109302162163289, 'love': 0.08109302162163289, 'coding': 0.21972245773362198, 'in': 0.21972245773362198, 'Python': 0.21972245773362198}
