In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from itertools import chain
from collections import ChainMap
from scipy.sparse import csr_matrix

In [2]:
from src.textanalyzer import MecabTokenization

In [3]:
docs = pd.read_csv('/root/beauty/tok_glpk.csv', header=None).sample(10000, random_state=42)[0].tolist()

In [4]:
tokenizer = MecabTokenization(custom_dir='/root/custom_dict')

---

In [5]:
proc_docs=[]
for i, doc in enumerate(docs):
    proc_docs.append(tokenizer.mecab.morphs(doc))
    
result = list(map(lambda doc:Counter(doc), proc_docs))

def token2count(tokens):
    """
    list of Tokens -> counts
    """
    pos = {'KEYPHRASE', 'NNG', 'NNP', 'VV', 'VA', 'XR', 'SL'} # 이건 Init으로 빼자.
    keywords = []
    for tok in tokens:
        if set(tok._pos.split('+')).intersection(pos):
            # stemming
            if '+' in tok._pos:
                for s in tok.expression.split('+'):
                    a, b, _= s.split('/')
                    if b in pos:
                        stem = a
                        p = b
            else:
                stem = tok.text
                p = tok._pos

            # lemmantization
            if p in {'VV', 'VA'}:
                stem = f"{stem}다"
            keywords.append(stem)        
    return Counter(keywords)

result = list(map(lambda doc:Counter(doc), proc_docs))

def document_term_matrix(count_docs):
    """
    * candidate selection은 preprocesseddocumnets에 구현.
    * candidate weighting
    """
    # Define idx2token and token2idx for represent the data by matrix
    idx2token = sorted(dict(ChainMap(*count_docs)).keys())
    token2idx = {tok:i for i, tok in enumerate(idx2token)}
    
    # Transform list-of-dict to document-term-matrix using sparse matrix
    rows = list(chain(*[[doc_idx] * len(doc) for doc_idx, doc in enumerate(count_docs)])) # for (i,j)~DTM row-wise index position
    cols, data = list(zip(*chain(*[doc.items() for doc in count_docs]))) # term keywords, data is frequence
    cols = [token2idx[c] for c in cols] # transform term keyword to for (i,j)~DTM column-wise index position
    dtm = csr_matrix((data, (rows, cols)))
    
    return dtm, idx2token, token2idx

dtm, idx2token, token2idx = document_term_matrix(result)

# from sklearn.feature_extraction.text import TfidfTransformer

# tfidf = TfidfTransformer(norm=False)
# dtm = tfidf.fit_transform(dtm)
# scores = np.squeeze(np.asarray(dtm.sum(axis=0))) # document-wise sum

In [8]:
def text2lm(text):
    tokens = tokenizer.mecab.morphs(text)
    text_len = len(tokens)
    d = {}
    for token in tokens:
        if token not in d:
            d[token] = 0
        d[token] += 1
    return d, text_len

def cnt_corpus(docs):
    docs_terms = {}
    df = {}
    total_df = len(docs)
    total_doc_len = 0
    for i, doc in enumerate(docs):
        doc_terms, doc_len = text2lm(doc)
        docs_terms[i] = doc_terms
        for item in doc_terms:
            if item not in df:
                df[item] = 0
            df[item] += 1
        total_doc_len += doc_len
    avg_doc_len = total_doc_len / total_df
    return docs_terms, df, total_df, avg_doc_len

import argparse
import json
import re
import math

import numpy as np
from scipy import spatial
from nltk.corpus import stopwords
from scipy.linalg import norm

class ClassicExtractor():
    # classical feature extractor
    def __init__(self, query_terms, doc_terms, df, total_df=None, avg_doc_len=None):
        """
        :param query_terms: query term -> tf
        :param doc_terms: doc term -> tf
        :param df: term -> df dict
        :param total_df: a int of total document frequency
        :param avg_doc_len: a float of avg document length
        """
        query_tf = [item[1] for item in query_terms.items()]
        query_df = []
        doc_tf = []
        for item in query_terms.items():
            if item[0] in df:
                query_df.append(df[item[0]])
            else:
                query_df.append(0)
            if item[0] in doc_terms:
                doc_tf.append(doc_terms[item[0]])
            else:
                doc_tf.append(0)
        
        self.query_tf = np.array(query_tf)
        self.query_df = np.array(query_df)
        self.doc_tf = np.array(doc_tf)

        self.doc_len = sum([item[1] for item in doc_terms.items()])
        if total_df is not None:
            self.total_df = total_df
        if avg_doc_len is not None:
            self.avg_doc_len = avg_doc_len

        self.k1 = 1.2
        self.b = 0.75
        self.dir_mu = 2500
        self.min_tf = 0.1
        self.jm_lambda = 0.4
        self.min_score = 1e-10
        return

    def get_feature(self):
        # l_sim_func = ['lm', 'lm_dir', 'lm_jm', 'lm_twoway',
        #               'bm25', 'coordinate', 'cosine', 'tf_idf',
        #               'bool_and', 'bool_or']
        features = {}
        features['lm'] = self.lm()
        features['lm_dir'] = self.lm_dir()
        features['lm_jm'] = self.lm_jm()
        features['lm_twoway'] = self.lm_twoway()
        features['bm25'] = self.bm25()
        features['coordinate'] = self.coordinate()
        features['cosine'] = self.cosine()
        features['tf_idf'] = self.tf_idf()
        features['bool_and'] = self.bool_and()
        features['bool_or'] = self.bool_or()
        return features

    def lm(self):
        if self.doc_len == 0:
            return np.log(self.min_score)
        v_tf = np.maximum(self.doc_tf, self.min_tf)
        v_tf /= self.doc_len
        v_tf = np.maximum(v_tf, self.min_score)
        score = np.log(v_tf).dot(self.query_tf)
        return score

    def lm_dir(self):
        if self.doc_len == 0:
            return np.log(self.min_score)
        v_q = self.query_tf / np.sum(self.query_tf)
        v_mid = (self.doc_tf + self.dir_mu * (self.query_df / self.total_df)) / (self.doc_len + self.dir_mu)
        v_mid = np.maximum(v_mid, self.min_score)
        score = np.log(v_mid).dot(v_q)
        return score

    def lm_jm(self):
        if self.doc_len == 0:
            return np.log(self.min_score)
        v_mid = self.doc_tf / self.doc_len * (1 - self.jm_lambda) + self.jm_lambda * self.query_df / self.total_df
        v_mid = np.maximum(v_mid, self.min_score)
        score = np.log(v_mid).dot(self.query_tf)
        return score

    def lm_twoway(self):
        if self.doc_len == 0:
            return np.log(self.min_score)
        v_mid = (self.doc_tf + self.dir_mu * (self.query_df / self.total_df)) / (self.doc_len + self.dir_mu)
        v_mid = v_mid * (1 - self.jm_lambda) + self.jm_lambda * self.query_df / self.total_df
        v_mid = np.maximum(v_mid, self.min_score)
        score = np.log(v_mid).dot(self.query_tf)
        return score

    def bm25(self):
        if self.doc_len == 0:
            return 0
        v_q = self.query_tf / float(np.sum(self.query_tf))
        v_tf_part = self.doc_tf * (self.k1 + 1) / (self.doc_tf + self.k1 * (1 - self.b + self.b * self.doc_len / self.avg_doc_len))
        v_mid = (self.total_df - self.query_df + 0.5) / (self.query_df + 0.5)
        v_mid = np.maximum(v_mid, 1.0)
        v_idf_q = np.log(v_mid)
        v_idf_q = np.maximum(v_idf_q, 0)
        score = v_mid.dot(v_tf_part * v_idf_q)
        score = max(score, 1.0)
        score = np.log(score)
        return score

    def cosine(self):
        if self.doc_len == 0:
            return 0
        if sum(self.doc_tf) == 0:
            return 0
        v_q = self.query_tf / float(np.sum(self.query_tf))
        v_d = self.doc_tf / float(self.doc_len)
        score = spatial.distance.cosine(v_q, v_d)
        if math.isnan(score):
            return 0
        return score

    def coordinate(self):
        return sum(self.doc_tf > 0)

    def bool_and(self):
        if self.coordinate() == len(self.query_tf):
            return 1
        return 0

    def bool_or(self):
        return min(1, self.coordinate())

    def tf_idf(self):
        if self.doc_len == 0:
            return 0
        normed_idf = np.log(1 + self.total_df / np.maximum(self.query_df, 1))
        normed_tf = self.doc_tf / self.doc_len
        return normed_idf.dot(normed_tf)

In [10]:
docs_terms, df, total_df, avg_doc_len = cnt_corpus(docs)

In [None]:
docs_term

In [13]:
[token2idx[term] for term in ['헤라']]

[18885]

In [None]:

ce = ClassicExtractor()

In [None]:
brand = {'헤라' : ['헤라'], '랑콤' : ['랑콤'], '베네피트'  : ['베네피트']}

In [None]:
class SBSAnalyzer:
    def __init__(self, ):
        pass
    def _ngram(self):
        return
    def token2count(self):
        return
    def document_term_matrix(self):
        return
    def network(self):
        return
    
    
    
    def prevalence(self):
        return
    def diversity(self):
        return
    
    def connectivity(self):
        return
    
    def sbs(self):
        return

- open matchfh rntu

In [7]:
docs_terms, df, total_df, avg_doc_len = cnt_corpus(docs)

df -> document frequency
total_df -> total document
avg_doc_len -> total_doc_len / total_df # term들의 수

query_terms -> vector화
docs_terms -> dtm으로 보자

In [222]:
doc = '설화수'

In [223]:
query_terms = np.array([0]*dtm.shape[1])

In [224]:
for term, freq in Counter(tokenizer.mecab.morphs(doc)).items():
    query_terms[token2idx[term]] += freq
    

In [266]:
from scipy.sparse import diags

In [9]:
dtm

<10000x19751 sparse matrix of type '<class 'numpy.int64'>'
	with 570380 stored elements in Compressed Sparse Row format>

In [227]:
query_terms

array([0, 0, 0, ..., 0, 0, 0])

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
dtm = tfidf.fit_transform(dtm)

In [28]:
scores = np.squeeze(np.asarray(dtm.sum(axis=0))) # document-wise sum

In [29]:
np.argsort(-scores)[:10]

array([  162, 18807,     0,  8475, 14392,   116,   138, 12447, 18327,
       16736])

In [26]:
np.argsort(-scores)[:10]

array([  162, 14462,  5665,     0, 18409,  3696, 13129,   116, 14357,
       14379])

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(norm=False)
dtm = tfidf.fit_transform(dtm)

In [30]:
tf = TfidfTransformer(use_idf=False, sublinear_tf=True)

In [32]:
cdtm = tf.fit_transform(dtm)

In [37]:
dtm.toarray()

array([[0.01480299, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01621181, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [36]:
cdtm.toarray()

array([[-0.18221219,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.25677745,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [19]:
scores

array([1.64724310e+04, 1.59689902e+03, 6.75585038e+02, ...,
       9.51729319e+00, 4.95871813e+01, 9.51729319e+00])

In [134]:
# df
(dtm > 0).sum(axis=0) # (1, term)

(1, 19751)

In [135]:
# total_df 
dtm.shape[0]

10000

In [146]:
# avg_doc_len
dtm.sum() / dtm.shape[0]

79.7427

In [209]:
np.argsort(-dtm.dot(query_terms.reshape(-1,1)).reshape(-1))

array([6409, 9043, 5015, ..., 7151,  466, 4999])

In [221]:
np.arr
idx2token[999]

'moisturizing'

In [None]:
q_tdm = 

In [217]:
dtm

<10000x19751 sparse matrix of type '<class 'numpy.int64'>'
	with 570380 stored elements in Compressed Sparse Row format>

In [None]:
idx2token

In [52]:
class MyExtractor:
    def __init__(self, dtm, idx2token):
        self.dtm = dtm # document-term matrix (document, term)
        self.idx2tokens = idx2token # term-list
        
        # document frequency
        self.df = (dtm > 0).sum(axis=0) # document frequency (1, term)
        self.total_df = dtm.shape[0]
        self.avg_doc_len = dtm.sum() / dtm.shape[0]
        
        # hyperparameters
        self.k1 = 1.2
        self.b = 0.75
        self.dir_mu = 2500
        self.min_tf = 0.1
        self.jm_lambda = 0.4
        self.min_score = 1e-10
    
    def bm25(self):
        v_tf_part = self.dtm * (self.k1 + 1) / (self.dtm + self.k1 * (1 - self.b + self.b * self.dtm.sum(axis=1) / self.avg_doc_len))        
        result=[]
        for i in range(len(self.idx2tokens)):
            v_mid = np.eye(len(self.idx2tokens), 1 ,i) * (self.total_df - 1 + 0.5)/ (1 + 0.5)
            v_idf_q = np.eye(1,len(self.idx2tokens),i) * np.log((self.total_df - 1 + 0.5)/ (1 + 0.5))
#             print(v_tf_part.shape, type(v_tf_part))
#             print(v_idf_q.reshape(-1).shape, type(v_idf_q))
#             print(v_mid.shape)
            scores = (np.asarray(v_tf_part) * v_idf_q.reshape(-1)).dot(v_mid)
            result.append(np.log(np.maximum(scores, 1.)))

        return result

In [53]:
me = MyExtractor(dtm, idx2token)

In [54]:
tf = me.bm25()

KeyboardInterrupt: 

In [1]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
TfidfTransformer()

In [None]:
np.broadcast_arrays()

In [36]:
np.eye(1, 10,k=1).reshape(-1).shape

(10,)

In [None]:
    # Fit_transform tfidf vector with normalization
    tfidf = TfidfTransformer(norm=False)
    dtm = tfidf.fit_transform(dtm)
    scores = np.squeeze(np.asarray(dtm.sum(axis=0))) # document-wise sum
    
    # 파레토 법칙ㅋㅋㅋ 상위 20%가 전체 컨텐츠의 80%를 대변함.
    if top_k is None:
        top_k = int(len(scores) * 0.2)

NameError: name 'dtm' is not defined

NameError: name 'tf' is not defined

In [314]:
tf.toarray()

array([[8.80482526, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 8.80482526, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 8.80482526, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 8.80482526, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 8.80482526,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        8.80482526]])

In [309]:
np.log(np.maximum(tf.toarray(), 1.))

array([[8.80482526, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 8.80482526, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 8.80482526, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 8.80482526, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 8.80482526,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        8.80482526]])

In [303]:
np.maximum(np.array([1.,3., 0.5, 2]), 1.)

array([1., 3., 1., 2.])

In [300]:
np.log(tf)

TypeError: loop of ufunc does not support argument 0 of type dia_matrix which has no callable log method

In [None]:
np.maximum()

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements (1 diagonals) in DIAgonal format>

In [None]:
np.log()

In [277]:
(me.total_df - 1 + 0.5) / 1

9999.5

In [236]:
tmp = np.array([1,2,3])

In [250]:
tmp = dtm.copy()

In [255]:
tmp.sum(axis=1).shape

(10000, 1)

In [247]:
tmp.toarray()

array([[3, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [249]:
dtm.toarray()

array([[2, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [241]:
dtm

<10000x19751 sparse matrix of type '<class 'numpy.int64'>'
	with 570380 stored elements in Compressed Sparse Row format>

<bound method MyExtractor.bm25 of <__main__.MyExtractor object at 0x7f2ffedc2f10>>

In [155]:
dtm

<10000x19751 sparse matrix of type '<class 'numpy.int64'>'
	with 570380 stored elements in Compressed Sparse Row format>

In [152]:
dtm.toarray()[0,:]

array([2, 0, 0, ..., 0, 0, 0])

In [153]:
dtm.toarray()[279,:]

array([1, 0, 0, ..., 0, 0, 0])

In [214]:
extractor = ClassicExtractor(query_terms, docs_terms[201], df, total_df, avg_doc_len=avg_doc_len)
extractor.get_feature()

AttributeError: 'numpy.ndarray' object has no attribute 'items'

In [78]:
dtm[:,token2idx['약산성']]

61

In [130]:
df['약산성']

54

In [103]:
df['세정력']

169

In [129]:
sum([1 if '약산성' in counts else 0 for _, counts in docs_terms.items()])

54

In [128]:
sum([counts.get('약산성', 0) for _, counts in docs_terms.items()])

63

In [95]:
sum([1 if '세정력' in counts else 0 for _, counts in docs_terms.items()])

169

In [94]:
df['세정력']

169

In [33]:
avg_doc_len


79.7427

In [59]:
doc = '샤넬 립 괜찮은듯.'
query_terms, query_len = text2lm(doc)

In [68]:
df['']

{'약산성': 54,
 '이': 7351,
 '라': 1945,
 '순': 58,
 '하': 6485,
 '긴': 886,
 '한데': 460,
 '세정력': 169,
 '엄청': 726,
 '좋': 5977,
 '은': 5430,
 '건': 744,
 '아니': 1170,
 '에요': 1126,
 'ㅎㅎ': 700,
 '향': 1634,
 '무': 230,
 '고': 7601,
 '닦': 262,
 '폼': 131,
 '클렌징': 353,
 '으로': 3088,
 '2': 641,
 '차': 145,
 '세안': 243,
 '필수': 94,
 '입': 252,
 '니당': 186,
 '!': 3728,
 '워터': 197,
 '만': 2009,
 '쓰': 3148,
 '시': 824,
 '는': 7066,
 '분': 1095,
 '들': 2487,
 '께': 138,
 '비추': 111,
 '에': 5890,
 '요': 1253,
 '~': 769,
 '그렇': 302,
 '다고': 769,
 '안': 2858,
 '바이': 13,
 '오더': 5,
 '마': 91,
 '보다': 1394,
 '잔여물': 36,
 '계속': 391,
 '남': 535,
 '느낌': 2031,
 '입니다': 1718,
 '아주': 413,
 '잘': 3235,
 '있': 4225,
 '다': 2497,
 '.': 5573,
 '색연필': 3,
 '깍': 7,
 '기': 2705,
 '^^': 134,
 '이거': 1453,
 '로': 2359,
 '맥퀸': 2,
 '뉴욕': 3,
 '아이라이너': 66,
 '으면': 537,
 '뭉개': 7,
 '짐': 126,
 '장난아': 5,
 '님': 80,
 '..': 1253,
 '나': 2580,
 '의': 1524,
 '컬러': 434,
 '링북': 1,
 '을': 3997,
 '책임질': 1,
 '아이': 343,
 '#': 356,
 '데일리': 211,
 '템': 440,
 '헤어': 122,
 '팩': 555,
 '아서':

In [67]:
extractor = ClassicExtractor(query_terms, docs_terms[201], df, total_df, avg_doc_len=avg_doc_len)
extractor.get_feature()

In [44]:
query_terms

{'설화수': 1, '비싸': 1, '고': 1, '별로': 1, '에요': 1, '.': 1}

In [58]:
docs[201]

'엄마 한테 생일 선물로 사줬는데 일단 지속력 합격 , 발색 좋음 !! 색깔이 너무 맘에 들었어요 연한 핑크 장미색에 코랄을 살짝 찍어 넣은 듯한 그런 부드럽고 은은하면서 귀여운 느낌이랄까 역시 샤넬은 어떤 색상을 사용해도 다 찰떡같아요 ! 쵝호'

In [50]:
docs_terms[200]

{'설': 1,
 '페이드': 1,
 '없': 1,
 '는': 6,
 '샴푸': 3,
 '찾': 1,
 '다가': 1,
 '애용': 1,
 '하': 2,
 '던': 1,
 '시드': 1,
 '물': 3,
 '에서': 1,
 '발견': 1,
 '고': 8,
 '샀': 1,
 '음': 4,
 '!': 2,
 '이전': 1,
 '에': 2,
 '려': 1,
 '썼': 1,
 '는데': 2,
 '천연': 1,
 '이': 3,
 '아니': 1,
 '라': 1,
 '세정력': 1,
 '은': 2,
 '좋': 3,
 '았': 1,
 '지만': 1,
 '향': 2,
 '역겹': 1,
 '불편': 1,
 '했': 1,
 '게다가': 1,
 '청색': 1,
 '1': 2,
 '호': 1,
 '들어갔': 1,
 'ㅜㅜ근데': 1,
 '이건': 2,
 '성분': 1,
 '도': 2,
 '사용': 1,
 '감': 3,
 '비누': 1,
 '로': 5,
 '듯이': 1,
 '뻑뻑': 1,
 '함': 4,
 '기름지': 1,
 '지': 1,
 '않': 2,
 '산뜻': 1,
 '그리고': 1,
 '아토피': 1,
 '있': 1,
 '동생': 1,
 '맨날': 1,
 '머리': 2,
 '긁': 1,
 '나서': 1,
 '안': 1,
 '가렵': 1,
 '다고': 1,
 '<': 1,
 '확실': 1,
 '치': 1,
 '으니': 1,
 '며칠': 1,
 '더': 2,
 '써': 1,
 '보': 1,
 '추가': 1,
 '미지근': 1,
 '한': 2,
 '두피': 1,
 '모공': 1,
 '열': 1,
 '차': 2,
 '조금': 1,
 '다음': 1,
 '헹구': 1,
 '2': 1,
 '거품': 1,
 '내주': 1,
 '면': 1,
 '세정': 1,
 '잘': 1,
 '됨': 1}

In [39]:
extractor.get_feature()

{'lm': -32.845013833906705,
 'lm_dir': -2.7215124848325485,
 'lm_jm': -21.689076639689407,
 'lm_twoway': -16.295780807057437,
 'bm25': 0.0,
 'coordinate': 1,
 'cosine': 0.5917517095361371,
 'tf_idf': 0.0239907400293694,
 'bool_and': 0,
 'bool_or': 1}

In [22]:
from collections import Counter

In [23]:
%%timeit
tmp_1= [Counter(doc) for doc in docs]

198 ms ± 656 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
%%timeit
tmp_2= [text2lm(doc) for doc in docs]

7.81 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
text2lm(docs[])

({'친구': 1,
  '꺼': 1,
  '빌려서': 1,
  '써': 1,
  '봤': 1,
  '는데': 2,
  '정말': 1,
  '짱짱': 1,
  '하': 2,
  '게': 1,
  '잘': 2,
  '올라가': 1,
  '힘': 1,
  '조절': 1,
  '못': 1,
  '시': 1,
  '면': 1,
  '속눈썹': 1,
  '이': 2,
  '쪼금': 1,
  '빠진다는': 1,
  '점': 1,
  '아쉬워': 1,
  '요': 1},
 28)

In [16]:
min(map(len, docs))

10

In [10]:
tokenizer.mecab.morphs('안녕 디지몬')

['안녕', '디지몬']

---

- 사전 업데이트

In [59]:
# from mecab import update_custom_dictionary
# update_custom_dictionary('/root/custom_dict/ap_custom_dict.csv')

---

- 돌다..

In [8]:
doc = "노란기"
tokenizer.mecab.pos(doc)

[('노란', 'VA+ETM'), ('기', 'ETN')]

In [9]:
tokenizer(doc).Tokens

[Token(DocId='20210308-103506-912355', offset=0, start=0, end=3, text='노란기')]

- 리무버
    - 아이리무버
    - 네일리무버
    - ??
- 브러쉬
    
    - 쉐도우
    - 메컵
    - 메컵베이스
- 원플러스원
    - 원플
    - 원플원
    - 원플러스원
- 리뉴얼
- 오일리
- 왁싱
- 가성비
- 

---

In [5]:
proc_docs=[]
for i, doc in enumerate(docs):
    proc_docs.append(tokenizer.mecab.morphs(doc))

- tokenizable=False 제거
- pos 추출하기

In [None]:
brand = {'헤라' : ['헤라'], '랑콤' : ['랑콤'], '베네피트'  : ['베네피트']}

class SBSAnalyzer:
    def __init__(self, ):
        pass
    def _ngram(self):
        return
    def token2count(self):
        return
    def document_term_matrix(self):
        return
    def network(self):
        return
    
    
    
    def prevalence(self):
        return
    def diversity(self):
        return
    
    def connectivity(self):
        return
    
    def sbs(self):
        return

- 상위 20%

### token2count
- list of tokens을 list of counter로 만들기.
- stem, lemantization 구현
- (추가) 적절한 pos와 조합을 고민하자

In [70]:
def token2count(tokens):
    """
    list of Tokens -> counts
    """
    pos = {'KEYPHRASE', 'NNG', 'NNP', 'VV', 'VA', 'XR', 'SL'} # 이건 Init으로 빼자.
    keywords = []
    for tok in tokens:
        if set(tok._pos.split('+')).intersection(pos):
            # stemming
            if '+' in tok._pos:
                for s in tok.expression.split('+'):
                    a, b, _= s.split('/')
                    if b in pos:
                        stem = a
                        p = b
            else:
                stem = tok.text
                p = tok._pos

            # lemmantization
            if p in {'VV', 'VA'}:
                stem = f"{stem}다"
            keywords.append(stem)        
    return Counter(keywords)

In [6]:
#     def _ngram(self, unigram: List[Token], n: int) -> List[Token]:
#         return [ngram for ngram in zip(*[unigram[i:] for i in range(n)])]

In [113]:
lambda doc:Counter(doc)

['약산성',
 '이',
 '라',
 '순',
 '하',
 '긴',
 '한데',
 '세정력',
 '이',
 '엄청',
 '좋',
 '은',
 '건',
 '아니',
 '에요',
 'ㅎㅎ',
 '향',
 '은',
 '무',
 '향',
 '이',
 '고',
 '닦',
 '고',
 '폼',
 '클렌징',
 '으로',
 '2',
 '차',
 '세안',
 '필수',
 '입',
 '니당',
 '!',
 '클렌징',
 '워터',
 '만',
 '쓰',
 '시',
 '는',
 '분',
 '들',
 '께',
 '는',
 '비추',
 '에',
 '요',
 '~',
 '그렇',
 '다고',
 '세정력',
 '이',
 '안',
 '좋',
 '은',
 '건',
 '아니',
 '고',
 '바이',
 '오더',
 '마',
 '보다',
 '는',
 '잔여물',
 '이',
 '계속',
 '남',
 '는',
 '느낌',
 '입니다',
 '!']

### dtm
- dtm 행렬 만들기..

In [7]:
def document_term_matrix(count_docs):
    """
    * candidate selection은 preprocesseddocumnets에 구현.
    * candidate weighting
    """
    # Define idx2token and token2idx for represent the data by matrix
    idx2token = sorted(dict(ChainMap(*count_docs)).keys())
    token2idx = {tok:i for i, tok in enumerate(idx2token)}
    
    # Transform list-of-dict to document-term-matrix using sparse matrix
    rows = list(chain(*[[doc_idx] * len(doc) for doc_idx, doc in enumerate(count_docs)])) # for (i,j)~DTM row-wise index position
    cols, data = list(zip(*chain(*[doc.items() for doc in count_docs]))) # term keywords, data is frequence
    cols = [token2idx[c] for c in cols] # transform term keyword to for (i,j)~DTM column-wise index position
    dtm = csr_matrix((data, (rows, cols)))
    
    return dtm, idx2token, token2idx

In [8]:
dtm, idx2token, token2idx = document_term_matrix(result)

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(norm=False)
dtm = tfidf.fit_transform(dtm)
scores = np.squeeze(np.asarray(dtm.sum(axis=0))) # document-wise sum

In [13]:
top_k = int(len(scores) * 0.2)

In [14]:
keyphrase_args = np.argsort(-scores)[:top_k]
keyphrase = [(idx2token[i], scores[i]) for i in keyphrase_args]

In [16]:
len(keyphrase)

2505

In [61]:
brands = {'헤라', '랑콤'}
[x for x, _ in keyphrase if x in brands]

['헤라', '랑콤']

### SBS
- Prevalence

We can now proceed with the calculation of prevalence, which counts the frequency of occurrence of each brand name —  subsequently standardized considering the scores of all the words in the texts. My choice of standardization here is to subtract the mean and divide by the standard deviation. Other approaches are also possible. This step is important to compare measures carried out considering different time frames or sets of documents (e.g. brand importance on Twitter in April and May). Normalization of absolute scores is necessary before summing prevalence, diversity and connectivity to obtain the Semantic Brand Score.

In [None]:

#PREVALENCE
#Import Counter and Numpy
from collections import Counter
import numpy as np

#Create a dictionary with frequency counts for each word
countPR = Counter()
for t in texts:
    countPR.update(Counter(t))

#Calculate average score and standard deviation
avgPR = np.mean(list(countPR.values()))
stdPR = np.std(list(countPR.values()))

#Calculate standardized Prevalence for each brand
PREVALENCE = {}
for brand in brands:
    PREVALENCE[brand] = (countPR[brand] - avgPR) / stdPR
    print("Prevalence", brand, PREVALENCE[brand])

- mean, std는 전체에서

In [203]:
mu = np.mean(scores)
std = np.std(scores)

In [206]:
prevalence_top_k = [(tok, (s-mu)/std) for tok, s in keyphrase]

- DIVERSITY

In [36]:
g_dtm = dtm[:,keyphrase_args]
g_idx2token = [idx2token[idx] for idx in keyphrase_args]

In [37]:
co_occurence = g_dtm.T.dot(g_dtm)

In [38]:
df = pd.DataFrame(co_occurence.toarray(), index=g_idx2token, columns=g_idx2token)

In [39]:
edge_list = pd.melt(df.reset_index(), id_vars=['index'])

In [40]:
edge_list.columns = ['source','target','weight']

In [41]:
edge_list.shape

(6275025, 3)

In [43]:
import networkx as nx

In [45]:
link_filter = 2
G = nx.Graph()
G.add_nodes_from(edge_list.source.unique()) # node 추가
edge_set = [(s,t,{'weight':w}) for s,t,w in edge_list.loc[edge_list.weight > link_filter].values] # weight == 0인 엣지는 제외.

In [47]:
G.add_edges_from(edge_set) # edge 추가

In [48]:
# isolate
isolates = set(nx.isolates(G))
G.remove_nodes_from(isolates)

In [50]:
#INSTALL AND IMPORT THE DISTINCTIVENESS PACKAGE
#pip install -U distinctiveness
from distinctiveness.dc import distinctiveness

#DIVERSITY
#Calculate Distinctiveness Centrality
DC = distinctiveness(G, normalize = False, alpha = 1)
DIVERSITY_sequence=DC["D2"]

#Calculate average score and standard deviation
avgDI = np.mean(list(DIVERSITY_sequence.values()))
stdDI = np.std(list(DIVERSITY_sequence.values()))
#Calculate standardized Diversity for each brand
DIVERSITY = {}
for brand in brands:
    DIVERSITY[brand] = (DIVERSITY_sequence[brand] - avgDI) / stdDI
    print("Diversity", brand, DIVERSITY[brand])



NameError: name 'brands' is not defined

- Connectivity

In [None]:
#Define inverse weights 
for u,v,data in G_filtered.edges(data=True):
    if 'weight' in data and data['weight'] != 0:
        data['inverse'] = 1/data['weight']
    else:
        data['inverse'] = 1   

#CONNECTIVITY
CONNECTIVITY_sequence=nx.betweenness_centrality(G_filtered, normalized=False, weight ='inverse')
#Calculate average score and standard deviation
avgCO = np.mean(list(CONNECTIVITY_sequence.values()))
stdCO = np.std(list(CONNECTIVITY_sequence.values()))
#Calculate standardized Prevalence for each brand
CONNECTIVITY = {}
for brand in brands:
    CONNECTIVITY[brand] = (CONNECTIVITY_sequence[brand] - avgCO) / stdCO
    print("Connectivity", brand, CONNECTIVITY[brand])
Connectivity alice 0.6363388120984864
Connectivity rabbit -0.05570445522732375
The Semantic Brand Score of each brand is finally obtained by summing the standardized values of prevalence, diversity and connectivity. Different approaches are also possible.

In [7]:
#Obtain the Semantic Brand Score of each brand
SBS = {}
for brand in brands:
    SBS[brand] = PREVALENCE[brand] + DIVERSITY[brand] + CONNECTIVITY[brand]
    print("SBS", brand, SBS[brand])

In [183]:
tokenizer.mecab.pos('어느정도 좋네요')

[('어느', 'MM'), ('정도', 'NNG'), ('좋', 'VA'), ('네요', 'EC')]

In [109]:
sub[-5].features

{'semantic': None,
 'has_jongseong': False,
 'reading': '부드러워',
 'type': 'Inflect',
 'start_pos': 'VA',
 'end_pos': 'EC',
 'expression': '부드럽/VA/*+어/EC/*',
 '_pos': 'VA+EC',
 '_space': False}

In [157]:
Counter(keywords)

Counter({'데일리': 1,
         '템': 2,
         '헤어팩': 1,
         '향': 1,
         '좋다': 1,
         '사용': 4,
         '머릿결': 1,
         '부드럽다': 1,
         '지성': 1,
         '피다': 1})

In [148]:


keywords = []
for tok in tokens:
    if set(tok._pos.split('+')).intersection(pos):
        # stemming
        if '+' in tok._pos:
            for s in tok.expression.split('+'):
                a, b, _= s.split('/')
                if b in pos:
                    stem = a
                    p = b
        else:
            stem = tok.text
            p = tok._pos
        
        # lemmantization
        if p in {'VV', 'VA'}:
            stem = f"{stem}다"
        keywords.append(stem)

In [149]:
keywords

['데일리',
 '템',
 '헤어팩',
 '향',
 '좋다',
 '사용',
 '템',
 '사용',
 '머릿결',
 '부드럽다',
 '지성',
 '피다',
 '사용',
 '사용']

In [155]:
tokenizer.mecab.pos('지성두피라')

[('지성', 'NNG'), ('두', 'JX'), ('피', 'VV'), ('라', 'EC')]

In [151]:
[(tok.text, tok._pos) for tok in tokens]

[('데일리', 'NNP'),
 ('템', 'NNG'),
 ('이', 'JKS'),
 ('헤어팩', 'KEYPHRASE'),
 ('향', 'NNG'),
 ('이', 'JKS'),
 ('좋', 'VA'),
 ('아서', 'EC'),
 ('계속', 'MAG'),
 ('계속', 'MAG'),
 ('사용', 'NNG'),
 ('하', 'XSV'),
 ('는', 'ETM'),
 ('템', 'NNG'),
 ('입니다', 'VCP+EF'),
 ('.', 'SF'),
 ('이것', 'NP'),
 ('을', 'JKO'),
 ('사용', 'NNG'),
 ('하', 'XSV'),
 ('면', 'EC'),
 ('머릿결', 'NNG'),
 ('이', 'JKS'),
 ('부드러워', 'VA+EC'),
 ('지', 'VX'),
 ('긴', 'ETN+JX'),
 ('한데', 'VX+EF'),
 ('.', 'SF'),
 ('..', 'SY'),
 ('지성', 'NNG'),
 ('두', 'JX'),
 ('피', 'VV'),
 ('라', 'EF'),
 ('.', 'SF'),
 ('.', 'SY'),
 ('자주', 'MAG'),
 ('사용', 'NNG'),
 ('하', 'XSV'),
 ('기', 'ETN'),
 ('에', 'JKB'),
 ('는', 'JX'),
 ('.', 'SF'),
 ('.', 'SY'),
 ('그래서', 'MAJ'),
 ('가끔', 'MAG'),
 ('씩', 'XSN'),
 ('사용', 'NNG'),
 ('중', 'NNB'),
 ('입니다', 'VCP+EF'),
 ('!', 'SF')]

In [132]:
keywords

['데일리',
 '템',
 '헤어팩',
 '향',
 '좋',
 '사용',
 '템',
 '사용',
 '머릿결',
 '어',
 '지성',
 '피',
 '사용',
 '사용']

In [127]:
pos

'NNP'

In [126]:
tokens[0].pos

AttributeError: 'Token' object has no attribute 'pos'

In [119]:
keywords

[]

In [None]:
{'VV', 'VA'}

In [None]:
def lemantize(token):
    if ''
    token.

In [81]:
tokens[-2]._pos

'VCP+EF'

In [61]:
{'VV', 'EC'}.intersection(pos)

{'VV'}

In [63]:
tokens[-1]._pos.split('+')

['SF']

In [40]:
tokens[14]._pos

'EF'

In [69]:
def document_term_matrix(count_docs):
    """
    * candidate selection은 preprocesseddocumnets에 구현.
    * candidate weighting
    """
    # Define idx2token and token2idx for represent the data by matrix
    idx2token = sorted(dict(ChainMap(*count_docs)).keys())
    token2idx = {tok:i for i, tok in enumerate(idx2token)}
    
    # Transform list-of-dict to document-term-matrix using sparse matrix
    rows = list(chain(*[[doc_idx] * len(doc) for doc_idx, doc in enumerate(count_docs)])) # for (i,j)~DTM row-wise index position
    cols, data = list(zip(*chain(*[doc.items() for doc in count_docs]))) # term keywords, data is frequence
    cols = [token2idx[c] for c in cols] # transform term keyword to for (i,j)~DTM column-wise index position
    dtm = csr_matrix((data, (rows, cols)))
    
    return dtm, idx2token, token2idx

In [13]:
len(tokenizer._normalize(docs[i]))

335

In [12]:
len(tokenizer._preprocess(docs[i]))

327

In [14]:
tokenizer._preprocess(docs[i])

'전 02 그레이 브라운 쓰고 있는데요 이거 1년이 지나도 쓰고 있는 아이템 떨어트리지만 않으면 괜찮은 아이템 ㅎㅎ 깨지기쉬운 딱 그런타입 하지만 눈썹 브러쉬도 맘에들고 양쪽 섞어 바르면 자연스러운 눈썹 연출하기 너무 좋아요 단 너무 찍어 바르면 눈썹만 동동 떠보인다는.. 색감조절 양조절 손등에 테스트 해보시고 발라보세요 !! 한번 떨어트렸다가 진심 쪼개져서 난감햇다는 .. 어휴 양 진짜 안줄어요 ㅋㅋㅋㅋㅋㅋ신기해요 앞으로 2년은 더쓸걸요 뭉치지 않게만 살살 브러쉬로 풀어주면 잘 지워지지도 않고 정말 유용템임!!  눈썹브로우추천   눈썹브로우추천   눈썹브로우추천   눈썹브로우추천'

In [17]:
import re

In [19]:
re.compile('#평가단\*')

re.compile(r'#평가단\*', re.UNICODE)

In [None]:
f'#([{emojis}\w-]+)'

In [16]:
process.hashtag.findall(docs[i])

['눈썹브로우추천', '자연스러운브로우', '오래쓰는브로우', '더페이스샵브로우마스터아이브로우키트추천']

In [15]:
docs[i]

'전 02 그레이 브라운 쓰고 있는데요 이거 1년이 지나도 쓰고 있는 아이템 떨어트리지만 않으면 괜찮은 아이템 ㅎㅎ 깨지기쉬운 딱 그런타입 하지만 눈썹 브러쉬도 맘에들고 양쪽 섞어 바르면 자연스러운 눈썹 연출하기 너무 좋아요 단 너무 찍어 바르면 눈썹만 동동 떠보인다는.. 색감조절 양조절 손등에 테스트 해보시고 발라보세요 !! 한번 떨어트렸다가 진심 쪼개져서 난감햇다는 .. 어휴 양 진짜 안줄어요 ㅋㅋㅋㅋㅋㅋ신기해요 앞으로 2년은 더쓸걸요 뭉치지 않게만 살살 브러쉬로 풀어주면 잘 지워지지도 않고 정말 유용템임!! #눈썹브로우추천 #자연스러운브로우 #오래쓰는브로우 #더페이스샵브로우마스터아이브로우키트추천'

In [6]:
from src.textanalyzer import KoPreprocessing

In [7]:
process = KoPreprocessing()

In [35]:
"느그{".strip("$()*+.?[\^{|")

'느그'

In [24]:
p = re.compile(f"#{process.hashtag.findall(docs[445])[0].strip('*!')}")

In [27]:
p.sub(f" {process.hashtag.findall(docs[445])[0].strip('*!')} ", docs[445])

SyntaxError: f-string expression part cannot include a backslash (<ipython-input-27-74d8cffb7542>, line 1)

In [23]:
process.hashtag.sub(f" {'평가단리뷰'} ",docs[445])

' 평가단리뷰  피부타입 :티존 지성, 유존 건성인 복합성피부 (가끔 여드름 올라옴)* 향 :오렌지껍질향 (전성분 맨마지막 왕귤껍질오일 때문인거같네요.* 제형 :묽은 로션+수분크림 느낌* 느낌 :수분감이 넘쳤고 그렇다고 수분감만 넘치면 건조한느낌이 빨리 오는데 유분도 적당하게 들어가서 건조해지지 않았어요.수분감도 팍! 유분감도 팍!피부가 예민해서 트러블 잘 올라오는데 다행히 이제품은 트러블 없습니다.수분크림없어도 이 제품만 발라도 되더라구요.저에게 가장 큰 효과는 속건조가 느껴지지않아서 요즘도 잘 쓰고있습니다.* 아쉬운점:용기가 바디로션용처럼 나온것같아요. 한번 펌핑하면 양이 너무 많아서 반만 펌핑중입니다. 가격을 생각하면 용기도 좀 저렴해보이구여용량이 적어 아쉽네요.제품을 무상으로 제공받아 사용 후 솔직하게 작성된 리뷰입니다.'

In [23]:
len(docs[445])

411

In [31]:
Counter(docs[445])[' ']

80

In [32]:
80/411

0.19464720194647203

In [20]:
process.replace_hashtag(docs[445])

' 평가단리뷰* * 피부타입 :티존 지성, 유존 건성인 복합성피부 (가끔 여드름 올라옴)* 향 :오렌지껍질향 (전성분 맨마지막 왕귤껍질오일 때문인거같네요.* 제형 :묽은 로션+수분크림 느낌* 느낌 :수분감이 넘쳤고 그렇다고 수분감만 넘치면 건조한느낌이 빨리 오는데 유분도 적당하게 들어가서 건조해지지 않았어요.수분감도 팍! 유분감도 팍!피부가 예민해서 트러블 잘 올라오는데 다행히 이제품은 트러블 없습니다.수분크림없어도 이 제품만 발라도 되더라구요.저에게 가장 큰 효과는 속건조가 느껴지지않아서 요즘도 잘 쓰고있습니다.* 아쉬운점:용기가 바디로션용처럼 나온것같아요. 한번 펌핑하면 양이 너무 많아서 반만 펌핑중입니다. 가격을 생각하면 용기도 좀 저렴해보이구여용량이 적어 아쉽네요.제품을 무상으로 제공받아 사용 후 솔직하게 작성된 리뷰입니다.'

In [17]:
tokenizer._preprocess(docs[445])

'평가단리뷰* * 피부타입 :티존 지성, 유존 건성인 복합성피부 (가끔 여드름 올라옴)* 향 :오렌지껍질향 (전성분 맨마지막 왕귤껍질오일 때문인거같네요.* 제형 :묽은 로션+수분크림 느낌* 느낌 :수분감이 넘쳤고 그렇다고 수분감만 넘치면 건조한느낌이 빨리 오는데 유분도 적당하게 들어가서 건조해지지 않았어요.수분감도 팍! 유분감도 팍!피부가 예민해서 트러블 잘 올라오는데 다행히 이제품은 트러블 없습니다.수분크림없어도 이 제품만 발라도 되더라구요.저에게 가장 큰 효과는 속건조가 느껴지지않아서 요즘도 잘 쓰고있습니다.* 아쉬운점:용기가 바디로션용처럼 나온것같아요. 한번 펌핑하면 양이 너무 많아서 반만 펌핑중입니다. 가격을 생각하면 용기도 좀 저렴해보이구여용량이 적어 아쉽네요.제품을 무상으로 제공받아 사용 후 솔직하게 작성된 리뷰입니다.'

In [15]:
tokenizer._normalize(docs[445])

'#평가단리뷰* 피부타입 :티존 지성, 유존 건성인 복합성피부 (가끔 여드름 올라옴)* 향 :오렌지껍질향 (전성분 맨마지막 왕귤껍질오일 때문인거같네요.* 제형 :묽은 로션+수분크림 느낌* 느낌 :수분감이 넘쳤고 그렇다고 수분감만 넘치면 건조한느낌이 빨리 오는데 유분도 적당하게 들어가서 건조해지지 않았어요.수분감도 팍! 유분감도 팍!피부가 예민해서 트러블 잘 올라오는데 다행히 이제품은 트러블 없습니다.수분크림없어도 이 제품만 발라도 되더라구요.저에게 가장 큰 효과는 속건조가 느껴지지않아서 요즘도 잘 쓰고있습니다.* 아쉬운점:용기가 바디로션용처럼 나온것같아요. 한번 펌핑하면 양이 너무 많아서 반만 펌핑중입니다. 가격을 생각하면 용기도 좀 저렴해보이구여용량이 적어 아쉽네요.제품을 무상으로 제공받아 사용 후 솔직하게 작성된 리뷰입니다.'

In [33]:
from pororo import Pororo

In [34]:
spacing= Pororo(task='gec', lang='ko')

2021-03-03 22:48:42,992-INFO: [input] dictionary: 4005 types
2021-03-03 22:48:42,994-INFO: [label] dictionary: 9 types


As of now, this beta model tries to correct spacing errors in Korean text.


In [8]:
docs[445]

'#평가단리뷰* 피부타입 :티존 지성, 유존 건성인 복합성피부 (가끔 여드름 올라옴)* 향 :오렌지껍질향 (전성분 맨마지막 왕귤껍질오일 때문인거같네요.* 제형 :묽은 로션+수분크림 느낌* 느낌 :수분감이 넘쳤고 그렇다고 수분감만 넘치면 건조한느낌이 빨리 오는데 유분도 적당하게 들어가서 건조해지지 않았어요.수분감도 팍! 유분감도 팍!피부가 예민해서 트러블 잘 올라오는데 다행히 이제품은 트러블 없습니다.수분크림없어도 이 제품만 발라도 되더라구요.저에게 가장 큰 효과는 속건조가 느껴지지않아서 요즘도 잘 쓰고있습니다.* 아쉬운점:용기가 바디로션용처럼 나온것같아요. 한번 펌핑하면 양이 너무 많아서 반만 펌핑중입니다. 가격을 생각하면 용기도 좀 저렴해보이구여용량이 적어 아쉽네요.제품을 무상으로 제공받아 사용 후 솔직하게 작성된 리뷰입니다.'

In [None]:
tokenizer = MecabTokenization(custom_dir='/root/custom_dict')

pos = {'KEYPHRASE', 'NNG', 'NNP', 'VV', 'VA', 'XR', 'SL', 'SY'}

doc = '손에 붙거나 머리에 발랐을때 뭉치는 현상 없어서 좋아요. 지금까지 삿던 오일제품중 제일 좋아요. 계속 써보고 다음에도 구입할께요'
[t for t, p in tokenizer(doc) if p in pos]



19 + 2 + 738 + 6 + 63 + 712 + 879 + 1946

from itertools import chain

chain()



tokenizer(doc).Tokens[3]



class SBSAnalyzer:
    def __init__(self):
        self.pos = {'KEYPHRASE', 'NNG', 'NNP', 'VV', 'VA', 'XR', 'SL', 'SY'}
        
    @property
    def keyphrase(self):
        

class MecabTCG(TokenCandidateGeneration):
    def __init__(self, ngram=1):
        self.pos = {'KEYPHRASE', 'NNG', 'NNP', 'VV', 'VA', 'XR', 'SL', 'SY'}
        self.N = ngram
    
    def get_candidate(self, doc: Doc) -> Doc:
        if doc.tokenizable:
            unigram = [token for token in doc.tokens if (token.pos in self.pos) and (token.text.strip() != '')]
            candidates = [unigram]
            if self.N > 1:
                for i in range(1, self.N):
                    candidates.append(self._ngram(unigram, (i+1)))
            doc.candidates = candidates
            return doc
        else:
            doc.candidates = [[]]
            return doc
    
    def _ngram(self, unigram: List[Token], n: int) -> List[Token]:
        return [ngram for ngram in zip(*[unigram[i:] for i in range(n)])]

- keywords
    - 다양한 로직으로 정해짐.
- prevalence
- diversity
- connectivity
- sbs

- brand how?

prevalence
diversity

다큐먼트...


tokens = tokenizer('커버력짱인듯 다크닝도없고 참 좋아요 프라이머랑 같이구매했는데 아직까진 대만족합니당수정화장용으로 좋은거같아요~').Tokens