### 단어\_태그\_T/F(받침여부)\_원단어의발음
### https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=4
# 명사 NNG, 동사 VV, 형용사 VA


In [1]:
from konlpy.tag import Mecab
import pickle
import re
import sys
from gensim.models import TfidfModel
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
from pprint import pprint
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import mglearn
from pprint import pprint
import numpy as np
import gc
import copy
import pandas as pd


class SB_Word2Vec():
    """Word2Vec embedding plus a word-to-sentence relevance matrix.

    Building an instance trains the embedding on ``morph_list`` (a list of
    token lists, one per sentence) and immediately computes
    ``word2sen_matrix``, whose entry ``[w, s]`` is the similarity-weighted
    frequency of vocabulary word ``w`` in sentence ``s``.
    """

    def __init__(self, morph_list):
        self.dct = Dictionary(morph_list)
        self.corpus = [self.dct.doc2bow(line) for line in morph_list]
        self.build_Word2Vec(morph_list)

    def make_Word2Vec(self, morph_list, size=50, window=2, min_count=10, iteration=100):
        """Train the embedding and index its vocabulary.

        Sets ``self.em`` (the model), ``self.em_vocab`` (list of words) and
        ``self.em_vocab_dic`` (word -> row/column index).
        """
        # NOTE(review): ``size=``/``iter=`` and ``wv.vocab`` are the gensim 3.x
        # spellings; confirm the installed gensim version before upgrading.
        self.em = Word2Vec(morph_list, size=size, window=window, min_count=min_count, iter=iteration)
        self.em_vocab = list(self.em.wv.vocab.keys())
        self.em_vocab_dic = {word: idx for idx, word in enumerate(self.em_vocab)}

    def make_Word2Sen_matrix(self):
        """Build and return ``word2sen_matrix`` (vocab_size x n_sentences).

        ``sen_matrix`` holds per-sentence frequencies of the embedded words,
        ``sim_matrix`` holds pairwise word similarities, and the result is
        ``sim_matrix . sen_matrix^T``.
        """
        vocab_size = len(self.em_vocab)
        self.sen_matrix = np.zeros((len(self.corpus), vocab_size))
        for sen_idx, bow in enumerate(self.corpus):
            for token_id, frequency in bow:
                # Dict lookup instead of scanning the vocabulary list for
                # every token; words dropped by min_count have no column.
                column = self.em_vocab_dic.get(self.dct[token_id])
                if column is not None:
                    self.sen_matrix[sen_idx][column] = frequency

        self.sim_matrix = np.zeros((vocab_size, vocab_size))
        for idx, w1 in enumerate(self.em_vocab):
            for idx2, w2 in enumerate(self.em_vocab):
                self.sim_matrix[idx][idx2] = self.em.wv.similarity(w1, w2)

        self.word2sen_matrix = np.dot(self.sim_matrix, np.transpose(self.sen_matrix))

        return self.word2sen_matrix

    def get_sim_sen(self, keyword, main_text, number=1):
        """Print the ``number`` sentences most related to ``keyword``.

        Returns every sentence index ordered from most to least related.
        Raises KeyError when ``keyword`` is not in the embedding vocabulary.
        """
        scores = self.word2sen_matrix[self.em_vocab_dic[keyword]]
        self.sim_sen_index = np.argsort(scores)
        self.most_sim_sen_index = np.argmax(scores)
        # argsort is ascending, so reverse it to put the best match first.
        index_list = self.sim_sen_index.reshape((-1,)).tolist()
        index_list.reverse()

        for rank, sen_idx in enumerate(index_list[:number]):
            print(str(rank + 1))
            print(main_text[sen_idx])
        return index_list

    def build_Word2Vec(self, morph_list):
        """Train the embedding, then derive the word-to-sentence matrix."""
        self.make_Word2Vec(morph_list)
        self.make_Word2Sen_matrix()
        
        
class SB_LDA():
    """Bag-of-words LDA topic model with helpers to inspect the topics."""

    def make_lda(self, morph_joined, ntopic=10, learning_method='batch', max_iter=25, random_state=0, n_words=20):
        """Fit LDA on space-joined documents and print the top words per topic.

        Sets ``vect``, ``X``, ``lda``, ``document_topics`` (documents x
        topics), ``sorting`` (word indices per topic, most important first)
        and ``feature_names``.
        """
        self.vect = CountVectorizer(max_features=10000, max_df=.15)
        self.X = self.vect.fit_transform(morph_joined)
        self.lda = LatentDirichletAllocation(n_components=ntopic, learning_method=learning_method, max_iter=max_iter, random_state=random_state)
        self.document_topics = self.lda.fit_transform(self.X)
        self.sorting = np.argsort(self.lda.components_, axis=1)[:, ::-1]
        # Newer scikit-learn only offers get_feature_names_out(); fall back to
        # the old accessor on versions that predate it.
        get_names = getattr(self.vect, 'get_feature_names_out', None) or self.vect.get_feature_names
        self.feature_names = np.array(get_names())
        mglearn.tools.print_topics(topics=range(ntopic), feature_names=self.feature_names, sorting=self.sorting, topics_per_chunk=5, n_words=n_words)

    def related_doc(self, main_text_list, topic_index, number=10):
        """Print and return the ``number`` documents weighted most on a topic.

        Returns a list of ``(document_index, text)`` tuples, strongest first.
        """
        ranked = np.argsort(self.document_topics[:, topic_index])[::-1]
        related_docs = []
        for doc_idx in ranked[:number]:
            print(doc_idx)
            print(main_text_list[doc_idx] + ".\n")
            related_docs.append((doc_idx, main_text_list[doc_idx]))
        return related_docs

class SB_Tfidf():
    """TF-IDF weights per document, reported with the words themselves."""

    def __init__(self, list_morph_merged):
        self.list_morph_merged = list_morph_merged
        self.dct = Dictionary(self.list_morph_merged)
        self.corpus = [self.dct.doc2bow(doc) for doc in self.list_morph_merged]

    def get_tfidf(self):
        """Return, per document, ``(word, weight)`` pairs sorted by weight.

        Also stores the fitted model in ``self.model``, the id-based result
        in ``self.tfidf`` and the word-based result in ``self.tfidf_hangul``.
        """
        self.model = TfidfModel(self.corpus)
        # Heaviest terms first within each document.
        self.tfidf = [
            sorted(self.model[bow], key=lambda pair: pair[1], reverse=True)
            for bow in self.corpus
        ]
        # Swap the numeric token ids for the actual words.
        self.tfidf_hangul = [
            [(self.dct[pair[0]], pair[1]) for pair in weighted]
            for weighted in self.tfidf
        ]

        return self.tfidf_hangul
    
def frequency(merged):
    """Return ``(word, count)`` pairs for ``merged``, most frequent first.

    Words with equal counts keep the order in which they first appeared.
    """
    counts = Counter(merged)
    pairs = [(word, counts[word]) for word in counts]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs






class Social_analysis():
    """Load social-media posts into ``self.df`` and run Korean text analysis.

    Posts come from a pickle file or a database table; the methods then
    tokenise the text with Mecab, merge custom n-grams, extract words by
    part of speech and filter rows by keyword.
    """
    
    # Maps every code point from 0x10000 up to sys.maxunicode to U+FFFD;
    # used with str.translate() to replace non-BMP characters.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    # word -> synonym, filled in by load_dictionary().
    syn_dic = {}
    # word or synonym -> theme, filled in by load_dictionary().
    theme_dic = {}
    # Words that filter_words() removes.
    del_list = []
    # dictionary key -> replacement token, filled in by load_dictionary()
    # and consulted by ngram().
    ngram_dic = {}
    # Single-syllable words that pos_extractor() keeps even though it
    # otherwise drops words of length one.
    exception_list=['맛', '밥', '물', '몸', '없', '있', '싫', '달', '굳', '굿', '속']
    
    # CSV read by load_dictionary().
    default_dic_path = 'Data/custom_dic.csv'
    # NOTE(review): not referenced by any method visible in this file.
    replace_dic = 'Data/replace_dic.csv'
    
    
    def __init__(self):
        """Create the Mecab tagger and load the custom dictionary.

        A dictionary that fails to load is reported on stdout and otherwise
        ignored, so the instance is still usable without it.
        """
        # load_dictionary() writes into these tables. Binding fresh dicts on
        # the instance keeps entries from an earlier instance (or an earlier
        # version of the CSV) from lingering in the shared class attributes.
        self.syn_dic = {}
        self.theme_dic = {}
        self.ngram_dic = {}
        self.mecab = Mecab()
        try:
            self.load_dictionary()
        except Exception as e:
            # Deliberately best-effort: report the problem and carry on.
            print('dictionary error\n', e)
    def load_dictionary(self, mode='default'):
        path = self.default_dic_path
        self.dic_df = pd.read_csv(path, encoding='cp949')
        for i in range(len(self.dic_df)):
            key = self.dic_df.loc[i,'key']
            value = self.dic_df.loc[i, 'value']
            syn = self.dic_df.loc[i, 'syn']
            theme = self.dic_df.loc[i, 'theme']

            if pd.isna(value):
                print('Need key & value')
                return

            self.ngram_dic[key] = value
            
            if not pd.isna(theme):
                value = value.split('_')[0]
                if not pd.isna(syn):
                    self.syn_dic[value] = syn
                    self.theme_dic[syn] = theme
                else:
                    self.theme_dic[value.split('_')[0]] = theme
            else:
                pass
    
    def DB_to_table(self, DBname='intake', keyword='intake'):
        """Load the postings crawled for ``keyword`` into ``self.df``.

        Queries the ``instaPosting`` table of database ``DBname``. The SQL
        template is kept in ``self.query``; ``keyword`` is sent as a bound
        parameter rather than formatted into the statement.
        """
        import pymssql
        import pandas.io.sql as pdsql

        self.query = \
        """
        SELECT user_id, created_at, main_text, hashtags, comments, likes, current_url FROM instaPosting WHERE keyword = %s
        """
        # NOTE(review): host and credentials are hard-coded in source; they
        # should be moved to configuration or environment variables.
        conn = pymssql.connect("intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh", "ghintake", DBname)
        try:
            self.df = pdsql.read_sql_query(self.query, con=conn, params=(keyword,))
            # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
            # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        finally:
            # Release the connection even when the query fails.
            conn.close()
    
    def pickle_to_table(self, filename, columns=['user_id', 'created_at', 'main_text', 'hashtags', 'comments', 'likes', 'current_url']):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.df = pd.DataFrame(np.array(data), columns=['user_id', 'created_at', 'main_text', 'hashtags', 'comments', 'likes', 'current_url'])
  
    def hashtags_split(self, hashtags):        
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))
        
        hashtags_list = []
        
        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list
        
        return hashtags_list

    
    def add_dictionary(self, *tokenized_list):
        origin_df = 1
        try:
            origin_df = pd.read_csv("C:\\mecab\\user-dic\\intake_dic.csv", encoding='utf-8', header=None)
        except:
            print('No default intake_dic')
        keyword_list = []   
        for i in tokenized_list:
            if type(i) == list:
                for j in i:
                    j = j.split('_')
                    temp = [j[0],'' ,'' ,'' ,j[1],'*',j[2], j[3],'*','*','*','*','*']
                    keyword_list.append(temp)
            else:
                i = i.split('_')
                temp = [i[0],'','','',i[1],'*',i[2], i[3], '*','*','*','*','*']
                keyword_list.append(temp)


        keyword_df = pd.DataFrame(keyword_list)
        print(type(origin_df))
        if type(origin_df) != int:
            keyword_df = pd.concat((origin_df, keyword_df), ignore_index=True)
        else: 
            print('a')
            pass
        print(keyword_df.shape)

        keyword_df.to_csv("C:\\mecab\\user-dic\\intake_dic.csv", encoding='utf-8',index=None, header=False)    

                
    def ngram(self, parsed_list):
        ngram_list = []        
        adjustment = 0
        # 단어_tag의 리스트

        for idx in range(len(parsed_list)):
            idx2 = idx + adjustment

            if (idx2+self.ngram_size) > (len(parsed_list)):
                ngram_list.extend(parsed_list[idx2:])
                break
            n_filter = tuple(parsed_list[idx2: idx2 + self.ngram_size])
            key = ''.join([k.split('_')[0] for k in n_filter])
            if key in self.ngram_dic:
                ngram_list.append(self.ngram_dic[key])
                adjustment += (self.ngram_size - 1)
            else:
                ngram_list.append(n_filter[0])

        if self.ngram_size <= 1:
            return ngram_list
        else:
            self.ngram_size -= 1
            return self.ngram(ngram_list)       

        
    def morph_pos(self, text_list,  mode='list'):
        
        morph_list = []
        
        for j in text_list:
            parsed = self.mecab.pos(j)
            temp = []
            for i in parsed:
                if self.isHangul(i[0]):
                    temp.append('{}_{}'.format(i[0], i[1]))
                else: pass#print('{} 한글이 아님.'.format(i[0]))

            self.ngram_size = 6
            morph_list.append(self.ngram(temp))
            
        self.df['morph_list'] = morph_list
        
        return morph_list

    def filter_words(self, parsed_list, mode='syn'):
        # 1차원 리스트를 받음.
        
        if mode == 'None':
            return
        
        changed_list = list(map(lambda x: self.syn_dic.get(x, x) , parsed_list))
        deleted_list = list(filter(lambda x: x not in self.del_list, changed_list))
        
        if mode == 'theme':
            theme_list = list(map(lambda x: self.theme_dic.get(x, x) , deleted_list))
            return theme_list            
        else:
            return deleted_list

    def set_with_order(self, sequence):
        seen = set()
        result = [x for x in sequence if not (x in seen or seen.add(x))]
        return result

    
    def pos_extractor(self, parsed, mode = 'list', degree = 'syn'):
        
        
        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []
        total_list = [nav_list, noun_list, adj_list, verb_list]
        
        for j in parsed:
            nav_temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            temp_list = [nav_temp,  n_temp, adj_temp, verb_temp]
            
            for i in j:
                i = i.split('_')
                if self.isHangul(i[0]):
                    if (len(i[0]) > 1) or (i[0] in self.exception_list):                        
                        if 'NN' in i[1]:
                            n_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif 'VV'in i[1]:
                            adj_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif 'VA' in i[1]:
                            verb_temp.append(i[0])
                            nav_temp.append(i[0])
                    else: pass
                        #print('{} 제외'.format(i[0]))
                else: pass#print('{} 한글이 아님.'.format(i[0]))

            
            for idx, li in enumerate(total_list):
                if mode == 'list':
                    li.append(temp_list[idx])
                elif mode == 'set':
                    li.append(self.set_with_order(self.filter_words(temp_list[idx], degree)))
                else:
                    print('Check mode')
                    return
            
            
        columns=['nav_list', 'noun_list', 'adj_list', 'verb_list']
        for i in  zip(columns, total_list):
            self.df[i[0]] = i[1]
            
        #return nav_list, noun_list, adj_list, verb_list # tuple(map(lambda x: [j.split('_')[0] for j in x], [nav_list, noun_list, adj_list, verb_list]))

    
    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    
    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list
 
    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized):
            hashtag_splited.append(i.split('/'))
            return hastag_splited
        
    '''    
    def join_underbar(self, morph_list):

        all_list = []
        post_list=[]
        for i in morph_list:
            for j in i:
                post_list.append(j[0]+'_'+j[1])
            all_list.append([(' , ').join(post_list)])
            post_list=[] 
        all_list=np.array(all_list)
        
        return all_list'''

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item,item) for item in i]

        del sub_book
        gc.collect()

        return dataset
    
    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
             dataset[n] = [i for i in line if i not in del_list]

        return dataset

    
    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
    
    def convert_list(self, *tokenized_list):
        input_length = len(tokenized_list)
        lists = [[] for i in range(input_length)]

        for idx, li in enumerate(tokenized_list):
            for j in li:
                lists[idx].append(['/'.join(j)])

        converted_array = np.array(lists[0])
        for idx in range(input_length):
            try:
                converted_array = np.concatenate((converted_array, lists[idx + 1]), axis=1)
            except Exception as e:
                print(e,'끝')

        return converted_array

    def make_df(self, start_array, converted_array, end_array, columns=['user_id', 'created_at', 'main_text', 'morph_list', 'nav_list', 'noun_list', 'adj_list', 'verb_list', 'hashtags', 'comments', 'likes', 'current_url']):         
        df = pd.DataFrame(np.hstack((start_array, converted_array, end_array)), index=None, columns=columns)
        return df
    
    # 키워드 리스트 중 하나라도 있는 경우
    def word_check_or(self, text, keywords):
        if any(word in text for word in keywords):
            return 1
        else: return 0

    # 키워드 리스트에 있는 단어가 모두 있는 경우
    def word_check_and(self, text, keywords):
        if all(word in text for word in keywords):
            return 1
        else:
            return 0


    def word_check(self, method, keywords, df, column_name = 'main_text',filter_TF=True):
        
        filter_TF = 1 if filter_TF == True else 0
        if method == 'and':
            df['flags'] = df[column_name].apply(lambda x: self.word_check_and(x, keywords))
            return df.loc[df['flags'] == filter_TF]

        elif method == 'or':
            df['flags'] = df[column_name].apply(lambda x: self.word_check_or(x, keywords))
            return df.loc[df['flags'] == filter_TF]
        
        else:
            print('Select method, and/or')



In [2]:
intake = Social_analysis()

## pickle이나 db에서 데이터를 불러오는 순간 self.df 로 저장이 된다.

In [3]:
intake.pickle_to_table('Data/intake_list.txt')

In [4]:
# Drop unrelated posts: car-part hashtags, excluded accounts and
# event-participation posts. filter_TF=False keeps only rows that do NOT match.
for column, banned_words in (
    ('hashtags', ['자동차', '흡기', '배기','도어락']),
    ('user_id', ['intakefoods',
                 'dameulstudio',
                 '_.ddo2',
                 '__scarlett.k',
                 '0.8l_korea',
                 'jiseung86',
                 'untactmarket',
                 'hyorin_papa',
                 'eighty4u',
                 'redamethyst3',
                 'hellovenus101',
                 'dearblossom.cake']),
    ('main_text', ['참여방법']),
):
    intake.df = intake.word_check('or', banned_words, intake.df, column, False)

- temp = intake.df.loc[intake.df['main_text'].str.contains('맛') & intake.df['main_text'].str.contains('모닝죽') & intake.df['main_text'].str.contains('있')]['']
- temp.describe()

## morph_pos 함수를 실행하면 self.df['morph_list'] 에 저장된다.

In [5]:
intake.morph_list = intake.morph_pos(intake.df['main_text'])

## pos_extractor 함수는 더 이상 return을 하지 않고, self.df에 컬럼을 추가한다.
- 'nav_list', 'noun_list', 'adj_list', 'verb_list' 컬럼

## mode는 'list', 'set'모드가 있다. default는 list이다.
- list mode일 때에는 중복되는 단어를 제거하지 않는다. 
- 'set'모드 일 때는 중복된 단어를 제거한다.

## degree는 'none', 'syn', 'theme'이 있다. syn이 디폴트이다.
- 'none'은 형태소 분석까지 기준
- 'syn'은 csv컬럼기준, syn까지의 단어를 바꿔준다.
- theme 은 theme컬럼까지.

In [None]:
intake.pos_extractor(intake.morph_list, mode='list', degree='none')
intake.df['nav_list'][0]

In [None]:
intake.pos_extractor(intake.morph_list, mode='set', degree='none')
intake.df['nav_list'][0]

In [None]:
intake.pos_extractor(intake.morph_list, mode='set', degree='theme')
intake.df['nav_list'][0]

## 예전처럼 raw_list를 넣을 필요 없이 그냥 df[컬럼명] 으로 호출

In [None]:
intake.noun_merged = intake.merge_list(intake.df['noun_list'])
intake.noun_frequency = frequency(intake.noun_merged)
pprint(intake.noun_frequency)

In [None]:
intake.df['nav_list'][0]

In [None]:
intake.noun_joined = intake.join_list(intake.df['noun_list'])

In [None]:
intake.LDA = SB_LDA()
# `nav_joined` is not assigned anywhere earlier in the notebook (only
# `noun_joined` is), so build it here from the nav_list column.
intake.nav_joined = intake.join_list(intake.df['nav_list'])
intake.LDA.make_lda(intake.nav_joined, ntopic=5, learning_method='batch', max_iter=25, random_state=0, n_words=20)

In [None]:
# 단어_태그_T/F(받침여부)_원단어의발음

In [None]:
keyword_dic = '''모닝죽,,,,NNG,*,T,모닝죽,*,*,*,*,*
인테이크,,,,NNG,*,F,인테이크,*,*,*,*,*
꿀고구마,,,,NNG,*,F,꿀고구마,*,*,*,*,*
한끼,,,,NNG,*,F,한끼,*,*,*,*,*
밀스라이트,,,,NNG,*,F,밀스라이트,*,*,*,*,*
식단인증,,,,NNG,*,F,식단인증,*,*,*,*,*
직장인,,,,NNG,*,T,직장인,*,*,*,*,*
스타그램,,,,NNG,*,T,스타그램,*,*,*,*,*
귀리우유,,,,NNG,*,F,귀리우유,*,*,*,*,*
텐바이텐,,,,NNG,*,T,텐바이텐,*,*,*,*,*
텐텐쇼퍼,,,,NNG,*,F,텐텐쇼퍼,*,*,*,*,*
간편식,,,,NNG,*,T,간편식,*,*,*,*,*
1일1식,,,,NNG,*,T,1일1식,*,*,*,*,*
카페라떼,,,,NNG,*,F,카페라떼,*,*,*,*,*
벨벳초콜렛,,,,NNG,*,T,벨벳초콜렛,*,*,*,*,*
혼밥,,,,NNG,*,T,혼밥,*,*,*,*,*
곤약현미밥,,,,NNG,*,T,곤약현미밥,*,*,*,*,*
버터치킨커리,,,,NNG,*,F,버터치킨커리,*,*,*,*,*
유지어터,,,,NNG,*,F,유지어터,*,*,*,*,*
치킨커리,,,,NNG,*,F,치킨커리,*,*,*,*,*
버터치킨,,,,NNG,*,F,버터치킨,*,*,*,*,*
밀스칩,,,,NNG,*,T,밀스칩,*,*,*,*,*
미래지향,,,,NNG,*,F,미래지향,*,*,*,*,*
유통기한,,,,NNG,*,T,유통기한,*,*,*,*,*
고농축,,,,NNG,*,T,고농축,*,*,*,*,*
단백질,,,,NNG,*,T,단백질,*,*,*,*,*
밀스드링크딸기,,,,NNG,*,F,밀스드링크딸기,*,*,*,*,*
프레시모닝,,,,NNG,*,T,프레시모닝,*,*,*,*,*
대체,,,,NNG,*,F,대체,*,*,*,*,*
클랜즈,,,,NNG,*,F,클랜즈,*,*,*,*,*
와디즈,,,,NNG,*,F,와디즈,*,*,*,*,*
얼리버드,,,,NNG,*,F,얼리버드,*,*,*,*,*
까페라떼,,,,NNG,*,F,까페라떼,*,*,*,*,*
로얄밀크티,,,,NNG,*,F,로얄밀크티,*,*,*,*,*
곤약젤리,,,,NNG,*,F,곤약젤리,*,*,*,*,*
모닝채소,,,,NNG,*,F,모닝채소,*,*,*,*,*
콜드프레스,,,,NNG,*,F,콜드프레스,*,*,*,*,*
하루야채,,,,NNG,*,F,하루야채,*,*,*,*,*
씨리얼,,,,NNG,*,T,씨리얼,*,*,*,*,*
콜드부르,,,,NNG,*,F,콜드부르,*,*,*,*,*
스타벅스,,,,NNG,*,F,스타벅스,*,*,*,*,*
밀스드링크,,,,NNG,*,F,밀스드링크,*,*,*,*,*
존맛,,,,VA,*,T,존맛,*,*,*,*,*
아워홈,,,,NNG,*,T,아워홈,*,*,*,*,*
모닝바게트,,,,NNG,*,F,모닝바게트,*,*,*,*,*
모닝귀리,,,,NNG,*,F,모닝귀리,*,*,*,*,*
스윗그레인,,,,NNG,*,T,스윗그레인,*,*,*,*,*
그릭요거트,,,,NNG,*,F,그릭요거트,*,*,*,*,*
팥톡스,,,,NNG,*,F,팥톡스,*,*,*,*,*
식이섬유,,,,NNG,*,F,식이섬유,*,*,*,*,*
베지밀,,,,NNG,*,T,베지밀,*,*,*,*,*
그레인하프,,,,NNG,*,F,그레인하프,*,*,*,*,*
코코넛하프,,,,NNG,*,F,코코넛하프,*,*,*,*,*
식이조절,,,,NNG,*,T,식이조절,*,*,*,*,*
줌마,,,,NNG,*,F,줌마,*,*,*,*,*
매일유업,,,,NNG,*,T,매일유업,*,*,*,*,*
볶음콩,,,,NNG,*,T,볶음콩,*,*,*,*,*
상하목장,,,,NNG,*,T,상하목장,*,*,*,*,*
요거트,,,,NNG,*,F,요거트,*,*,*,*,*
카카오닙스,,,,NNG,*,F,카카오닙스,*,*,*,*,*
잇클린,,,,NNG,*,T,잇클린,*,*,*,*,*
카페다이어트,,,,NNG,*,F,카페다이어트,*,*,*,*,*
사과맛워터젤리,,,,NNG,*,F,사과맛워터젤리,*,*,*,*,*
맛있는다이어트칩,,,,NNG,*,T,맛있는다이어트칩,*,*,*,*,*
다이어트칩,,,,NNG,*,T,다이어트칩,*,*,*,*,*'''.split('\n')

In [None]:
for idx, i in enumerate(keyword_dic):
    keyword_dic[idx] = i.split(',')

In [None]:
pd.DataFrame(keyword_dic).to_csv('c:\\mecab\\user-dic\\intake_dic.csv', encoding='utf-8', index=None, header=False)

In [None]:
'_'.join(['sadf'])

In [None]:
# Interactive labelling: for every post, type the keywords to register, one
# per line. An empty line or 'next' moves on to the next post; 'break' stops.
keyword_list = []


for idx in range(len(intake.morph_list)):
    print(intake.morph_list[idx])
    # NOTE(review): `intake.modified` is not created by any code visible in
    # this notebook - confirm it still exists (the text now lives in intake.df).
    print(intake.modified[idx, 2])
    print(idx)
    while True:
        k = input()
        # Check the control words before storing, so that a first input of
        # 'break' or 'next' is not saved as a keyword.
        if not k or k in ('break', 'next'):
            break
        keyword_list.append(k)
    if k == 'break':
        break

## m이 None인 경우에는 진짜 맛있다는 것.
## -> '없' 또는 '않'이라는 음절이 '있'이라는 음절 뒤로 최대 5글자를 건너뛴 범위 안에 없다는 뜻이다.
## m이 None이 아닌 경우(매치된 경우)에는 애매한 상황이다. '없' 또는 '않'이라는 음절이 그 범위 안에 존재한다는 것이다.

In [None]:
import re

# Matches '맛' ... '있' ... followed by a negating '없'/'않', each gap being at
# most five characters. A match means the sentence may be negative.
for sentence in (
    '안녕하세요. 맛이 있진 않다.. 다만 아쉬운 것은 포장지가 없다.',
    '안녕하세요. 맛이 있다. 다만 아쉬운 것은 포장지가 없다.',
):
    m = re.search('맛.{0,5}있.{0,5}[없않]', sentence)
    print(m)
    if m == None:
        print('맛있다는 뜻')
    else:
        print('맛 없다는 뜻\n원문: ' + m.group())


In [None]:
import pandas as pd
from konlpy.tag import Mecab
mecab = Mecab()

In [None]:
mecab.pos('모닝죽')

In [None]:
def helper(df, tag='NNG'):
    """Rewrite ``df`` in place for dictionary review and save it to a.csv.

    For every row, column 0 becomes ``<column 0>_<column 1>`` and column 1
    becomes the Mecab analysis of the original column 0, written as
    ``surface_tag`` items joined with ``/``. The frame is written to
    ``a.csv`` (cp949) and returned. ``tag`` is accepted but unused.
    """
    for row in range(len(df)):
        word = df.iloc[row, 0]
        # Analyse the word before column 0 is overwritten below.
        morphs = mecab.pos(word)
        df.iloc[row, 0] = word + '_' + df.iloc[row, 1]
        df.iloc[row, 1] = '/'.join(['_'.join(list(pair)) for pair in morphs])
    df.to_csv('a.csv', encoding='cp949')
    return df

helper(b)

a = '''곤약현미밥_NNG_T_곤약현미밥
버터치킨커리_NNG_F_버터치킨커리
유지어터_NNG_F_유지어터
치킨커리_NNG_F_치킨커리
버터치킨_NNG_F_버터치킨
귀리우유_NNG_F_귀리우유
텐바이텐_NNG_T_텐바이텐
텐텐쇼퍼_NNG_F_텐텐쇼퍼
식사대용_NNG_T_식사대용*
한끼_NNG_F_한끼
밀스라이트_NNG_F_밀스라이트
식단인증_NNG_F_식단인증
직장인_NNG_T_직장인
로얄밀크티_NNG_F_로얄밀크티
곤약젤리_NNG_F_곤약젤리
모닝채소_NNG_F_모닝채소
콜드프레스_NNG_F_콜드프레스
하루야채_NNG_F_하루야채
씨리얼_NNG_T_씨리얼
콜드부르_NNG_F_콜드부르
건강스타그램_NNG_T_건강스타그램
스타그램_NNG_T_스타그램
모닝죽_NNG_T_모닝죽
인테이크_NNG_F_인테이크
꿀고구마_NNG_F_꿀고구마
밀스칩_NNG_T_밀스칩
미래지향_NNG_F_미래지향*미래
고농축_NNG_T_고농축
단백질_NNG_T_단백질
밀스드링크딸기_NNG_F_밀스드링크딸기
프레시모닝_NNG_T_프레시모닝
대체_NNG_F_대체
클랜즈_NNG_F_클랜즈
얼리버드_NNG_F_얼리버드
까페라떼_NNG_F_까페라떼
스타벅스_NNG_F_스타벅스
밀스드링크_NNG_F_밀스드링크
존맛_VA_T_존맛
아워홈_NNG_T_아워홈
모닝바게트_NNG_F_모닝바게트
모닝귀리_NNG_F_모닝귀리
스윗그레인_NNG_T_스윗그레인
그릭요거트_NNG_F_그릭요거트
팥톡스_NNG_F_팥톡스
베지밀_NNG_T_베지밀
식이섬유_NNG_F_식이섬유
식이조절_NNG_T_식이조절
줌마_NNG_F_줌마
매일유업_NNG_T_매일유업
상하목장_NNG_T_상하목장
요거트_NNG_F_요거트
카카오닙스_NNG_F_카카오닙스
잇클린_NNG_T_잇클린
카페다이어트_NNG_F_카페다이어트
사과맛워터젤리_NNG_F_사과맛워터젤리'''

b= a.split('\n')

b = [i.split('_') for i in b]

b = pd.DataFrame(b)

b

In [81]:
# Resume interactive labelling at post 2000. Iterating over
# range(2000, len(...)) keeps idx inside the list; adding 2000 to every index
# of the full range would run past the end.
for idx in range(2000, len(intake.morph_list)):
    print(intake.morph_list[idx])
    # NOTE(review): `intake.modified` is not created by any code visible in
    # this notebook - confirm it still exists (the text now lives in intake.df).
    print(intake.modified[idx, 2])
    print(idx)
    while True:
        k = input()
        # Check the control words before storing, so that a first input of
        # 'break' or 'next' is not saved as a keyword.
        if not k or k in ('break', 'next'):
            break
        keyword_list.append(k)
    if k == 'break':
        break

['두_MM', '번_NNBC', '째_XSN', '샘플_NNG', '폭탄_NNG', '이뻐_VA+EC', '죽_VV', '겠_EP', '어_EF', '이거_NP', '먹_VV', '고_EC', '오빠_NNG', '가_JKS', '세계_NNG', '클래식_NNG', '선발전_NNG', '잘_MAG', '치루_VV', '올께_NNG', '사업_NNG', '의_JKG', '성공_NNG', '을_JKO', '기원_NNG', '하_XSV', '며_EC', '스테파노_NNP', '아멘_NNG', '외칩니다_VV+EF', '찍찍_MAG', '인테이크_NNG', '제품_NNG', '파워젤부스트_NNG', '식감_NNG', '이_JKS', '놀라웠_VA+EP', '으며_EC', '요새_NNG', '하루견과_NNG', '류_XSN', '는_JX', '견과류_NNG', '반_NNG', '말린_VV+ETM', '과일_NNG', '이_VCP', '라서_EC', '는_ETM', '거_NNB', '보다_JKB', '버리_VV', '게_NNB+JKS', '인_VCP+ETM', '반면_NNG', '닥터_NNG', '넛츠_NNP', '프리미엄_NNG', '골드_NNP', '통째_NNG', '로_JKB', '만_JX', '들_VV', '어_EC', '있_VX', '가성비_NNG', '갑_NNG', '내_NP+JKG', '스타일_NNG', '야_EC', '쌀_NNG', '싶_VX', '을_ETM', '땐_NNG+JX', '모닝죽_NNG', '단호박_NNG', '모닝죽고구마_NNG', '모닝죽단팥_NNG', '모닝죽귀리_NNG']
momsterz_kr Repost @hyunsukstagram (@get_repost)
・・・
두번째 샘플폭탄 이뻐죽겠어  이거먹고 오빠가  세계클래식 선발전 잘 치루고 올께� 사업의 성공을 기원하며�  스테파노 가  아멘 을 외칩니다  찍찍�
@muleah_ @leahmkim @momsterz_kr
 인테이크 제품의  파워젤부스트 의 식감이 놀라웠으며��� 요새  하

KeyboardInterrupt: 

In [78]:




str(np.nan)











'nan'