### 단어\_태그\_T/F(받침여부)_원단어의발음
### https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=4
# 명사 NNG, 동사 VV, 형용사 VA


In [1]:
from konlpy.tag import Mecab
import pickle
import re
import sys
from gensim.models import TfidfModel
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
from pprint import pprint
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import mglearn
from pprint import pprint
import numpy as np
import gc
import copy
import pandas as pd


class SB_Word2Vec():
    """Word2Vec embedding plus a word-by-sentence relevance matrix.

    Construction builds a gensim ``Dictionary`` and bag-of-words corpus from
    ``morph_list`` (a list of token lists), trains Word2Vec on it and then
    computes ``word2sen_matrix``.

    NOTE(review): the keyword names ``size`` / ``iter`` and ``wv.vocab`` are
    the pre-4.0 gensim spellings; confirm the installed gensim version.
    """

    def __init__(self, morph_list):
        self.dct = Dictionary(morph_list)
        self.corpus = [self.dct.doc2bow(line) for line in morph_list]
        self.build_Word2Vec(morph_list)

    def make_Word2Vec(self, morph_list, size=50, window=2, min_count=10, iteration=100):
        """Train the embedding and index its vocabulary.

        Sets ``self.em`` (the model), ``self.em_vocab`` (list of words) and
        ``self.em_vocab_dic`` (word -> position in ``em_vocab``).
        """
        self.em = Word2Vec(morph_list, size=size, window=window, min_count=min_count, iter=iteration)
        self.em_vocab = list(self.em.wv.vocab.keys())
        self.em_vocab_dic = {word: idx for idx, word in enumerate(self.em_vocab)}

    def make_Word2Sen_matrix(self):
        """Build and return ``word2sen_matrix`` (vocabulary x sentences).

        ``sen_matrix[s, w]`` holds the bag-of-words frequency of vocabulary
        word ``w`` in sentence ``s``; ``sim_matrix[a, b]`` holds the embedding
        similarity of words ``a`` and ``b``.  The result is
        ``sim_matrix @ sen_matrix.T``.
        """
        vocab_size = len(self.em_vocab)
        self.sen_matrix = np.zeros((len(self.corpus), vocab_size))
        for row, bow in enumerate(self.corpus):
            for token_id, frequency in bow:
                # Dict lookup instead of scanning the vocabulary list; words
                # the embedding dropped are simply skipped.
                col = self.em_vocab_dic.get(self.dct[token_id])
                if col is not None:
                    self.sen_matrix[row, col] = frequency

        self.sim_matrix = np.zeros((vocab_size, vocab_size))
        for row, w1 in enumerate(self.em_vocab):
            for col, w2 in enumerate(self.em_vocab):
                self.sim_matrix[row, col] = self.em.wv.similarity(w1, w2)

        self.word2sen_matrix = np.dot(self.sim_matrix, np.transpose(self.sen_matrix))

        return self.word2sen_matrix

    def get_sim_sen(self, keyword, main_text, number=1):
        """Print the ``number`` best-scoring sentences for ``keyword``.

        Args:
            keyword: a word present in the embedding vocabulary.
            main_text: indexable collection of the original sentences.
            number: how many sentences to print.

        Returns:
            All sentence indices ordered from highest to lowest score.

        Raises:
            KeyError: if ``keyword`` is not in ``em_vocab_dic``.
        """
        scores = self.word2sen_matrix[self.em_vocab_dic[keyword]]
        self.sim_sen_index = np.argsort(scores)
        self.most_sim_sen_index = np.argmax(scores)
        # argsort is ascending, so reverse it to get best-first order.
        index_list = self.sim_sen_index.reshape((-1,))[::-1].tolist()

        for rank, sentence_idx in enumerate(index_list[:number], start=1):
            print(str(rank))
            print(main_text[sentence_idx])
        return index_list

    def build_Word2Vec(self, morph_list):
        """Train the embedding, then derive the word-to-sentence matrix."""
        self.make_Word2Vec(morph_list)
        self.make_Word2Sen_matrix()
        
        
class SB_LDA():
    """Thin wrapper around scikit-learn LDA topic modelling."""

    def make_lda(self, morph_joined, ntopic=10, learning_method='batch', max_iter=25, random_state=0, n_words=20):
        """Fit a bag-of-words LDA model and print the top words per topic.

        Args:
            morph_joined: iterable of documents, each a whitespace-joined string.
            ntopic: number of topics.
            learning_method, max_iter, random_state: passed to
                ``LatentDirichletAllocation``.
            n_words: how many words per topic to print.

        Sets ``vect``, ``X``, ``lda``, ``document_topics``, ``sorting`` and
        ``feature_names`` on the instance.
        """
        self.vect = CountVectorizer(max_features=10000, max_df=.15)
        self.X = self.vect.fit_transform(morph_joined)
        self.lda = LatentDirichletAllocation(n_components=ntopic, learning_method=learning_method, max_iter=max_iter, random_state=random_state)
        self.document_topics = self.lda.fit_transform(self.X)
        # Per topic, word indices ordered from most to least important.
        self.sorting = np.argsort(self.lda.components_, axis=1)[:, ::-1]
        # Newer scikit-learn only provides get_feature_names_out(); older
        # releases only provide get_feature_names().  Support both.
        get_names = getattr(self.vect, 'get_feature_names_out', None) or self.vect.get_feature_names
        self.feature_names = np.array(get_names())
        mglearn.tools.print_topics(topics=range(ntopic), feature_names=self.feature_names, sorting=self.sorting, topics_per_chunk=5, n_words=n_words)

    def related_doc(self, main_text_list, topic_index, number=10):
        """Print and return the documents that load most on one topic.

        Args:
            main_text_list: indexable collection of document strings, aligned
                with the rows passed to ``make_lda``.
            topic_index: column of ``document_topics`` to rank by.
            number: how many documents to report.

        Returns:
            List of ``(document_index, text)`` tuples, strongest first.
        """
        ranked = np.argsort(self.document_topics[:, topic_index])[::-1]
        related_docs = []
        for doc_idx in ranked[:number]:
            print(doc_idx)
            print(main_text_list[doc_idx] + ".\n")
            related_docs.append((doc_idx, main_text_list[doc_idx]))
        return related_docs

class SB_Tfidf():
    """TF-IDF ranking of tokens for each document in a token-list corpus."""

    def __init__(self, list_morph_merged):
        # Keep the raw documents, the gensim id<->token dictionary and the
        # bag-of-words form of every document.
        self.list_morph_merged = list_morph_merged
        self.dct = Dictionary(self.list_morph_merged)
        self.corpus = [self.dct.doc2bow(document) for document in self.list_morph_merged]

    def get_tfidf(self):
        """Return, per document, ``(token, weight)`` pairs sorted by weight.

        Also stores the id-based ranking in ``self.tfidf`` and the
        token-based ranking in ``self.tfidf_hangul``.
        """
        self.model = TfidfModel(self.corpus)

        def weight_of(pair):
            return pair[1]

        # Highest weight first within each document.
        self.tfidf = [
            sorted(self.model[bow], key=weight_of, reverse=True)
            for bow in self.corpus
        ]
        # Swap token ids for the token strings.
        self.tfidf_hangul = [
            [(self.dct[pair[0]], pair[1]) for pair in ranked]
            for ranked in self.tfidf
        ]

        return self.tfidf_hangul
    
def frequency(merged):
    """Return ``(word, count)`` pairs for ``merged``, most frequent first.

    Words with equal counts keep the order in which they first appear.
    """
    return Counter(merged).most_common()




class Social_analysis():
    """Load social/review posts into ``self.df`` and tokenize/filter them.

    Tokens are strings of the form ``word_TAG`` (Mecab part-of-speech tag).
    The custom dictionary CSV supplies n-gram merges (``ngram_dic``),
    synonyms (``syn_dic``) and themes (``theme_dic``); ``del_list`` holds
    words removed by ``filter_words``.
    """

    # Maps every code point above the Basic Multilingual Plane to U+FFFD;
    # used with str.translate.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    # Class-level defaults.  __init__ copies them so that instances do not
    # share (and silently contaminate) each other's dictionaries.
    syn_dic = {}
    theme_dic = {}
    del_list = []
    ngram_dic = {}
    # Single-character words that pos_extractor keeps anyway.
    exception_list=['맛', '밥', '물', '몸', '없', '있', '싫', '달', '굳', '굿', '속']
    # Largest window tried by ngram(); morph_pos() sets it to 6 as well.
    ngram_size = 6

    default_dic_path = 'Data/custom_dic.csv'
    replace_dic = 'Data/replace_dic.csv'
    # Mecab user dictionary read and rewritten by add_dictionary().
    user_dic_path = "C:\\mecab\\user-dic\\intake_dic.csv"

    # NOTE(security): connection details are hard-coded in source, as in the
    # original methods; they are attributes so they can be overridden, but
    # should be moved to configuration.
    db_server = "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433"
    db_user = "gh"
    db_password = "ghintake"

    # Hangul compatibility jamo and Hangul syllables.
    _hangul_re = re.compile(u'[\u3130-\u318F\uAC00-\uD7A3]+')

    def __init__(self):
        self.mecab = Mecab()
        cls = type(self)
        self.syn_dic = dict(cls.syn_dic)
        self.theme_dic = dict(cls.theme_dic)
        self.ngram_dic = dict(cls.ngram_dic)
        self.del_list = list(cls.del_list)
        try:
            self.load_dictionary()
        except Exception as e:
            # Best effort: the analyzer stays usable without the dictionary.
            print('dictionary error\n', e)

    def load_dictionary(self, mode='default'):
        """Fill ``ngram_dic``, ``syn_dic`` and ``theme_dic`` from the CSV.

        The CSV at ``default_dic_path`` (cp949) must have the columns
        ``key``, ``value``, ``syn`` and ``theme``.  Loading stops at the
        first row lacking a key or a value.  ``mode`` is accepted but unused.
        """
        path = self.default_dic_path
        self.dic_df = pd.read_csv(path, encoding='cp949')
        wanted = self.dic_df[['key', 'value', 'syn', 'theme']]
        for key, value, syn, theme in wanted.itertuples(index=False, name=None):
            if pd.isna(key) or pd.isna(value):
                print('Need key & value')
                return

            self.ngram_dic[key] = value

            if pd.isna(theme):
                continue
            # Synonyms are only registered for rows that also carry a theme.
            word = value.split('_')[0]
            if pd.isna(syn):
                self.theme_dic[word] = theme
            else:
                self.syn_dic[word] = syn
                self.theme_dic[syn] = theme

    def _read_keyword_query(self, query, keyword, DBname):
        """Run ``query`` with ``keyword`` bound to its single ``%s`` placeholder.

        Stores the query template in ``self.query`` and the result frame in
        ``self.df``.  The connection is closed even when the query fails.
        """
        import pymssql
        import pandas.io.sql as pdsql
        self.query = query
        conn = pymssql.connect(self.db_server, self.db_user, self.db_password, DBname)
        try:
            # Parameter binding instead of string formatting (SQL injection).
            self.df = pdsql.read_sql_query(query, con=conn, params=(keyword,))
        finally:
            conn.close()

    def DB_to_table(self, DBname='intake', keyword='intake'):
        """Load Instagram postings for ``keyword`` into ``self.df``."""
        self.insta_DB_to_table(DBname, keyword)

    def insta_DB_to_table(self, DBname='intake', keyword='intake'):
        """Load rows of ``instaPosting`` whose keyword matches into ``self.df``."""
        self._read_keyword_query(
            "SELECT user_id, created_at, main_text, hashtags, comments, likes, current_url "
            "FROM instaPosting WHERE keyword = %s",
            keyword, DBname)

    def shop_DB_to_table(self, DBname='intake', keyword='intake'):
        """Load rows of ``VproductReview`` whose corpName matches into ``self.df``."""
        self._read_keyword_query(
            "SELECT mall, corpName, productCode, date, id, productScore, recommScore, main_text "
            "FROM VproductReview WHERE corpName = %s",
            keyword, DBname)

    def blog_DB_to_table(self, DBname='intake', keyword='intake'):
        """Load rows of ``NaverBlogReview`` whose keyword matches into ``self.df``."""
        self._read_keyword_query(
            "SELECT keyword, created_at, post_name, main_text, current_url "
            "FROM NaverBlogReview WHERE keyword = %s",
            keyword, DBname)

    def pickle_to_table(self, filename, columns=['user_id', 'created_at', 'main_text', 'hashtags', 'comments', 'likes', 'current_url']):
        """Load a pickled list of post rows into ``self.df``.

        The first row is skipped as a header.  In each remaining row, field 2
        has ``#`` replaced by a space and non-BMP characters replaced, and
        fields 3 and 4 (lists) are joined with ``/``.

        Args:
            filename: path of the pickle file.
            columns: column names of the resulting frame (never mutated).
        """
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only use it on files you created yourself.
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        rows = data[1:]
        for row in rows:
            row[2] = row[2].replace('#', ' ').translate(self.non_bmp_map)
            row[3] = '/'.join(row[3])
            row[4] = '/'.join(row[4])
        if len(rows) == 0:
            # A header-only file yields an empty frame with the right columns.
            self.df = pd.DataFrame(columns=list(columns))
        else:
            self.df = pd.DataFrame(np.array(rows), columns=list(columns))

    def hashtags_split(self, hashtags):
        """Split ``/``-joined hashtag strings and keep the Hangul ones.

        Returns one list per input string; also stored in
        ``self.hashtags_list``.
        """
        hashtags_list = [
            [tag.translate(self.non_bmp_map) for tag in joined.split('/') if self.isHangul(tag)]
            for joined in hashtags
        ]
        self.hashtags_list = hashtags_list

        return hashtags_list

    @staticmethod
    def _user_dic_row(token):
        """Turn ``word_tag_T/F_reading`` into one user-dictionary CSV row."""
        parts = token.split('_')
        return [parts[0], '', '', '', parts[1], '*', parts[2], parts[3], '*', '*', '*', '*', '*']

    def add_dictionary(self, *tokenized_list):
        """Append entries to the Mecab user dictionary CSV.

        Each argument is either one ``word_tag_T/F_reading`` string or a list
        of such strings.  Existing rows of ``user_dic_path`` are kept; the
        file is created when it cannot be read.
        """
        path = self.user_dic_path
        try:
            origin_df = pd.read_csv(path, encoding='utf-8', header=None)
        except (OSError, ValueError):
            # Missing, unreadable or empty file: start a new dictionary.
            origin_df = None
            print('No default intake_dic')

        keyword_list = []
        for entry in tokenized_list:
            tokens = entry if isinstance(entry, list) else [entry]
            keyword_list.extend(self._user_dic_row(token) for token in tokens)

        keyword_df = pd.DataFrame(keyword_list)
        if origin_df is not None:
            keyword_df = pd.concat((origin_df, keyword_df), ignore_index=True)
        print(keyword_df.shape)

        keyword_df.to_csv(path, encoding='utf-8', index=False, header=False)

    def _merge_pass(self, tokens, size):
        """Replace every ``size``-token window whose joined words are a key of
        ``ngram_dic`` by the mapped token; other tokens pass through."""
        merged = []
        pos = 0
        total = len(tokens)
        while pos + size <= total:
            window = tokens[pos:pos + size]
            key = ''.join(token.split('_')[0] for token in window)
            if key in self.ngram_dic:
                merged.append(self.ngram_dic[key])
                pos += size
            else:
                merged.append(window[0])
                pos += 1
        # Fewer than ``size`` tokens left: nothing more can match.
        merged.extend(tokens[pos:])
        return merged

    def ngram(self, parsed_list):
        """Merge known n-grams in a list of ``word_tag`` tokens.

        Windows of ``self.ngram_size`` tokens are tried first, then every
        smaller size down to 1.  ``self.ngram_size`` is left unchanged, so
        repeated calls behave the same.
        """
        tokens = list(parsed_list)
        for size in range(self.ngram_size, 0, -1):
            tokens = self._merge_pass(tokens, size)
        return tokens

    def morph_pos(self, text_list,  mode='list'):
        """Tokenize each text with Mecab and merge known n-grams.

        Only morphemes containing Hangul are kept, as ``word_tag`` strings.
        The result is stored in ``self.df['morph_list']`` and returned.
        ``mode`` is accepted but unused.
        """
        self.ngram_size = 6
        morph_list = []
        for text in text_list:
            tagged = [
                '{}_{}'.format(surface, tag)
                for surface, tag in self.mecab.pos(text)
                if self.isHangul(surface)
            ]
            morph_list.append(self.ngram(tagged))

        self.df['morph_list'] = morph_list

        return morph_list

    def set_with_order(self, sequence):
        """Return the unique items of ``sequence`` in first-seen order."""
        seen = set()
        result = []
        for item in sequence:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result

    def filter_words(self, parsed_list, mode='syn'):
        """Apply synonym replacement, deletion and optional theme mapping.

        Args:
            parsed_list: flat list of words.
            mode: ``'None'`` returns the words unfiltered; ``'theme'`` also
                maps words through ``theme_dic``; any other value (including
                ``None``) applies synonyms and ``del_list`` only.
        """
        if mode == 'None':
            return list(parsed_list)

        blocked = set(self.del_list)
        kept = [
            word
            for word in (self.syn_dic.get(token, token) for token in parsed_list)
            if word not in blocked
        ]

        if mode == 'theme':
            return [self.theme_dic.get(word, word) for word in kept]
        return kept

    def pos_extractor(self, parsed, mode = 'list', degree = 'syn'):
        """Split tokens into noun / verb / adjective word lists per document.

        Nouns are tokens whose tag contains ``NN``, verbs ``VV`` and
        adjectives ``VA``.  Only Hangul words that are longer than one
        character or listed in ``exception_list`` are kept.  Each list is
        passed through ``filter_words(..., degree)``.

        The results are written to ``self.df`` as ``nav_list`` (all three
        kinds together), ``noun_list``, ``adj_list`` and ``verb_list``.

        Args:
            parsed: iterable of token lists (``word_tag`` strings).
            mode: ``'list'`` keeps duplicates, ``'set'`` removes them while
                keeping order.  Any other value prints a message and returns.
            degree: forwarded to ``filter_words`` as its ``mode``.
        """
        if mode not in ('list', 'set'):
            print('Check mode')
            return

        columns = ['nav_list', 'noun_list', 'adj_list', 'verb_list']
        collected = {name: [] for name in columns}

        for tokens in parsed:
            row = {name: [] for name in columns}
            for token in tokens:
                parts = token.split('_')
                if len(parts) < 2:
                    continue  # no tag attached
                word, tag = parts[0], parts[1]
                if not self.isHangul(word):
                    continue
                if len(word) <= 1 and word not in self.exception_list:
                    continue
                if 'NN' in tag:
                    target = 'noun_list'
                elif 'VV' in tag:
                    target = 'verb_list'
                elif 'VA' in tag:
                    target = 'adj_list'
                else:
                    continue
                row[target].append(word)
                row['nav_list'].append(word)

            for name in columns:
                words = self.filter_words(row[name], degree)
                if mode == 'set':
                    words = self.set_with_order(words)
                collected[name].append(words)

        for name in columns:
            self.df[name] = collected[name]

    def merge_list(self, tokenized_list):
        """Flatten a list of token lists into one list."""
        return [token for tokens in tokenized_list for token in tokens]

    def join_list(self, tokenized_list):
        """Join each token list into one space-separated string."""
        return [" ".join(tokens) for tokens in tokenized_list]

    def split_list(self, untokenized_list):
        """Split each ``/``-joined string into a list of its parts."""
        return [joined.split('/') for joined in untokenized_list]

    def word_substitute(self, dataset, sublist):
        """Return a copy of ``dataset`` with words replaced by their main word.

        Args:
            dataset: list of word lists; not modified.
            sublist: iterable of dicts with keys ``'main'`` and
                ``'sub_words'``; every word in ``sub_words`` becomes ``main``.
        """
        sub_book = {alias: group['main'] for group in sublist for alias in group['sub_words']}
        # Shallow copy is enough: every row is replaced by a new list and the
        # word objects themselves are reused unchanged.
        result = copy.copy(dataset)
        for n, row in enumerate(dataset):
            result[n] = [sub_book.get(item, item) for item in row]
        return result

    def word_delete(self, dataset, del_list):
        """Return a copy of ``dataset`` without the words in ``del_list``."""
        result = copy.copy(dataset)
        for n, row in enumerate(dataset):
            result[n] = [item for item in row if item not in del_list]
        return result

    def isHangul(self, text):
        """Return True when ``text`` contains at least one Hangul character."""
        return self._hangul_re.search(text) is not None

    def convert_list(self, *tokenized_list):
        """Build a 2-D array with one ``/``-joined column per input.

        Every argument is a list of token lists with the same number of rows;
        row ``r`` of column ``c`` is ``'/'.join(tokenized_list[c][r])``.

        Raises:
            ValueError: if no input is given or the row counts differ.
        """
        columns = [
            np.array([['/'.join(tokens)] for tokens in rows]).reshape(-1, 1)
            for rows in tokenized_list
        ]
        return np.concatenate(columns, axis=1)

    def make_df(self, start_array, converted_array, end_array, columns=['user_id', 'created_at', 'main_text', 'morph_list', 'nav_list', 'noun_list', 'adj_list', 'verb_list', 'hashtags', 'comments', 'likes', 'current_url']):
        """Stack the three arrays side by side into a DataFrame.

        ``columns`` names the resulting columns and is never mutated.
        """
        df = pd.DataFrame(np.hstack((start_array, converted_array, end_array)), index=None, columns=columns)
        return df

    @staticmethod
    def _contains(text, word):
        """``word in text``, treating a missing cell (None / NaN) as empty."""
        if text is None or (isinstance(text, float) and text != text):
            return False
        return word in text

    # True (1) when at least one keyword occurs in the text.
    def word_check_or(self, text, keywords):
        """Return 1 if any keyword is contained in ``text``, else 0."""
        return 1 if any(self._contains(text, word) for word in keywords) else 0

    # True (1) when every keyword occurs in the text.
    def word_check_and(self, text, keywords):
        """Return 1 if all keywords are contained in ``text``, else 0."""
        return 1 if all(self._contains(text, word) for word in keywords) else 0

    def word_check(self, method, keywords, df, column_name = 'main_text',filter_TF=True):
        """Filter ``df`` by keyword presence in one column.

        A ``flags`` column (1 = match, 0 = no match) is added to ``df``.

        Args:
            method: ``'and'`` (all keywords) or ``'or'`` (any keyword).
            keywords: words to look for.
            df: frame to filter.
            column_name: column holding the text.
            filter_TF: True keeps matching rows, anything else keeps the
                non-matching rows.

        Returns:
            An independent copy of the selected rows, or None (after printing
            a hint) when ``method`` is neither ``'and'`` nor ``'or'``.
        """
        checkers = {'and': self.word_check_and, 'or': self.word_check_or}
        checker = checkers.get(method)
        if checker is None:
            print('Select method, and/or')
            return None

        wanted = 1 if filter_TF == True else 0  # noqa: E712 (keeps 1 == True)
        df['flags'] = df[column_name].apply(lambda text: checker(text, keywords))
        # Copy so later column assignments on the result do not hit a slice.
        return df.loc[df['flags'] == wanted].copy()



In [2]:
# One analyzer per brand prefix (itk, lns, pck, kgc) and per channel
# (insta, shop, blog), created in the same order as before.
itkinsta, itkshop, itkblog = (Social_analysis() for _ in range(3))
lnsinsta, lnsshop, lnsblog = (Social_analysis() for _ in range(3))
pckinsta, pckshop, pckblog = (Social_analysis() for _ in range(3))
kgcinsta, kgcshop, kgcblog = (Social_analysis() for _ in range(3))


In [3]:
# Load the raw posts for every brand/channel pair into each analyzer's .df.
# The pck and kgc Instagram posts come from pickle files instead of the
# database.  Forward slashes are used in the paths: 'Data\pck_list.txt'
# relied on the invalid escapes '\p' / '\k' and only resolved on Windows.
itkinsta.insta_DB_to_table(DBname = 'intake', keyword = 'intake')
itkshop.shop_DB_to_table('intake', 'intake')
itkblog.blog_DB_to_table('intake', 'intake')
lnsinsta.insta_DB_to_table(DBname = 'intake', keyword = 'labnosh')
lnsshop.shop_DB_to_table('intake', 'labnosh')
lnsblog.blog_DB_to_table('intake', 'labnosh')
pckinsta.pickle_to_table('Data/pck_list.txt')
pckshop.shop_DB_to_table('intake', 'pck')
pckblog.blog_DB_to_table('intake', 'pck')
kgcinsta.pickle_to_table('Data/kgc_list.txt')
kgcshop.shop_DB_to_table('intake', 'kgc')
kgcblog.blog_DB_to_table('intake', 'kgc')


In [30]:
# Words to drop per brand (each list is applied to that brand's three channels).
itk_del_words = ['독립운동', '현정','인테이크','밀스','모닝죽', '파워젤부스트', '식사대용', '밀스라이트', '고구마죽', '모닝귀리', '모닝죽단호박', '슈퍼바', '아미노리커버', '모닝죽꿀고구마', '밀스소이', '향신료', '모닝그래놀라', '휴먼바이오틱스A1', '밀스하프']
lns_del_words = ['랩노쉬', '푸드쉐이크', '미식당', '우바', '쇼콜라', '스타터키트', '올데이키트', '그래놀라', '랩노쉬플랫']
pck_del_words = ['노브랜드', '라자냐', '티라미수', '하노이', '빈대떡', '서주현','서현','계피','차돌박이','삼계탕','닭꼬치','볶음밥']
kgc_del_words = ['에브리타임','정관장','홍삼정','정해인','정관','장홍','조정석','디페','전광렬','홍삼스틱','한국인삼공사','홍이장군','박은빈']

# del_list is declared as a class attribute of Social_analysis, so calling
# .extend() on one analyzer would add the words to a list shared with all
# the others.  Rebinding a new list on the instance keeps the brands apart
# while still appending to whatever the analyzer already had.
for analyzers, words in (
    ((itkinsta, itkshop, itkblog), itk_del_words),
    ((lnsinsta, lnsshop, lnsblog), lns_del_words),
    ((pckinsta, pckshop, pckblog), pck_del_words),
    ((kgcinsta, kgcshop, kgcblog), kgc_del_words),
):
    for analyzer in analyzers:
        analyzer.del_list = analyzer.del_list + words




###lnsinsta.del_list.extend([''])###

In [31]:
# Display the frame currently held by the itkinsta analyzer.
itkinsta.df

Unnamed: 0,user_id,created_at,main_text,hashtags,comments,likes,current_url
0,invincible_skhyun,2018-01-24 11:11:08,invincible_skhyun#식단일기_180124\n.\n아침:인테이크 소이밀크 블랙100\n점심:아임웰 굿밸런스라이스 (레드퀴노아 갈릭닭가슴살)+아임닭 훈제닭가슴살+채소믹스+브로콜리+아임닭 프랑크 1개\n저녁:곤약파스타+꼬꼬빌 그릴닭가슴살+사태수육 등\n운동:낸시 홈트레이닝 30분\n.\n점심은 굿밸런스라이스에 닭가슴살과 채소를 더해 한번 더 볶아 주었음. 저녁은 오뚜기 프레스코 스파게티소스에 채소믹스와 토마토를 넣어서 곤약면과 쉐킷쉐킷....\n.\n.\n#인테이크 #인테이크소이밀크 #아임웰 #아임웰굿밸런스라이스 #아임닭 #아임닭훈제닭가슴살 #아임닭소시지 #곤약파스타 #꼬꼬빌 #꼬꼬빌그릴닭가슴살,식단일기_180124/인테이크/인테이크소이밀크/아임웰/아임웰굿밸런스라이스/아임닭/아임닭훈제닭가슴살/아임닭소시지/곤약파스타/꼬꼬빌/꼬꼬빌그릴닭가슴살,,11,https://www.instagram.com/p/BeVH5d3luGZ/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
1,jiyu_lee,2018-01-24 08:37:57,jiyu_lee-\n요즘 먹고 있는 유산균.\n-\n인테이크에서 판매하는걸 보고 주문.\n인테이크는 밀스를 한 번 먹어봤는데\n꽤 괜찮았기에\n긴가민가 하며 일단 먹고 있다. -\n보통 유산균은 알약을 먹었는데\n이건 1회씩 개별포장된 가루를\n물에 흔들어 마시는 타입.\n-\n포장이며 다 마음에 드는데\n처음엔 물을 따라서 흔들어 마시고\n다시 용기를 씻어 두는 과정이\n좀 귀찮았다.\n지금은 익숙해져서 그나마 나은 편.\n-\n약간 분유맛 같은 첫 맛이지만\n마지막엔 너무 단게 흠.\n설탕 알갱이 같은게 바닥에 남아서\n한 번 더 흔들어 마셔야 한다.\n이건 좀 바뀌었으면.\n-\n유산균 덕분인지 모르겠지만\n화장실에 잘 가고 있음.\n두 달 분 사두었는데\n괜찮으면 장복하는 걸로. -\n-\n-\n#인테이크 #intake #휴먼바이오틱스 #유산균 #포장이예쁘니 #효과도좋을거같고,인테이크/intake/휴먼바이오틱스/유산균/포장이예쁘니/효과도좋을거같고,,19,https://www.instagram.com/p/BeU2Xe0BWpD/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
2,dionycchus,2018-01-24 07:43:53,dionycchus#인테이크\n올해엔 건강해져볼까 합니다ㅎㅎ\n이러고 또 술처먹겠지🤣\n健康のために！,인테이크,,11,https://www.instagram.com/p/BeUwLe9lTrg/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
3,invincible_skhyun,2018-01-23 12:55:54,invincible_skhyun#식단일기_180123\n\n다시 마음잡고 시작해보는 식단일기!\n.\n아침: 인테이크 소이밀크 퓨어100\n점심: 아임웰 라이트밀 카레큐브\n간식: 반숙계란 1개\n저녁: 아임닭 스테이크 매콤단호박+토마토+브로콜리+풀무원낫토+파리바게트 미니치즈케익 1/4\n운동: 기구필라테스 50분\n.\n필라테스는 역시 재미있다. 아임닭 스테이크는 처음 먹어보는데 굿굿....\n.\n#인테이크 #인테이크소이밀크 #아임웰 #아임웰라이트밀 #아임닭 #아임닭스테이크 #토미토 #브로콜리 #풀무원 #풀무원생낫토 #치즈케익 #식단일기,식단일기_180123/인테이크/인테이크소이밀크/아임웰/아임웰라이트밀/아임닭/아임닭스테이크/토미토/브로콜리/풀무원/풀무원생낫토/치즈케익/식단일기/park_jin_huk/shin_jii0/iamwell_official/iamwell_official/iamwell_official,,,https://www.instagram.com/p/BeSvF-eF6wk/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
4,diet.oneul,2018-01-23 12:09:54,diet.oneul#오늘의식단\n#저녁\n인테이크 모닝죽 단팥.\n냉동야채+닭쌤닭가슴살소세지.\n#식단 #식단일기 #인테이크 #모닝죽,오늘의식단/저녁/식단/식단일기/인테이크/모닝죽,,15,https://www.instagram.com/p/BeSp1AIl7tT/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
5,styleshare_beauty,2018-01-23 08:13:52,styleshare_beauty[#뷰티실험실 : 다이어트 핫템 4🔥]\n굶는 다이어트는 이제 그만! 건강한 다이어트를 도와줄 #인테이크 의 핫템 4가지를 먹어봤어요🥛🍶🍪💊 이젠 다이어트 대용식으로 똑똑하게 다이어트해요🙌\n✔️스쉐스토어에서 최대 55% 할인중!\n-\n#스타일쉐어 #스쉐스토어 #다이어트 #대용식 #식단조절,뷰티실험실/인테이크/스타일쉐어/스쉐스토어/다이어트/대용식/식단조절,,,https://www.instagram.com/p/BeSOvOigMHJ/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
6,ddong940423,2018-01-23 07:59:04,ddong940423#intake #인테이크 #밀스3 #이벤트\n1.23 무료체험 이벤트. 아침 굶을 필요가 없어지겠다,intake/인테이크/밀스3/이벤트/stopsilver425/hello.oioi/ddong940423/stopsilver425/gyeong._2/ddong940423,,19,https://www.instagram.com/p/BeSNHz-jPT5/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
7,ssujjeong_,2018-01-23 07:32:40,ssujjeong_군것질을 절대 하지않기위해 다이어트쿠키를 샀당\n다욧하는사람들 많우니까 4박스💪ㅋㅋ\n리아가 챙겨보내준 다른애들도 마시또😗\n.\n.\n.\n#몸스터즈#다이어트쿠키#다이어트시작한닷,몸스터즈/다이어트쿠키/다이어트시작한닷/운동하는여자/운동하는남자/운동/헬스타그램/비키니선수/비키니모델/다이어트/다이어트간식/다이어트식단/인테이크/모닝죽/다이어터/벌크업/살크업/광주/광주헬스/광주pt/팀준/일상/데일리/오오티디/ootd/셀카/셀스타그램/selfie/leahmkim/gracejeong_/jung_es_,ssujjeong_.\n.\n.\n#운동하는여자#운동하는남자#운동#헬스타그램#비키니선수#비키니모델#다이어트#다이어트간식#다이어트식단#인테이크#모닝죽#다이어터#벌크업#살크업#광주#광주헬스#광주pt#팀준#일상#데일리#오오티디#ootd#셀카#셀스타그램#selfie/leahmkim아이고 고갱님♥️♥️😍/ssujjeong_@leahmkim ㅋㅋ네 대펴님ㅋㅋ죽 마시땅😛😛💕/parksoh0324좋아요 꾹 누르고 갑니다 ^^/jung_es_진짜맛잇음ㅋㅋㅋ👍👍/topfrancesnap😁👍와우/gracejeong_언니 맛있어요? 크크 추천받아야겠넹😻/ssujjeong_@gracejeong_ 맛있다구하면 살찔것만같앙😐ㅋㅋ 카카오로만 샀는뎅 나는 오리지널이 맛있졉ㅋㅋ 담백한 쿠키🍪👍/ssujjeong_@jung_es_ 언니 내가 지금 먹어봤눈뎅 다른맛이 더 맛나당ㅋㅋ,132,https://www.instagram.com/p/BeSKGh5jRet/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
8,dameulstudio,2018-01-23 05:54:14,dameulstudio모닝죽 우유🥛\n_\n_\nClient: 인테이크\nFoodstyling: @foodstylesun\nPhotographer: @wonkyup\n_\n_\n#다믈스튜디오 #푸드스튜디오 #송파 #인테이크 #모닝죽 #우유 #다이어트 #스튜디오 #프랜차이즈 #메뉴촬영 #음식사진 #푸드스타일링 #푸드스타일리스트 #브런치 #온더테이블 #먹스타그램 #플레이팅 #푸드포토그래퍼 #푸드포토 #일상 #food #foodie #foodstagram #photography #foodphotography #milk #soup,foodstylesun/wonkyup/다믈스튜디오/푸드스튜디오/송파/인테이크/모닝죽/우유/다이어트/스튜디오/프랜차이즈/메뉴촬영/음식사진/푸드스타일링/푸드스타일리스트/브런치/온더테이블/먹스타그램/플레이팅/푸드포토그래퍼/푸드포토/일상/food/foodie/foodstagram/photography/foodphotography/milk/soup,rimrim0408행복한 하루 되세요!/baggieatsim so hungry right now :),71,https://www.instagram.com/p/BeR-1iJHwmH/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
9,catharen_,2018-01-23 00:22:17,catharen_:\n생명 유지 수단이라고나 할까\n\n#일상 #작업일기 #인테이크 #두유 #모닝스타그램 #dailylife #dailywork #soymilkforhealth #morningstagram,일상/작업일기/인테이크/두유/모닝스타그램/dailylife/dailywork/soymilkforhealth/morningstagram,harumulgorae잘보고 가용!/tangbong_91잘보고 가요 소통해요 😎,48,https://www.instagram.com/p/BeRY2S6FDRW/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC


In [4]:
# Drop rows in which any of the listed words occurs in the given column
# (filter_TF=False keeps only the rows that did NOT match).
for column, unwanted in (
    ('hashtags', ['자동차', '흡기', '배기','도어락']),
    ('user_id', ['intakefoods', 'dameulstudio', '_.ddo2', '__scarlett.k', '0.8l_korea', 'jiseung86', 'untactmarket']),
    ('main_text', ['자동차', '흡기', '배기','도어락']),
):
    itkinsta.df = itkinsta.word_check('or', unwanted, itkinsta.df, column, False)


conver

temp = df_new_intake.loc[df_new['main_text'].str.contains('맛') & df_new['main_text'].str.contains('모닝죽') & df_new['main_text'].str.contains('있')]['']
temp.describe()

In [5]:
# Tokenize the main_text of every analyzer; morph_pos stores the result in
# each analyzer's df['morph_list'].
for analyzer in (
    itkinsta, itkshop, itkblog,
    lnsinsta, lnsshop, lnsblog,
    pckinsta, pckshop, pckblog,
    kgcinsta, kgcshop,
):
    analyzer.morph_pos(analyzer.df['main_text'])

# Kept as the final bare expression so the cell still displays its result.
kgcblog.morph_pos(kgcblog.df['main_text'])


[['월드_NNG',
  '디제이_NNG',
  '페스티벌_NNG',
  '후기_NNG',
  '월_NNG',
  '디페_NNG',
  '다녀왔_VV+EP',
  '어요_EF',
  '정관장_NNG',
  '에브리타임_NNG',
  '먹_VV',
  '고_EC',
  '버_NNP',
  '닝_NNP',
  '내_NP+JKG',
  '생애_NNG',
  '첫_MM',
  '월드_NNG',
  '디제이_NNG',
  '페스티벌_NNG',
  '다른_MM',
  '축제_NNG',
  '는_JX',
  '많이_MAG',
  '가_VV+EC',
  '봤_VX+EP',
  '는데_EC',
  '월_NNG',
  '디페_NNG',
  '랑_JKB',
  '은_JX',
  '이상_NNG',
  '하_XSV',
  '게_EC',
  '인연_NNG',
  '이_JKS',
  '없_VA',
  '더라고요_EC',
  'ㅠㅠ_UNKNOWN',
  '올해_NNG',
  '는_JX',
  '드디어_MAG',
  '다녀왔_VV+EP',
  '어요_EF',
  '저_NP',
  '는_JX',
  '일요_NNG',
  '일_NR',
  '원_NNBC',
  '데이_NNG',
  '권_XSN',
  '으로_JKB',
  '다녀왔_VV+EP',
  '는데_EC',
  '토요일_NNG',
  '은_JX',
  '많이_MAG',
  '더웠_VA+EP',
  '다고_EC',
  '하_VV',
  '더라고요_EC',
  '일요일_NNG',
  '선택_NNG',
  '한_XSA+ETM',
  '거_NNB',
  '아주_MAG',
  '칭찬_NNG',
  '해_XSV+EC',
  '잠실_NNP',
  '종합_NNG',
  '운동장_NNG',
  '서울특별시_NNP',
  '송파구_NNP',
  '올림픽로_NNP',
  '서울_NNP',
  '종합_NNG',
  '운동장_NNG',
  '지도_NNG',
  '보_VV',
  '기_ETN',
  '우리_NP',
  '집_NNG',
  '에서_JKB',
  '

In [34]:
# Display the token lists that morph_pos stored for the kgcshop analyzer.
kgcshop.df['morph_list']

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   [교수_NNG, 님_XSN, 께_JKB, 드릴_VV+ETM, 선물_NNG, 매번_MAG, 요기_NP, 서_JKB, 주문_NNG, 해요_XSV+EF, 현대_NNG, 백화점_NNG, 에서_JKB, 가_VV, 는_ETM, 거_NNB, 로_JKB, 가_VV, 구요_EF, ㅜㅜ_UNKNOWN, 이번_NNG, 부터_JX, 우체국_NNG, 택배_NNG, 라_VCP+EC, 별_MM, 하나_NR, 빼_VV, 요_EC, 기존_NNG, 처럼_JKB, 현대_NNG, 백화점_NNG, 서_JKB, 같이_MAG, 보내_VV+EC, 주_VX, 지_EF, ㅜㅜ_UNKNOWN]
1                                                                                                                                                                                              

In [6]:
# Derive nav/noun/adj/verb word lists from each analyzer's morph_list,
# keeping duplicates ('list') and passing None as the filter degree.
for analyzer in (
    itkinsta, itkshop, itkblog,
    lnsinsta, lnsshop, lnsblog,
    pckinsta, pckshop, pckblog,
    kgcinsta, kgcshop, kgcblog,
):
    analyzer.pos_extractor(analyzer.df['morph_list'], 'list', None)

In [7]:
# Flatten every analyzer's per-document nav_list into one long word list.
for analyzer in (
    itkinsta, itkshop, itkblog,
    lnsinsta, lnsshop, lnsblog,
    pckinsta, pckshop, pckblog,
    kgcinsta, kgcshop, kgcblog,
):
    analyzer.nav_merged = analyzer.merge_list(analyzer.df['nav_list'])


In [8]:
# Display the flattened word list of the itkinsta analyzer.
itkinsta.nav_merged

['식단',
 '일기',
 '아침',
 '인테이크',
 '소이',
 '밀크',
 '블랙',
 '점심',
 '아임',
 '굿',
 '밸런스',
 '라이스',
 '레드',
 '갈릭',
 '가슴살',
 '아임',
 '훈제',
 '가슴살',
 '채소',
 '믹스',
 '브로콜리',
 '아임',
 '프랑크',
 '저녁',
 '곤약',
 '파스타',
 '그릴',
 '가슴살',
 '사태',
 '수육',
 '운동',
 '낸시',
 '트레이닝',
 '점심',
 '굿',
 '밸런스',
 '라이스',
 '가슴살',
 '채소',
 '저녁',
 '오뚜기',
 '프레스코',
 '스파게티',
 '소스',
 '채소',
 '믹스',
 '토마토',
 '곤약',
 '인테이크',
 '인테이크',
 '소이',
 '밀크',
 '아임',
 '아임',
 '굿',
 '밸런스',
 '라이스',
 '아임',
 '아임',
 '훈제',
 '가슴살',
 '아임',
 '소시지',
 '곤약',
 '파스타',
 '그릴',
 '가슴살',
 '요즘',
 '유산균',
 '인테이크',
 '판매',
 '주문',
 '인테이크',
 '밀스',
 '괜찮',
 '보통',
 '유산균',
 '알약',
 '개별',
 '포장',
 '가루',
 '물',
 '흔들',
 '마시',
 '타입',
 '포장',
 '마음',
 '처음',
 '물',
 '따라서',
 '흔들',
 '마시',
 '용기',
 '과정',
 '귀찮',
 '분유',
 '맛',
 '맛',
 '마지막',
 '단게',
 '설탕',
 '알갱이',
 '바닥',
 '흔들',
 '마셔야',
 '바뀌',
 '유산균',
 '덕분',
 '모르',
 '화장실',
 '달',
 '괜찮',
 '장복',
 '걸로',
 '인테이크',
 '휴먼바이오틱스A1',
 '유산균',
 '포장',
 '예쁘',
 '효과',
 '인테이크',
 '올해',
 '건강',
 '이러',
 '처먹',
 '식단',
 '일기',
 '마음잡',
 '시작',
 '식단',
 '일기',
 '아침',
 '인테이크',
 '소이밀크퓨어',
 '점심',


In [9]:
# Check whether this word is present in itkinsta's flattened word list.
'독립운동' in itkinsta.nav_merged

True

In [16]:
# Check whether this word is present in itkshop's flattened word list.
'모닝죽단호박' in itkshop.nav_merged

True

In [10]:
# Let pandas show up to 2000 characters per cell instead of truncating.
pd.options.display.max_colwidth = 2000

In [11]:
# One combined word list per brand: Instagram, then shop, then blog.
itk_merged = [*itkinsta.nav_merged, *itkshop.nav_merged, *itkblog.nav_merged]
lns_merged = [*lnsinsta.nav_merged, *lnsshop.nav_merged, *lnsblog.nav_merged]
pck_merged = [*pckinsta.nav_merged, *pckshop.nav_merged, *pckblog.nav_merged]
kgc_merged = [*kgcinsta.nav_merged, *kgcshop.nav_merged, *kgcblog.nav_merged]

In [12]:
# Treat each brand's merged word list as one document and rank its words
# by TF-IDF weight.
brand_documents = [itk_merged, lns_merged, pck_merged, kgc_merged]
tfidf = SB_Tfidf(brand_documents)
tfidf_of_all = tfidf.get_tfidf()

# Show the 40 highest-weighted words of every brand.
for ranked_terms in tfidf_of_all:
    pprint(ranked_terms[:40])
    print()

[('인테이크', 0.7669878331906439),
 ('밀스', 0.49418232270261303),
 ('모닝죽', 0.14963809470434392),
 ('파워젤부스트', 0.13029457176448128),
 ('식사대용', 0.11945120183700719),
 ('밀스라이트', 0.1017368574051429),
 ('고구마죽', 0.07897992877504516),
 ('모닝귀리', 0.0771950716275865),
 ('모닝죽단호박', 0.07518710733669552),
 ('슈퍼바', 0.07184050018521056),
 ('아미노리커버', 0.07094807161148124),
 ('미래식사', 0.06894010732059026),
 ('다이어트칩', 0.05979271443986469),
 ('홍삼젤리스틱', 0.05577678585808274),
 ('모닝죽꿀고구마', 0.048637357268248145),
 ('아침대용', 0.04611371977893766),
 ('현정', 0.04506764297333085),
 ('밀스소이', 0.04060550010468423),
 ('향신료', 0.04060550010468423),
 ('모닝그래놀라', 0.039936178674387236),
 ('휴먼바이오틱스A1', 0.03837442867036092),
 ('미래형', 0.03792821438349626),
 ('밀스하프', 0.037258892953199264),
 ('독립운동', 0.03658957152290227),
 ('우유죽', 0.03658957152290227),
 ('모닝죽우유', 0.03480471437544363),
 ('프립', 0.03301985722798498),
 ('컴포트', 0.03257364294112032),
 ('미숫가루', 0.032224045146727526),
 ('슈퍼스무디', 0.032127428654255656),
 ('모닝죽고구마', 0.03123500008052

## csv에 안쓰고 이런식으로도 추가 가능. 하지만 csv에 자동저장되진 않는다.

단어_태그_T/F(받침여부)_원단어의발음

## m이 None인 경우(매치 없음)에는 진짜 맛있다는 것.
## -> '있' 이라는 음절 뒤에 최대 5글자(정규식의 .{0,5})를 사이에 두고 '없' 또는 '않' 이라는 음절이 나오지 않는다는 뜻이다.
## m이 매치 객체인 경우에는 애매한 상황이다. '있' 뒤 최대 5글자를 사이에 두고 '없' 또는 '않' 이라는 음절이 존재한다는 것이다.

In [20]:
import re

# '맛', then '있' within 5 characters, then '없' or '않' within 5 characters:
# a hit means the "tasty" phrase is negated shortly afterwards.
negated_taste = re.compile('맛.{0,5}있.{0,5}[없않]')

for sentence in (
    '안녕하세요. 맛이 있진 않다.. 다만 아쉬운 것은 포장지가 없다.',
    '안녕하세요. 맛이 있다. 다만 아쉬운 것은 포장지가 없다.',
):
    m = negated_taste.search(sentence)
    print(m)
    print('맛있다는 뜻' if m is None else '맛 없다는 뜻\n원문: ' + m.group())


<_sre.SRE_Match object; span=(7, 14), match='맛이 있진 않'>
맛 없다는 뜻
원문: 맛이 있진 않
None
맛있다는 뜻
