### 단어\_태그\_T/F(받침여부)_원단어의발음
### https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=4
# 명사 NNG, 동사 VV, 형용사 VA


In [16]:
from konlpy.tag import Mecab
import pickle
import re
import sys
from gensim.models import TfidfModel
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
from pprint import pprint
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import mglearn
from pprint import pprint
import numpy as np
import gc
import copy
import pandas as pd


class SB_Word2Vec():
    """Word2Vec embedding plus a word-to-sentence relevance matrix.

    On construction the tokenized corpus is turned into a bag-of-words
    corpus, a Word2Vec model is trained on it, and ``word2sen_matrix`` is
    computed as ``sim_matrix @ sen_matrix.T`` (rows: vocabulary words,
    columns: sentences).
    """

    def __init__(self, morph_list):
        """Build every derived structure from ``morph_list`` (list of token lists)."""
        self.dct = Dictionary(morph_list)
        self.corpus = [self.dct.doc2bow(line) for line in morph_list]
        self.build_Word2Vec(morph_list)

    def make_Word2Vec(self, morph_list, size=50, window=2, min_count=10, iteration=100):
        """Train the embedding and index its vocabulary.

        Sets ``self.em`` (the model), ``self.em_vocab`` (list of words) and
        ``self.em_vocab_dic`` (word -> position in ``em_vocab``).
        """
        # NOTE(review): `size=`, `iter=` and `wv.vocab` look like the
        # pre-4.0 gensim API -- confirm the pinned gensim version.
        self.em = Word2Vec(morph_list, size=size, window=window, min_count=min_count, iter=iteration)
        self.em_vocab = list(self.em.wv.vocab.keys())
        self.em_vocab_dic = {word: idx for idx, word in enumerate(self.em_vocab)}

    def make_Word2Sen_matrix(self):
        """Compute and return the (vocabulary x sentence) relevance matrix.

        ``sen_matrix[s][w]`` holds the frequency of vocabulary word ``w`` in
        sentence ``s``; ``sim_matrix[a][b]`` is the model similarity of two
        vocabulary words. Their product is stored in ``word2sen_matrix``.
        """
        vocab_size = len(self.em_vocab)
        self.sen_matrix = np.zeros((len(self.corpus), vocab_size))
        for row, bow in enumerate(self.corpus):
            for token_id, frequency in bow:
                # Dict lookup instead of scanning the vocabulary list for
                # every token; words dropped by min_count are skipped.
                col = self.em_vocab_dic.get(self.dct[token_id])
                if col is not None:
                    self.sen_matrix[row][col] = frequency

        self.sim_matrix = np.zeros((vocab_size, vocab_size))
        for idx, w1 in enumerate(self.em_vocab):
            for idx2, w2 in enumerate(self.em_vocab):
                self.sim_matrix[idx][idx2] = self.em.wv.similarity(w1, w2)

        self.word2sen_matrix = np.dot(self.sim_matrix, np.transpose(self.sen_matrix))

        return self.word2sen_matrix

    def get_sim_sen(self, keyword, main_text, number=1):
        """Print the ``number`` sentences most related to ``keyword``.

        Returns the full list of sentence indices ordered from most to
        least related. Raises ``KeyError`` when ``keyword`` is not in the
        embedding vocabulary.
        """
        scores = self.word2sen_matrix[self.em_vocab_dic[keyword]]
        self.sim_sen_index = np.argsort(scores)
        self.most_sim_sen_index = np.argmax(scores)
        index_list = self.sim_sen_index.reshape((-1,)).tolist()
        index_list.reverse()

        for idx, i in enumerate(index_list[:number]):
            print(str(idx + 1))
            print(main_text[i])
        return index_list

    def build_Word2Vec(self, morph_list):
        """Train the embedding, then derive the relevance matrix."""
        self.make_Word2Vec(morph_list)
        self.make_Word2Sen_matrix()
        
        
class SB_LDA():
    """Topic modelling helper around CountVectorizer + LatentDirichletAllocation."""

    def make_lda(self, morph_joined, ntopic=10, learning_method='batch', max_iter=25, random_state=0, n_words=20):
        """Fit an LDA model on ``morph_joined`` and print the top words per topic.

        ``morph_joined`` is an iterable of documents, each a single string.
        Sets ``vect``, ``X``, ``lda``, ``document_topics``, ``sorting`` and
        ``feature_names`` on the instance.
        """
        self.vect = CountVectorizer(max_features=10000, max_df=.15)
        self.X = self.vect.fit_transform(morph_joined)
        self.lda = LatentDirichletAllocation(n_components=ntopic, learning_method=learning_method, max_iter=max_iter, random_state=random_state)
        self.document_topics = self.lda.fit_transform(self.X)
        # Per topic, feature indices ordered from highest to lowest weight.
        self.sorting = np.argsort(self.lda.components_, axis=1)[:, ::-1]
        # Newer scikit-learn releases only provide get_feature_names_out();
        # fall back to the legacy name when it is not available.
        get_names = getattr(self.vect, 'get_feature_names_out', None)
        if get_names is None:
            get_names = self.vect.get_feature_names
        self.feature_names = np.array(get_names())
        mglearn.tools.print_topics(topics=range(ntopic), feature_names=self.feature_names, sorting=self.sorting, topics_per_chunk=5, n_words=n_words)

    def related_doc(self, main_text_list, topic_index, number=10):
        """Print and return the ``number`` documents weighted highest for a topic.

        Returns a list of ``(document_index, text)`` tuples, most related first.
        """
        category = np.argsort(self.document_topics[:, topic_index])[::-1]
        related_docs = []
        for i in category[:number]:
            print(i)
            print(main_text_list[i] + ".\n")
            related_docs.append((i, main_text_list[i]))
        return related_docs

class SB_Tfidf():
    """TF-IDF scoring of tokenized documents."""

    def __init__(self, list_morph_merged):
        """Keep the token lists and derive their dictionary and bag-of-words corpus."""
        self.list_morph_merged = list_morph_merged
        self.dct = Dictionary(self.list_morph_merged)
        self.corpus = [self.dct.doc2bow(line) for line in self.list_morph_merged]

    def get_tfidf(self):
        """Return, per document, ``(word, weight)`` pairs sorted by descending weight.

        Also stores the model in ``self.model``, the id-keyed result in
        ``self.tfidf`` and the word-keyed result in ``self.tfidf_hangul``.
        """
        self.model = TfidfModel(self.corpus)
        self.tfidf = [
            sorted(self.model[bow], key=lambda pair: pair[1], reverse=True)
            for bow in self.corpus
        ]
        # Translate token ids back to their surface strings.
        self.tfidf_hangul = [
            [(self.dct[pair[0]], pair[1]) for pair in weighted]
            for weighted in self.tfidf
        ]

        return self.tfidf_hangul
    
def frequency(merged):
    """Return ``(item, count)`` pairs for ``merged``, most frequent first.

    Items with equal counts keep the order in which they were first seen.
    """
    return Counter(merged).most_common()




class Social_analysis():
    
    # Translation table for str.translate: every code point from U+10000
    # upward is replaced with U+FFFD.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    # NOTE(review): the containers below are class attributes, so unless an
    # instance rebinds them they are shared (and mutated) across instances.
    # Filled by load_dictionary: first '_'-separated part of value -> synonym.
    syn_dic = {}
    # Filled by load_dictionary: first '_'-separated part of value -> theme.
    theme_dic = {}
    # Words removed by filter_words.
    del_list = []
    # Filled by load_dictionary: key -> value; looked up by ngram().
    ngram_dic = {}
    # Single-character words that pos_extractor keeps despite its length check.
    exception_list=['맛', '밥', '물', '몸', '없', '있', '싫', '달', '굳', '굿', '속']

    # CSV read by load_dictionary.
    default_dic_path = 'Data/custom_dic.csv'
    # Not referenced by any method visible in this file.
    replace_dic = 'Data/replace_dic.csv'
    
    def __init__(self, corpname):
        """Create the analyzer for one brand.

        Stores ``corpname``, creates the Mecab tagger and then tries to load
        the custom dictionary; a failure there is only reported on stdout.
        """
        self.corp_name = corpname
        self.mecab = Mecab()

        try:
            self.load_dictionary()
        except Exception as err:
            print('dictionary error\n', err)
            
    def load_dictionary(self):
        path = self.default_dic_path
        self.dic_df = pd.read_csv(path, encoding='cp949')
        self.dic_dif = self.dic_df.astype(str)
        for i in range(len(self.dic_df)):
            key = self.dic_df.loc[i,'key']
            value = self.dic_df.loc[i, 'value']
            syn = self.dic_df.loc[i, 'syn']
            theme = self.dic_df.loc[i, 'theme']

            if pd.isna(value):
                print('Need key & value')
                return
            self.ngram_dic[key] = value
            
            if not pd.isna(syn):
                self.syn_dic[value.split('_')[0]] = syn
            
            if not pd.isna(theme):
                self.theme_dic[value.split('_')[0]] = theme
            
    def DB_to_table(self, DBname='intake', keyword='intake'):
        """Load the ``LogIntake`` web-log table into ``self.df``.

        Args:
            DBname: database to connect to.
            keyword: accepted for signature parity with the other loaders;
                the query does not filter on it.

        The executed SQL is kept in ``self.query``. The connection is closed
        even when the query fails.
        """
        import pymssql
        import pandas.io.sql as pdsql
        self.query = \
        """
        SELECT LBip, CDNip, logip, logdate, before_url, after_url, device_info, device, os, app FROM LogIntake
        """
        # NOTE(review): host and credentials are hard-coded in source; move
        # them to configuration / environment before sharing this file.
        conn = pymssql.connect("175.114.47.85", "gh", "ghintake", DBname)
        try:
            self.df = pdsql.read_sql_query(self.query, con=conn)
        finally:
            conn.close()
    
    def sort_by_ip(self):
        ip_list=[]
        for i in range(len(self.df)):
            ip_list.append(self.df['logip'][i])
        self.ip_set = list(set(ip_list))
    
    def insta_DB_to_table(self, DBname='intake', keyword='intake'):
        """Load Instagram postings for ``keyword`` into ``self.df``.

        Args:
            DBname: database to connect to.
            keyword: value matched against the ``keyword`` column.

        ``self.query`` holds the SQL with a ``%s`` placeholder and
        ``self.query_params`` the bound values, so the keyword is passed to
        the driver as a parameter rather than pasted into the SQL text.
        The connection is closed even when the query fails.
        """
        import pymssql
        import pandas.io.sql as pdsql
        self.query = \
        """
        SELECT user_id, created_at, main_text, hashtags, comments, likes, current_url FROM instaPosting WHERE keyword = %s
        """
        self.query_params = (keyword,)
        # NOTE(review): host and credentials are hard-coded in source; move
        # them to configuration / environment before sharing this file.
        conn = pymssql.connect("intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh", "ghintake", DBname)
        try:
            self.df = pdsql.read_sql_query(self.query, con=conn, params=self.query_params)
        finally:
            conn.close()
        
    def yj1_DB_to_table(self, DBname='intake'):
        """Load 'intake' Instagram postings that mention 선물 into ``self.df``.

        Postings by ``intakefoods`` and texts containing ``Regrann`` or
        ``repost`` are excluded by the query. The connection is now closed
        after the read (it was previously left open).
        """
        import pymssql
        import pandas.io.sql as pdsql
        # NOTE(review): the doubled '%%' is sent as-is when no parameters
        # are bound; presumably LIKE treats it like a single '%' -- confirm.
        self.query = \
        """
        Select main_text 
        from instaposting 
        where keyword = 'intake'
        and main_text like N'%%선물%%'
        and user_id not like 'intakefoods'
        and main_text not like '%%Regrann%%'
        and main_text not like '%%repost%%'
        """
        # NOTE(review): host and credentials are hard-coded in source; move
        # them to configuration / environment before sharing this file.
        conn = pymssql.connect("intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh", "ghintake", DBname)
        try:
            self.df = pdsql.read_sql_query(self.query, con=conn)
        finally:
            conn.close()
        
    def yj2_DB_to_table(self, DBname='intake'):
        """Load 'labnosh' Instagram postings that mention 선물 into ``self.df``.

        Postings by ``atemshop.official`` and texts containing the official
        account names, ``Regrann`` or ``repost`` are excluded by the query.
        The connection is now closed after the read (it was previously left
        open).
        """
        import pymssql
        import pandas.io.sql as pdsql
        # NOTE(review): the doubled '%%' is sent as-is when no parameters
        # are bound; presumably LIKE treats it like a single '%' -- confirm.
        self.query = \
        """
        select main_text
        from instaposting
        where keyword = 'labnosh'
        and main_text like N'%%선물%%'
        and user_id not like 'atemshop.official'
        and main_text not like '%%labnosh.official%%'
        and main_text not like '%%Regrann%%'
        and main_text not like '%%labnosh_official%%'
        and main_text not like '%%repost%%'
        """
        # NOTE(review): host and credentials are hard-coded in source; move
        # them to configuration / environment before sharing this file.
        conn = pymssql.connect("intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh", "ghintake", DBname)
        try:
            self.df = pdsql.read_sql_query(self.query, con=conn)
        finally:
            conn.close()
        
    
    
    def shop_DB_to_table(self, DBname='intake', keyword='intake'):
        """Load shopping-mall product reviews for ``keyword`` into ``self.df``.

        Args:
            DBname: database to connect to.
            keyword: value matched against the ``keyword`` column.

        ``self.query`` holds the SQL with a ``%s`` placeholder and
        ``self.query_params`` the bound values, so the keyword is passed to
        the driver as a parameter rather than pasted into the SQL text.
        The connection is closed even when the query fails.
        """
        import pymssql
        import pandas.io.sql as pdsql
        self.query = \
        """
        SELECT mall, keyword, productCode, date, user_id, productScore, recommScore, main_text FROM VproductReview WHERE keyword = %s
        """
        self.query_params = (keyword,)
        # NOTE(review): host and credentials are hard-coded in source; move
        # them to configuration / environment before sharing this file.
        conn = pymssql.connect("intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh", "ghintake", DBname)
        try:
            self.df = pdsql.read_sql_query(self.query, con=conn, params=self.query_params)
        finally:
            conn.close()
              
        
    def blog_DB_to_table(self, DBname='intake', keyword='intake'):
        """Load Naver blog reviews for ``keyword`` into ``self.df``.

        Args:
            DBname: database to connect to.
            keyword: value matched against the ``keyword`` column.

        ``self.query`` holds the SQL with a ``%s`` placeholder and
        ``self.query_params`` the bound values, so the keyword is passed to
        the driver as a parameter rather than pasted into the SQL text.
        The connection is closed even when the query fails.
        """
        import pymssql
        import pandas.io.sql as pdsql
        self.query = \
        """
        SELECT keyword, created_at, post_name, main_text, current_url FROM NaverBlogReview WHERE keyword = %s
        """
        self.query_params = (keyword,)
        # NOTE(review): host and credentials are hard-coded in source; move
        # them to configuration / environment before sharing this file.
        conn = pymssql.connect("intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh", "ghintake", DBname)
        try:
            self.df = pdsql.read_sql_query(self.query, con=conn, params=self.query_params)
        finally:
            conn.close()
              
    
    def pickle_to_table(self, filename, columns=['user_id', 'created_at', 'main_text', 'hashtags', 'comments', 'likes', 'current_url']):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.df = pd.DataFrame(np.array(data), columns=['user_id', 'created_at', 'main_text', 'hashtags', 'comments', 'likes', 'current_url'])

  
    def hashtags_split(self, hashtags):        
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))
        
        hashtags_list = []
        
        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list
        
        return hashtags_list

    
    def add_dictionary(self, *tokenized_list):
        origin_df = 1
        try:
            origin_df = pd.read_csv("C:\\mecab\\user-dic\\intake_dic.csv", encoding='utf-8', header=None)
        except:
            print('No default intake_dic')
        keyword_list = []
        for i in tokenized_list:
            if type(i) == list:
                for j in i:
                    j = j.split('_')
                    temp = [j[0],'' ,'' ,'' ,j[1],'*',j[2], j[3],'*','*','*','*','*']
                    keyword_list.append(temp)
            else:
                i = i.split('_')
                temp = [i[0],'','','',i[1],'*',i[2], i[3], '*','*','*','*','*']
                keyword_list.append(temp)


        keyword_df = pd.DataFrame(keyword_list)
        print(type(origin_df))
        if type(origin_df) != int:
            keyword_df = pd.concat((origin_df, keyword_df), ignore_index=True)
        else: 
            print('a')
            pass
        print(keyword_df.shape)

        keyword_df.to_csv("C:\\mecab\\user-dic\\intake_dic.csv", encoding='utf-8',index=None, header=False)    

        
    def ngram(self, parsed_list):
        ngram_list = []        
        adjustment = 0
        # 단어_tag의 리스트

        for idx in range(len(parsed_list)):
            idx2 = idx + adjustment

            if (idx2+self.ngram_size) > (len(parsed_list)):
                ngram_list.extend(parsed_list[idx2:])
                break
            n_filter = tuple(parsed_list[idx2: idx2 + self.ngram_size])
            key = ''.join([k.split('_')[0] for k in n_filter])
            if key in self.ngram_dic:
                ngram_list.append(self.ngram_dic[key])
                adjustment += (self.ngram_size - 1)
            else:
                ngram_list.append(n_filter[0])

        if self.ngram_size <= 1:
            return ngram_list
        else:
            self.ngram_size -= 1
            return self.ngram(ngram_list)       
        
    def morph_pos(self, text_list,  mode='list'):
        
        morph_list = []
        
        for j in text_list:
            parsed = self.mecab.pos(j)
            temp = []
            for i in parsed:
                if self.isHangul(i[0]):
                    temp.append('{}_{}'.format(i[0], i[1]))
                else: pass#print('{} 한글이 아님.'.format(i[0]))

            self.ngram_size = 6
            morph_list.append(self.ngram(temp))
            
        self.df['morph_list'] = morph_list
        
        return morph_list


    def set_with_order(self, sequence):
        seen = set()
        result = [x for x in sequence if not (x in seen or seen.add(x))]
        return result
        
 
    def filter_words(self, parsed_list, mode='syn'):
        # 1차원 리스트를 받음.

        if mode == None:
            deleted_list = [f for f in filter(lambda x: x not in self.del_list, parsed_list)]
            return deleted_list
        
        elif mode == 'syn':
            syn_list = [f.format(self.corp_name) for f in map(lambda x: self.syn_dic.get(x, x), parsed_list)]
            deleted_list = [f for f in filter(lambda x: x not in self.del_list, syn_list)]
            return deleted_list
        
        elif mode == 'theme':
            theme_list = [f.format(self.corp_name) for f in map(lambda x: self.theme_dic.get(x, x), parsed_list)]
            deleted_list = [f for f in filter(lambda x: x not in self.del_list, theme_list)]
            return deleted_list 
        
        else:
            return deleted_list
    
    def pos_extractor(self, parsed, mode = 'list', degree = 'syn'):
        
        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []
        total_list = [nav_list, noun_list, adj_list, verb_list]
        
        for j in parsed:
            nav_temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            temp_list = [nav_temp,  n_temp, adj_temp, verb_temp]
            
            for i in j:
                i = i.split('_')
                if self.isHangul(i[0]):
                    if (len(i[0]) > 1) or (i[0] in self.exception_list):                        
                        if 'NN' in i[1]:
                            n_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif 'VV'in i[1]:
                            adj_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif 'VA' in i[1]:
                            verb_temp.append(i[0])
                            nav_temp.append(i[0])
                    else: pass
                        #print('{} 제외'.format(i[0]))
                else: pass#print('{} 한글이 아님.'.format(i[0]))

            
            for idx, li in enumerate(total_list):
                if mode == 'list':
                    li.append(self.filter_words(temp_list[idx], degree))
                elif mode == 'set':
                    li.append(self.set_with_order(self.filter_words(temp_list[idx], degree)))
                else:
                    print('Check mode')
                    return
            
            
        columns=['nav_list', 'noun_list', 'adj_list', 'verb_list']
        for i in  zip(columns, total_list):
            self.df[i[0]] = i[1]
            
        #return nav_list, noun_list, adj_list, verb_list # tuple(map(lambda x: [j.split('_')[0] for j in x], [nav_list, noun_list, adj_list, verb_list]))
    
    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    
    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list
 
    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized):
            hashtag_splited.append(i.split('/'))
            return hastag_splited
        

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item,item) for item in i]

        del sub_book
        gc.collect()

        return dataset
    
    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
             dataset[n] = [i for i in line if i not in del_list]

        return dataset

    
    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
    
    def convert_list(self, *tokenized_list):
        input_length = len(tokenized_list)
        lists = [[] for i in range(input_length)]

        for idx, li in enumerate(tokenized_list):
            for j in li:
                lists[idx].append(['/'.join(j)])

        converted_array = np.array(lists[0])
        for idx in range(input_length):
            try:
                converted_array = np.concatenate((converted_array, lists[idx + 1]), axis=1)
            except Exception as e:
                print(e,'끝')

        return converted_array

    def make_df(self, start_array, converted_array, end_array, columns=['user_id', 'created_at', 'main_text', 'morph_list', 'nav_list', 'noun_list', 'adj_list', 'verb_list', 'hashtags', 'comments', 'likes', 'current_url']):         
        df = pd.DataFrame(np.hstack((start_array, converted_array, end_array)), index=None, columns=columns)
        return df
    
    # 키워드 리스트 중 하나라도 있는 경우
    def word_check_or(self, text, keywords):
        if any(word in text for word in keywords):
            return 1
        else: return 0

    # 키워드 리스트에 있는 단어가 모두 있는 경우
    def word_check_and(self, text, keywords):
        if all(word in text for word in keywords):
            return 1
        else:
            return 0


    def word_check(self, method, keywords, df, column_name = 'main_text',filter_TF=True):
        
        filter_TF = 1 if filter_TF == True else 0
        if method == 'and':
            df['flags'] = df[column_name].apply(lambda x: self.word_check_and(x, keywords))
            return df.loc[df['flags'] == filter_TF]

        elif method == 'or':
            df['flags'] = df[column_name].apply(lambda x: self.word_check_or(x, keywords))
            return df.loc[df['flags'] == filter_TF]
        
        else:
            print('Select method, and/or')
            
    def df_str(df, *column_names):
        for i in column_names:
            df.loc[:,i] = df.loc[:,i].str.join('/')
        

In [17]:
log = Social_analysis('인테이크')

In [18]:
log.DB_to_table()

In [19]:
log.df

Unnamed: 0,LBip,CDNip,logip,logdate,before_url,after_url,device_info,device,os,app
0,172.30.0.104,54.239.154.112,1.11.158.144,2018-07-16 16:18:18,http://m.mail.daum.net/hanmailex/mobile/Top.da...,https://www.shopintake.com/event/view/139/?utm...,Mozilla/5.0 (iPhone; CPU iPhone OS 11_4 like M...,Mobile,iPhone,Web
1,172.30.0.226,54.239.154.146,1.11.158.144,2018-07-16 16:19:05,https://www.shopintake.com/event/view/139/?utm...,https://www.shopintake.com/event/,Mozilla/5.0 (iPhone; CPU iPhone OS 11_4 like M...,Mobile,iPhone,Web
2,172.30.0.104,54.239.154.112,1.11.158.144,2018-07-16 16:19:27,https://www.shopintake.com/event/,https://www.shopintake.com/event/?page=2,Mozilla/5.0 (iPhone; CPU iPhone OS 11_4 like M...,Mobile,iPhone,Web
3,172.30.0.158,54.239.154.112,1.11.213.30,2018-07-23 14:23:28,https://m.search.naver.com/search.naver?where=...,https://www.shopintake.com/product/view/399/?u...,Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_6 like...,Mobile,iPhone,Web
4,172.30.0.226,54.239.154.146,1.11.62.107,2018-07-26 07:38:24,https://search.naver.com/search.naver?where=ne...,https://www.shopintake.com/?utm_source=naver&u...,Mozilla/5.0 (Windows NT 6.2; Win64; x64) Apple...,PC,Windows,
5,172.30.0.158,54.239.154.112,1.11.62.107,2018-07-26 07:39:15,https://search.naver.com/search.naver?where=ne...,https://www.shopintake.com/?utm_source=naver&u...,Mozilla/5.0 (Windows NT 6.2; Win64; x64) Apple...,PC,Windows,
6,172.30.0.226,54.239.154.146,1.11.62.107,2018-07-26 07:39:53,https://www.shopintake.com/?utm_source=naver&u...,https://www.shopintake.com/category/mealschip/,Mozilla/5.0 (Windows NT 6.2; Win64; x64) Apple...,PC,Windows,
7,172.30.0.226,54.239.154.146,1.11.98.63,2018-07-27 15:13:27,https://m.search.naver.com/search.naver?query=...,https://www.shopintake.com/,Mozilla/5.0 (Linux; Android 8.0; SM-G955N Buil...,Mobile,Android,Web
8,172.30.0.226,54.239.154.146,1.11.98.63,2018-07-27 15:13:38,https://www.shopintake.com/,https://www.shopintake.com/product/view/640/,Mozilla/5.0 (Linux; Android 8.0; SM-G955N Buil...,Mobile,Android,Web
9,172.30.0.158,54.239.154.112,1.11.98.63,2018-07-27 15:15:19,https://m.search.naver.com/search.naver?query=...,https://www.shopintake.com/,Mozilla/5.0 (Linux; Android 8.0; SM-G955N Buil...,Mobile,Android,Web


In [20]:
log.sort_by_ip()

In [21]:
log.ip_set

['58.224.58.84',
 '125.136.153.110',
 '210.222.60.153',
 '175.223.10.134',
 '117.111.16.78',
 '122.46.44.162',
 '211.114.177.245',
 '223.38.17.105',
 '218.158.41.143',
 '175.223.49.201',
 '61.82.91.38',
 '61.4.241.88',
 '14.63.37.131',
 '117.111.1.63',
 '180.68.32.62',
 '39.7.54.109',
 '223.62.10.122',
 '111.118.44.251',
 '210.116.31.234',
 '61.253.157.213',
 '110.70.55.193',
 '175.223.27.20',
 '175.201.245.135',
 '125.133.186.119',
 '210.218.196.72',
 '223.33.153.83',
 '211.218.254.184',
 '222.107.214.246',
 '203.217.242.130',
 '119.202.103.111',
 '112.171.83.193',
 '211.63.137.72',
 '223.62.10.243',
 '175.223.32.148',
 '46.229.168.68',
 '182.161.149.74',
 '121.163.21.205',
 '14.45.31.15',
 '124.60.191.136',
 '211.36.154.193',
 '121.128.123.112',
 '100.43.81.121',
 '124.53.144.52',
 '61.105.133.87',
 '42.114.33.241',
 '95.163.255.168',
 '66.249.79.19',
 '211.117.77.219',
 '112.171.118.207',
 '14.34.1.175',
 '211.248.181.135',
 '14.51.172.100',
 '211.36.150.77',
 '106.243.217.173',
 '1

In [32]:
log.ip_set[1]

'125.136.153.110'

In [69]:
out = log.df.query("after_url == 'https://www.shopintake.com/payment/complete/'")

In [70]:
listed_ip = list(out['logip'])

In [71]:
set_complete_ip = list(set(listed_ip))

In [72]:
set_complete_ip

['122.37.251.50',
 '59.6.176.58',
 '210.222.60.153',
 '121.139.62.108',
 '1.243.141.116',
 '175.223.19.223',
 '14.43.103.243',
 '112.171.9.231',
 '117.111.16.78',
 '1.224.31.94',
 '115.22.138.20',
 '175.117.163.184',
 '223.62.173.152',
 '175.223.48.217',
 '117.111.28.247',
 '223.39.130.19',
 '117.111.26.224',
 '61.4.241.88',
 '59.10.67.44',
 '219.240.64.28',
 '112.154.97.119',
 '1.214.136.26',
 '39.7.54.109',
 '118.221.173.96',
 '125.184.175.89',
 '59.23.84.22',
 '175.118.83.74',
 '175.223.27.20',
 '106.252.48.165',
 '175.223.17.190',
 '211.110.78.209',
 '220.77.48.182',
 '168.131.145.48',
 '117.123.245.254',
 '223.62.175.74',
 '221.150.245.170',
 '1.233.236.156',
 '211.106.189.112',
 '221.138.92.92',
 '223.62.216.241',
 '175.223.18.100',
 '61.75.74.163',
 '121.137.45.187',
 '220.85.106.67',
 '175.200.80.107',
 '175.223.10.214',
 '116.41.234.135',
 '218.149.7.203',
 '222.108.25.250',
 '211.201.194.177',
 '106.255.82.115',
 '218.153.165.171',
 '121.150.157.67',
 '118.32.181.93',
 '211.3

In [73]:
len(set_complete_ip)

1614

In [75]:
completed_logs=[]

In [83]:
# Collect and show the log rows of the first 15 IPs that reached the
# payment-complete page. Slicing avoids an IndexError when fewer than 15
# IPs exist, and the boolean mask avoids building a query string from data.
for ip in set_complete_ip[:15]:
    rows = log.df[log.df['logip'] == ip]
    completed_logs.append(rows)
    print(rows[['after_url','logdate']])

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            after_url  \
87341  https://www.shopintake.com/?utm_source=naver&utm_medium=%EB%B8%8C%EB%9E%9C%EB%93%9C%EA%B2%80%EC%83%89&utm_campaign=NBS%EB%A9%94%EC%9D%B8%EC%9D%B4%EB%AF%B8%EC%A7%80_20180604&utm_content=M&inflow_tag=NBS&n_media=8753&n_query=%EB%AA%A8%EB%8B%9D%EC%A3%BD&n_rank=1&n_ad_group=grp-a001-04-000000006186847&n_ad=nad-a001-04-000000036826509&n_keyword_id=nkw-a001-04-000001153634751&n_keyword=%EB%AA%A8%EB%8B%9D%EC%A3%

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                after_url  \
37783  https://www.shopintake.com/?utm_source=naver&utm_medium=%EB%B8%8C%EB%9E%9C%EB%93%9C%EA%B2%80%EC%83%89&utm_campaign=NBS%ED%83%80%EC%9D%B4%ED%8B%80_20180604&utm_content=PC&inflow_tag=NBS&n_media=27758&n_query=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC&n_rank=1&n_ad_group=grp-a001-04-000000006284014&n_ad=nad-a001-04-000000036826491&n_keyword_id=nkw-a001-04-000001156942984&n_keyword=%EC%9D%B8%ED%85%8C%EC%9D%B4%

TypeError: list indices must be integers or slices, not str

In [53]:
pd.options.display.max_colwidth = 2000

In [56]:
out

Unnamed: 0,LBip,CDNip,logip,logdate,before_url,after_url,device_info,device,os,app
144407,172.30.0.158,54.239.154.112,175.223.49.201,2018-07-24 08:04:42,https://m.search.naver.com/search.naver?query=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC+%EB%AA%A8%EB%8B%9D%EC%A3%BD&where=m&sm=mob_sug.idx&acq=%EB%AA%A8%EB%8B%9D%EC%A3%BD&acr=1&qdt=0,https://www.shopintake.com/?utm_source=naver&utm_medium=%EB%B8%8C%EB%9E%9C%EB%93%9C%EA%B2%80%EC%83%89&utm_campaign=NBS%EB%A9%94%EC%9D%B8%ED%85%8D%EC%8A%A4%ED%8A%B8_20180604&utm_content=M&inflow_tag=NBS&n_media=8753&n_query=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC%EB%AA%A8%EB%8B%9D%EC%A3%BD&n_rank=1&n_ad_group=grp-a001-04-000000006186847&n_ad=nad-a001-04-000000036826509&n_keyword_id=nkw-a001-04-000001140871016&n_keyword=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC%EB%AA%A8%EB%8B%9D%EC%A3%BD&n_campaign_type=4&n_contract=tct-a001-04-000000000061657&NaPm=ct%3Djjzezrtc%7Cci%3D0x410e2oRjnptUdGJvpv%7Ctr%3Dbrnd%7Chk%3Df40a0c1262885ec8615aaf650ef1198cf39dc89d,"Mozilla/5.0 (iPhone; CPU iPhone OS 11_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15G77 NAVER(inapp; search; 590; 8.8.3; 7)",Mobile,iPhone,Web
144408,172.30.0.226,54.239.154.146,175.223.49.201,2018-07-24 08:05:16,https://www.shopintake.com/?utm_source=naver&utm_medium=%EB%B8%8C%EB%9E%9C%EB%93%9C%EA%B2%80%EC%83%89&utm_campaign=NBS%EB%A9%94%EC%9D%B8%ED%85%8D%EC%8A%A4%ED%8A%B8_20180604&utm_content=M&inflow_tag=NBS&n_media=8753&n_query=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC%EB%AA%A8%EB%8B%9D%EC%A3%BD&n_rank=1&n_ad_group=grp-a001-04-000000006186847&n_ad=nad-a001-04-000000036826509&n_keyword_id=nkw-a001-04-000001140871016&n_keyword=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC%EB%AA%A8%EB%8B%9D%EC%A3%BD&n_campaign_type=4&n_contract=tct-a001-04-000000000061657&NaPm=ct%3Djjzezrtc%7Cci%3D0x410e2oRjnptUdGJvpv%7Ctr%3Dbrnd%7Chk%3Df40a0c1262885ec8615aaf650ef1198cf39dc89d,https://www.shopintake.com/product_search/?keyword=%EB%AA%A8%EB%8B%9D%EC%A3%BD,"Mozilla/5.0 (iPhone; CPU iPhone OS 11_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15G77 NAVER(inapp; search; 590; 8.8.3; 7)",Mobile,iPhone,Web
144409,172.30.0.158,54.239.154.112,175.223.49.201,2018-07-24 08:05:42,https://www.shopintake.com/product_search/?keyword=%EB%AA%A8%EB%8B%9D%EC%A3%BD,https://www.shopintake.com/product_search/?keyword=%EB%AA%A8%EB%8B%9D%EC%A3%BD&page=2,"Mozilla/5.0 (iPhone; CPU iPhone OS 11_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15G77 NAVER(inapp; search; 590; 8.8.3; 7)",Mobile,iPhone,Web


In [49]:
len(out)

3

In [22]:
# The loop body only evaluates the column expression and discards the
# result, so this cell has no visible effect.
for i in range(10):
    log.df['logip']

In [11]:
log.df

Unnamed: 0,LBip,CDNip,logip,logdate,before_url,after_url,device_info,device,os,app
0,172.30.0.104,54.239.154.112,1.11.158.144,2018-07-16 16:18:18,http://m.mail.daum.net/hanmailex/mobile/Top.da...,https://www.shopintake.com/event/view/139/?utm...,Mozilla/5.0 (iPhone; CPU iPhone OS 11_4 like M...,Mobile,iPhone,Web
1,172.30.0.226,54.239.154.146,1.11.158.144,2018-07-16 16:19:05,https://www.shopintake.com/event/view/139/?utm...,https://www.shopintake.com/event/,Mozilla/5.0 (iPhone; CPU iPhone OS 11_4 like M...,Mobile,iPhone,Web
2,172.30.0.104,54.239.154.112,1.11.158.144,2018-07-16 16:19:27,https://www.shopintake.com/event/,https://www.shopintake.com/event/?page=2,Mozilla/5.0 (iPhone; CPU iPhone OS 11_4 like M...,Mobile,iPhone,Web
3,172.30.0.158,54.239.154.112,1.11.213.30,2018-07-23 14:23:28,https://m.search.naver.com/search.naver?where=...,https://www.shopintake.com/product/view/399/?u...,Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_6 like...,Mobile,iPhone,Web
4,172.30.0.226,54.239.154.146,1.11.62.107,2018-07-26 07:38:24,https://search.naver.com/search.naver?where=ne...,https://www.shopintake.com/?utm_source=naver&u...,Mozilla/5.0 (Windows NT 6.2; Win64; x64) Apple...,PC,Windows,
5,172.30.0.158,54.239.154.112,1.11.62.107,2018-07-26 07:39:15,https://search.naver.com/search.naver?where=ne...,https://www.shopintake.com/?utm_source=naver&u...,Mozilla/5.0 (Windows NT 6.2; Win64; x64) Apple...,PC,Windows,
6,172.30.0.226,54.239.154.146,1.11.62.107,2018-07-26 07:39:53,https://www.shopintake.com/?utm_source=naver&u...,https://www.shopintake.com/category/mealschip/,Mozilla/5.0 (Windows NT 6.2; Win64; x64) Apple...,PC,Windows,
7,172.30.0.226,54.239.154.146,1.11.98.63,2018-07-27 15:13:27,https://m.search.naver.com/search.naver?query=...,https://www.shopintake.com/,Mozilla/5.0 (Linux; Android 8.0; SM-G955N Buil...,Mobile,Android,Web
8,172.30.0.226,54.239.154.146,1.11.98.63,2018-07-27 15:13:38,https://www.shopintake.com/,https://www.shopintake.com/product/view/640/,Mozilla/5.0 (Linux; Android 8.0; SM-G955N Buil...,Mobile,Android,Web
9,172.30.0.158,54.239.154.112,1.11.98.63,2018-07-27 15:15:19,https://m.search.naver.com/search.naver?query=...,https://www.shopintake.com/,Mozilla/5.0 (Linux; Android 8.0; SM-G955N Buil...,Mobile,Android,Web


In [18]:
# One Social_analysis object per argument string, created in the same order.
yj1, yj2 = (Social_analysis(term) for term in ('인테이크', '랩노쉬'))

In [19]:
# Each object is populated through its own loader method.
# NOTE(review): presumably these fill .df (read by later cells) from a
# database — confirm against the Social_analysis definition.
yj1.yj1_DB_to_table()
yj2.yj2_DB_to_table()

In [5]:
# Three separate Social_analysis objects per keyword (insta / shop / blog),
# constructed in the same order as before.
itkinsta, itkshop, itkblog = (Social_analysis('인테이크') for _ in range(3))
lnsinsta, lnsshop, lnsblog = (Social_analysis('랩노쉬') for _ in range(3))
pckinsta, pckshop, pckblog = (Social_analysis('피코크') for _ in range(3))
kgcinsta, kgcshop, kgcblog = (Social_analysis('정관장') for _ in range(3))


In [6]:
# Load a table into every analysis object, brand by brand.
# The first argument (DBname) is 'intake' for every DB loader; only the second
# (keyword) varies per brand.
# NOTE(review): presumably one shared database filtered by keyword — confirm.
itkinsta.insta_DB_to_table(DBname = 'intake', keyword = 'intake')
itkshop.shop_DB_to_table('intake', 'intake')
itkblog.blog_DB_to_table('intake', 'intake')
lnsinsta.insta_DB_to_table(DBname = 'intake', keyword = 'labnosh')
lnsshop.shop_DB_to_table('intake', 'labnosh')
lnsblog.blog_DB_to_table('intake', 'labnosh')
# pckinsta and kgcinsta are loaded from pickle files rather than the insta DB loader.
pckinsta.pickle_to_table('Data/pck_list.txt')
pckshop.shop_DB_to_table('intake', 'pck')
pckblog.blog_DB_to_table('intake', 'pck')
kgcinsta.pickle_to_table('Data/kgc_list.txt')
kgcshop.shop_DB_to_table('intake', 'kgc')
kgcblog.blog_DB_to_table('intake', 'kgc')


In [7]:
# Notebook display of itkinsta.df as loaded.
itkinsta.df

Unnamed: 0,user_id,created_at,main_text,hashtags,comments,likes,current_url
0,invincible_skhyun,2018-01-24 11:11:08,invincible_skhyun#식단일기_180124\n.\n아침:인테이크 소이밀크...,식단일기_180124/인테이크/인테이크소이밀크/아임웰/아임웰굿밸런스라이스/아임닭/아...,,11,https://www.instagram.com/p/BeVH5d3luGZ/?hl=ko...
1,jiyu_lee,2018-01-24 08:37:57,jiyu_lee-\n요즘 먹고 있는 유산균.\n-\n인테이크에서 판매하는걸 보고 주...,인테이크/intake/휴먼바이오틱스/유산균/포장이예쁘니/효과도좋을거같고,,19,https://www.instagram.com/p/BeU2Xe0BWpD/?hl=ko...
2,dionycchus,2018-01-24 07:43:53,dionycchus#인테이크\n올해엔 건강해져볼까 합니다ㅎㅎ\n이러고 또 술처먹겠지...,인테이크,,11,https://www.instagram.com/p/BeUwLe9lTrg/?hl=ko...
3,invincible_skhyun,2018-01-23 12:55:54,invincible_skhyun#식단일기_180123\n\n다시 마음잡고 시작해보는...,식단일기_180123/인테이크/인테이크소이밀크/아임웰/아임웰라이트밀/아임닭/아임닭스...,,,https://www.instagram.com/p/BeSvF-eF6wk/?hl=ko...
4,diet.oneul,2018-01-23 12:09:54,diet.oneul#오늘의식단\n#저녁\n인테이크 모닝죽 단팥.\n냉동야채+닭쌤닭가...,오늘의식단/저녁/식단/식단일기/인테이크/모닝죽,,15,https://www.instagram.com/p/BeSp1AIl7tT/?hl=ko...
5,styleshare_beauty,2018-01-23 08:13:52,styleshare_beauty[#뷰티실험실 : 다이어트 핫템 4🔥]\n굶는 다이어...,뷰티실험실/인테이크/스타일쉐어/스쉐스토어/다이어트/대용식/식단조절,,,https://www.instagram.com/p/BeSOvOigMHJ/?hl=ko...
6,ddong940423,2018-01-23 07:59:04,ddong940423#intake #인테이크 #밀스3 #이벤트\n1.23 무료체험 ...,intake/인테이크/밀스3/이벤트/stopsilver425/hello.oioi/d...,,19,https://www.instagram.com/p/BeSNHz-jPT5/?hl=ko...
7,ssujjeong_,2018-01-23 07:32:40,ssujjeong_군것질을 절대 하지않기위해 다이어트쿠키를 샀당\n다욧하는사람들 많...,몸스터즈/다이어트쿠키/다이어트시작한닷/운동하는여자/운동하는남자/운동/헬스타그램/비키...,ssujjeong_.\n.\n.\n#운동하는여자#운동하는남자#운동#헬스타그램#비키니...,132,https://www.instagram.com/p/BeSKGh5jRet/?hl=ko...
8,dameulstudio,2018-01-23 05:54:14,dameulstudio모닝죽 우유🥛\n_\n_\nClient: 인테이크\nFoods...,foodstylesun/wonkyup/다믈스튜디오/푸드스튜디오/송파/인테이크/모닝죽...,rimrim0408행복한 하루 되세요!/baggieatsim so hungry ri...,71,https://www.instagram.com/p/BeR-1iJHwmH/?hl=ko...
9,catharen_,2018-01-23 00:22:17,catharen_:\n생명 유지 수단이라고나 할까\n\n#일상 #작업일기 #인테이크...,일상/작업일기/인테이크/두유/모닝스타그램/dailylife/dailywork/soy...,harumulgorae잘보고 가용!/tangbong_91잘보고 가요 소통해요 😎,48,https://www.instagram.com/p/BeRY2S6FDRW/?hl=ko...


In [46]:
# Words appended to del_list, written once per brand and then applied to that
# brand's insta / shop / blog objects in the original order.
# NOTE(review): del_list is presumably the list of tokens removed during
# extraction — confirm in Social_analysis.
itk_del_words = ['독립운동', '현정','인테이크','밀스','모닝죽', '파워젤부스트', '식사대용', '밀스라이트', '고구마죽', '모닝귀리', '모닝죽단호박', '슈퍼바', '아미노리커버', '모닝죽꿀고구마', '밀스소이', '향신료', '모닝그래놀라', '휴먼바이오틱스A1', '밀스하프','단호박죽','검은콩','아미노리커버리', '홍삼젤리스틱','칼로리컷','인테이크모닝죽','견과류바']
lns_del_words = ['랩노쉬', '푸드쉐이크', '미식당', '우바', '쇼콜라', '스타터키트', '올데이키트', '그래놀라', '랩노쉬플랫','랩노쉬모닝죽', '올데이워트','뮤즐리','자색고구마','미숫가루','푸드바','플랫바','그레인']
pck_del_words = ['노브랜드', '라자냐', '티라미수', '하노이', '빈대떡', '서주현','서현','계피','차돌박이','삼계탕','닭꼬치','볶음밥','피코크','초마','호떡','포레스티','마몰','녹두','피코','피콕','성서','깍지','파베']
kgc_del_words = ['에브리타임','정관장','홍삼정','정해인','정관','장홍','조정석','디페','전광렬','홍삼스틱','한국인삼공사','홍이장군','박은빈','정관장홍삼','홍삼','홍삼정진','한포','삼정','기스트','서준희','송중기']

for analysis, del_words in (
    (itkinsta, itk_del_words), (itkshop, itk_del_words), (itkblog, itk_del_words),
    (lnsinsta, lns_del_words), (lnsshop, lns_del_words), (lnsblog, lns_del_words),
    (pckinsta, pck_del_words), (pckshop, pck_del_words), (pckblog, pck_del_words),
    (kgcinsta, kgc_del_words), (kgcshop, kgc_del_words), (kgcblog, kgc_del_words),
):
    # extend() copies the elements, so sharing one source list per brand is safe.
    analysis.del_list.extend(del_words)




###lnsinsta.del_list.extend([''])###

In [8]:
# Run word_check over three columns of itkinsta.df in turn, feeding each result
# into the next call (hashtags, then user_id, then main_text).
# NOTE(review): with the final flag False this presumably drops rows matching
# any listed word — confirm against word_check.
for check_words, column in (
    (['자동차', '흡기', '배기','도어락'], 'hashtags'),
    (['intakefoods', 'dameulstudio', '_.ddo2', '__scarlett.k', '0.8l_korea', 'jiseung86', 'untactmarket'], 'user_id'),
    (['자동차', '흡기', '배기','도어락'], 'main_text'),
):
    itkinsta.df = itkinsta.word_check('or', check_words, itkinsta.df, column, False)

lnsinsta.df = lnsinsta.word_check('or', ['atemshop.official'], lnsinsta.df, 'user_id', False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [46]:
# Call load_dictionary() on every analysis object, in the original order.
for analysis in (
    itkinsta, itkshop, itkblog,
    lnsinsta, lnsshop, lnsblog,
    pckinsta, pckshop, pckblog,
    kgcinsta, kgcshop, kgcblog,
):
    analysis.load_dictionary()

# NOTE(review): `conver` looks like a leftover fragment; as a bare name it
# raises NameError when run unless it is defined elsewhere.
conver

# Select rows whose main_text contains all of '맛', '모닝죽' and '있', then summarise.
# NOTE(review): the boolean mask is built from df_new but applied to
# df_new_intake — confirm both frames share the same index.
# NOTE(review): the trailing [''] selects a column whose name is the empty
# string; presumably the intended column name was left out.
temp = df_new_intake.loc[df_new['main_text'].str.contains('맛') & df_new['main_text'].str.contains('모닝죽') & df_new['main_text'].str.contains('있')]['']
temp.describe()

In [20]:
# Run morph_pos over each object's main_text column.
# Only the second call's return value is displayed, because the notebook shows
# the value of the cell's last expression.
yj1.morph_pos(yj1.df['main_text'])
yj2.morph_pos(yj2.df['main_text'])

[['이거_NP',
  '먹_VV',
  '음_ETN',
  '밥_NNG',
  '안_MAG',
  '먹_VV',
  '어도_EC',
  '되_VV',
  '는_ETM',
  '거_NNB',
  '그냥_MAG',
  '간식_NNG',
  '인가_VCP+EF',
  '랩노쉬_NNG',
  'ᆢ제가_UNKNOWN',
  '한_MM',
  '번_NNBC',
  '먹_VV',
  '어_EC',
  '보_VX',
  '겠_EP',
  '습니다_EF',
  '올해_NNG',
  '키_NNG',
  '로_JKB',
  '감량_NNG',
  '합니다_VV+EF',
  'ᆢ제발여_UNKNOWN',
  '고마_NNP',
  '웡_NNP',
  '감동_NNG',
  '목표_NNG',
  '토익_NNG',
  '강사_NNG',
  '미토_NNP',
  '미친_VV+ETM',
  '토익_NNG',
  '강사_NNG',
  '인강_NNP',
  '토익_NNG',
  '식단_NNG',
  '다이어트_NNG',
  '다_MAG',
  '요트_NNG',
  '선물_NNG',
  '올리브영_NNG',
  '영어_NNG',
  '강사_NNG',
  '토익_NNG',
  '커_VA+EC',
  '체중_NNG',
  '감량_NNG',
  '소원_NNG',
  '키로_XSV+ETN+JKB',
  '돌려_VV+EC',
  '놔_VX+EC',
  '내_NP+JKG',
  '몸_NNG',
  'ㅋㅋ_IC'],
 ['랩노쉬_NNG',
  '미래형_NNG',
  '식단_NNG',
  '이_VCP',
  '라고_EC',
  '하_VV',
  '는_ETM',
  '데_NNB',
  '여_VV',
  '튼_ETM',
  '지효_NNG',
  '가_JKS',
  '크리스마스_NNP',
  '선물_NNG',
  '로_JKB',
  '줘서_VV+EC',
  '오늘_NNG',
  '부터_JX',
  '내_NP+JKG',
  '저녁_NNG',
  '식사_NNG',
  '가_JKS',
  '될_VV+ETM',
  '예정_N

In [9]:
# Run morph_pos over main_text for every analysis object, in the original order.
for analysis in (
    itkinsta, itkshop, itkblog,
    lnsinsta, lnsshop, lnsblog,
    pckinsta, pckshop, pckblog,
    kgcinsta, kgcshop,
):
    analysis.morph_pos(analysis.df['main_text'])

# Kept as the cell's final expression so the notebook still displays this result.
kgcblog.morph_pos(kgcblog.df['main_text'])


[['월드_NNG',
  '디제이_NNG',
  '페스티벌_NNG',
  '후기_NNG',
  '월_NNG',
  '디페_NNG',
  '다녀왔_VV+EP',
  '어요_EF',
  '정관장_NNG',
  '에브리타임_NNG',
  '먹_VV',
  '고_EC',
  '버_NNP',
  '닝_NNP',
  '내_NP+JKG',
  '생애_NNG',
  '첫_MM',
  '월드_NNG',
  '디제이_NNG',
  '페스티벌_NNG',
  '다른_MM',
  '축제_NNG',
  '는_JX',
  '많이_MAG',
  '가_VV+EC',
  '봤_VX+EP',
  '는데_EC',
  '월_NNG',
  '디페_NNG',
  '랑_JKB',
  '은_JX',
  '이상_NNG',
  '하_XSV',
  '게_EC',
  '인연_NNG',
  '이_JKS',
  '없_VA',
  '더라고요_EC',
  'ㅠㅠ_UNKNOWN',
  '올해_NNG',
  '는_JX',
  '드디어_MAG',
  '다녀왔_VV+EP',
  '어요_EF',
  '저_NP',
  '는_JX',
  '일요_NNG',
  '일_NR',
  '원_NNBC',
  '데이_NNG',
  '권_XSN',
  '으로_JKB',
  '다녀왔_VV+EP',
  '는데_EC',
  '토요일_NNG',
  '은_JX',
  '많이_MAG',
  '더웠_VA+EP',
  '다고_EC',
  '하_VV',
  '더라고요_EC',
  '일요일_NNG',
  '선택_NNG',
  '한_XSA+ETM',
  '거_NNB',
  '아주_MAG',
  '칭찬_NNG',
  '해_XSV+EC',
  '잠실_NNP',
  '종합_NNG',
  '운동장_NNG',
  '서울특별시_NNP',
  '송파구_NNP',
  '올림픽로_NNP',
  '서울_NNP',
  '종합_NNG',
  '운동장_NNG',
  '지도_NNG',
  '보_VV',
  '기_ETN',
  '우리_NP',
  '집_NNG',
  '에서_JKB',
  '

In [9]:
import pickle

In [12]:
# Gather the DataFrame of every analysis object (same order as before) and
# pickle the resulting list into 'instances.txt'.
ins = [
    analysis.df
    for analysis in (
        itkinsta, itkshop, itkblog,
        lnsinsta, lnsshop, lnsblog,
        pckinsta, pckshop, pckblog,
        kgcinsta, kgcshop, kgcblog,
    )
]
with open('instances.txt', 'wb') as f:
    pickle.dump(ins, f)

In [21]:
# Run pos_extractor on each object's morph_list column with the same options.
for analysis in (yj1, yj2):
    analysis.pos_extractor(analysis.df['morph_list'], 'list', 'theme')

In [10]:
# Run pos_extractor on every object's morph_list column, in the original order.
# NOTE(review): presumably this adds the nav_list / noun_list / adj_list /
# verb_list columns visible in the next itkinsta.df display — confirm.
for analysis in (
    itkinsta, itkshop, itkblog,
    lnsinsta, lnsshop, lnsblog,
    pckinsta, pckshop, pckblog,
    kgcinsta, kgcshop, kgcblog,
):
    analysis.pos_extractor(analysis.df['morph_list'], 'list', 'theme')

In [11]:
# Notebook display of itkinsta.df after the processing cells above have run.
itkinsta.df

Unnamed: 0,user_id,created_at,main_text,hashtags,comments,likes,current_url,flags,morph_list,nav_list,noun_list,adj_list,verb_list
0,invincible_skhyun,2018-01-24 11:11:08,invincible_skhyun#식단일기_180124\n.\n아침:인테이크 소이밀크...,식단일기_180124/인테이크/인테이크소이밀크/아임웰/아임웰굿밸런스라이스/아임닭/아...,,11,https://www.instagram.com/p/BeVH5d3luGZ/?hl=ko...,0,"[식단_NNG, 일기_NNG, 아침_NNG, 인테이크_NNG, 소이_NNG, 밀크_...","[식단, 일기, 아침, 인테이크, 소이, 밀크, 블랙, 점심, 아임, 굿, 밸런스,...","[식단, 일기, 아침, 인테이크, 소이, 밀크, 블랙, 점심, 아임, 굿, 밸런스,...",[],[]
1,jiyu_lee,2018-01-24 08:37:57,jiyu_lee-\n요즘 먹고 있는 유산균.\n-\n인테이크에서 판매하는걸 보고 주...,인테이크/intake/휴먼바이오틱스/유산균/포장이예쁘니/효과도좋을거같고,,19,https://www.instagram.com/p/BeU2Xe0BWpD/?hl=ko...,0,"[요즘_NNG, 먹_VV, 고_EC, 있_VX, 는_ETM, 유산균_NNG, 인테이...","[요즘, 유산균, 인테이크, 판매, 주문, 인테이크, 밀스, 괜찮, 보통, 유산균,...","[요즘, 유산균, 인테이크, 판매, 주문, 인테이크, 밀스, 보통, 유산균, 알약,...","[흔들, 마시, 따라서, 흔들, 마시, 흔들, 마셔야, 바뀌, 모르]","[괜찮, 귀찮, 괜찮, 예쁘]"
2,dionycchus,2018-01-24 07:43:53,dionycchus#인테이크\n올해엔 건강해져볼까 합니다ㅎㅎ\n이러고 또 술처먹겠지...,인테이크,,11,https://www.instagram.com/p/BeUwLe9lTrg/?hl=ko...,0,"[인테이크_NNG, 올해_NNG, 엔_JKB+JX, 건강_NNG, 해져_XSA+EC...","[인테이크, 올해, 건강, 이러, 처먹]","[인테이크, 올해, 건강]","[이러, 처먹]",[]
3,invincible_skhyun,2018-01-23 12:55:54,invincible_skhyun#식단일기_180123\n\n다시 마음잡고 시작해보는...,식단일기_180123/인테이크/인테이크소이밀크/아임웰/아임웰라이트밀/아임닭/아임닭스...,,,https://www.instagram.com/p/BeSvF-eF6wk/?hl=ko...,0,"[식단_NNG, 일기_NNG, 다시_MAG, 마음잡_VV, 고_EC, 시작_NNG,...","[식단, 일기, 마음잡, 시작, 식단, 일기, 아침, 인테이크, 두유, 점심, 아임...","[식단, 일기, 시작, 식단, 일기, 아침, 인테이크, 두유, 점심, 아임, 라이트...",[마음잡],[재미있]
4,diet.oneul,2018-01-23 12:09:54,diet.oneul#오늘의식단\n#저녁\n인테이크 모닝죽 단팥.\n냉동야채+닭쌤닭가...,오늘의식단/저녁/식단/식단일기/인테이크/모닝죽,,15,https://www.instagram.com/p/BeSp1AIl7tT/?hl=ko...,0,"[오늘_NNG, 의_JKG, 식단_NNG, 저녁_NNG, 인테이크_NNG, 모닝죽단...","[오늘, 식단, 저녁, 인테이크, 모닝죽, 냉동야채, 가슴살, 소세지, 식단, 식단...","[오늘, 식단, 저녁, 인테이크, 모닝죽, 냉동야채, 가슴살, 소세지, 식단, 식단...",[],[]
5,styleshare_beauty,2018-01-23 08:13:52,styleshare_beauty[#뷰티실험실 : 다이어트 핫템 4🔥]\n굶는 다이어...,뷰티실험실/인테이크/스타일쉐어/스쉐스토어/다이어트/대용식/식단조절,,,https://www.instagram.com/p/BeSOvOigMHJ/?hl=ko...,0,"[뷰티_NNG, 실험실_NNG, 다이어트_NNG, 핫_NNG, 템_NNG, 굶_VV...","[뷰티, 실험실, 다이어트, 다이어트, 건강, 다이어트, 도와, 인테이크, 가지, ...","[뷰티, 실험실, 다이어트, 다이어트, 건강, 다이어트, 인테이크, 가지, 이젠, ...",[도와],[]
6,ddong940423,2018-01-23 07:59:04,ddong940423#intake #인테이크 #밀스3 #이벤트\n1.23 무료체험 ...,intake/인테이크/밀스3/이벤트/stopsilver425/hello.oioi/d...,,19,https://www.instagram.com/p/BeSNHz-jPT5/?hl=ko...,0,"[인테이크_NNG, 밀스_NNP, 이벤트_NNG, 무료_NNG, 체험_NNG, 이벤...","[인테이크, 밀스, 이벤트, 무료, 체험, 이벤트, 아침, 필요, 없]","[인테이크, 밀스, 이벤트, 무료, 체험, 이벤트, 아침, 필요]",[],[없]
7,ssujjeong_,2018-01-23 07:32:40,ssujjeong_군것질을 절대 하지않기위해 다이어트쿠키를 샀당\n다욧하는사람들 많...,몸스터즈/다이어트쿠키/다이어트시작한닷/운동하는여자/운동하는남자/운동/헬스타그램/비키...,ssujjeong_.\n.\n.\n#운동하는여자#운동하는남자#운동#헬스타그램#비키니...,132,https://www.instagram.com/p/BeSKGh5jRet/?hl=ko...,0,"[군것질_NNG, 을_JKO, 절대_MAG, 하_VV, 지_EC, 않_VX, 기_E...","[군것질, 위해, 다이어트, 쿠키, 사람, 박스, 리아, 챙겨, 보내, 마시, 몸,...","[군것질, 다이어트, 쿠키, 사람, 박스, 리아, 마시, 몸, 스터, 다이어트, 쿠...","[위해, 챙겨, 보내]",[]
9,catharen_,2018-01-23 00:22:17,catharen_:\n생명 유지 수단이라고나 할까\n\n#일상 #작업일기 #인테이크...,일상/작업일기/인테이크/두유/모닝스타그램/dailylife/dailywork/soy...,harumulgorae잘보고 가용!/tangbong_91잘보고 가요 소통해요 😎,48,https://www.instagram.com/p/BeRY2S6FDRW/?hl=ko...,0,"[생명_NNG, 유지_NNG, 수단_NNG, 이_VCP, 라고_EC, 나_JX, 할...","[생명, 유지, 수단, 할까, 일상, 작업, 일기, 인테이크, 두유, 모닝, 스타그램]","[생명, 유지, 수단, 일상, 작업, 일기, 인테이크, 두유, 모닝, 스타그램]",[할까],[]
10,lora.study,2018-07-03 23:57:47,lora.study#아침식사#죽#모닝죽#단호박#인테이크\n.\n몇년째 먹는 모닝죽\...,아침식사/죽/모닝죽/단호박/인테이크/맛스타그램/lfl/foodporn/먹스타그램/식...,lora.study#맛스타그램#lfl#foodporn#먹스타그램#식사대용#다이어트#...,163,https://www.instagram.com/p/Bkye0hsFHJB/?hl=ko...,0,"[아침_NNG, 식사_NNG, 죽_NNG, 모닝죽단호박_NNG, 인테이크_NNG, ...","[아침, 식사, 모닝죽, 인테이크, 모닝죽, 아침, 바쁘, 입맛, 없, 아침, 맛,...","[아침, 식사, 모닝죽, 인테이크, 모닝죽, 아침, 입맛, 아침, 맛, 종류, 모닝...","[고르, 나온]","[바쁘, 없, 있, 맛있, 맛있]"


In [25]:
# Print the raw main_text of the row labelled 4.
print(itkinsta.df.loc[4,'main_text'])

diet.oneul#오늘의식단
#저녁
인테이크 모닝죽 단팥.
냉동야채+닭쌤닭가슴살소세지.
#식단 #식단일기 #인테이크 #모닝죽


In [26]:
# Print the morph_list of the same row (label 4) for comparison with its main_text.
print(itkinsta.df.loc[4,'morph_list'])

['오늘_NNG', '의_JKG', '식단_NNG', '저녁_NNG', '인테이크_NNG', '모닝죽단팥_NNG', '냉동야채_NNG', '닭_NNG', '쌤_NNG', '닭_NNG', '가슴살_NNG', '소세지_NNG', '식단_NNG', '식단_NNG', '일기_NNG', '인테이크_NNG', '모닝죽_NNG']


In [32]:
# Store on each object the result of merge_list applied to its nav_list column.
for analysis in (yj1, yj2):
    analysis.nav_merged = analysis.merge_list(analysis.df['nav_list'])

In [33]:
# Notebook display of the merged list stored on yj2.
yj2.nav_merged

['밥',
 '간식',
 '랩노쉬',
 '올해',
 '감량',
 '합니다',
 '고마',
 '감동',
 '목표',
 '토익',
 '강사',
 '미토',
 '미친',
 '토익',
 '강사',
 '인강',
 '토익',
 '식단',
 '다이어트',
 '요트',
 '선물',
 '올리브영',
 '영어',
 '강사',
 '토익',
 '체중',
 '감량',
 '소원',
 '돌려',
 '몸',
 '랩노쉬',
 '미래형',
 '식단',
 '지효',
 '크리스마스',
 '선물',
 '줘서',
 '오늘',
 '저녁',
 '식사',
 '예정',
 '스타트',
 '베이직',
 '생각',
 '밀크티',
 '맛',
 '지효',
 '생유',
 '먹방',
 '죄책감',
 '먹스타그램',
 '선물',
 '선물',
 '스타그램',
 '크리스마스',
 '선물',
 '저녁',
 '다이어트',
 '랩노쉬',
 '베이직',
 '랩노쉬',
 '자색고구마',
 '랩노쉬',
 '베이직',
 '랩노쉬',
 '베이직',
 '랩노쉬',
 '베이직',
 '데이',
 '출근',
 '배고파서',
 '선물',
 '감사',
 '랩노쉬',
 '송희',
 '친구',
 '선물',
 '랩노쉬',
 '해서',
 '배불러',
 '식사대용',
 '미녀',
 '선물',
 '아침',
 '식사',
 '랩노쉬',
 '베이직',
 '엘큐',
 '가로수',
 '길점',
 '방문',
 '후기',
 '큐브',
 '가로수길',
 '오픈',
 '지나가',
 '들러서',
 '구경',
 '플라잉',
 '타이거',
 '있',
 '요즘',
 '나오',
 '없',
 '지나치',
 '아쉽',
 '회사',
 '이어폰',
 '구입',
 '블리',
 '제이',
 '나인',
 '소호',
 '들어와',
 '챔피온',
 '매장',
 '봐도',
 '추억',
 '챔피온',
 '세트',
 '양말',
 '고딩',
 '생각',
 '칙스',
 '디자이너',
 '브랜드',
 '랩노쉬',
 '들어와',
 '야근',
 '애용',
 '상품',
 '나온',
 '맛',
 '구매',
 '구매'

In [12]:
# Store on every analysis object the result of merge_list applied to its
# nav_list column, in the original order.
for analysis in (
    itkinsta, itkshop, itkblog,
    lnsinsta, lnsshop, lnsblog,
    pckinsta, pckshop, pckblog,
    kgcinsta, kgcshop, kgcblog,
):
    analysis.nav_merged = analysis.merge_list(analysis.df['nav_list'])


In [13]:
# Notebook display of the merged list stored on itkinsta.
itkinsta.nav_merged

['식단',
 '일기',
 '아침',
 '인테이크',
 '소이',
 '밀크',
 '블랙',
 '점심',
 '아임',
 '굿',
 '밸런스',
 '라이스',
 '레드',
 '갈릭',
 '가슴살',
 '아임',
 '훈제',
 '가슴살',
 '채소',
 '믹스',
 '브로콜리',
 '아임',
 '프랑크',
 '저녁',
 '곤약',
 '파스타',
 '그릴',
 '가슴살',
 '사태',
 '수육',
 '운동',
 '낸시',
 '트레이닝',
 '점심',
 '굿',
 '밸런스',
 '라이스',
 '가슴살',
 '채소',
 '저녁',
 '오뚜기',
 '프레스코',
 '스파게티',
 '소스',
 '채소',
 '믹스',
 '토마토',
 '곤약',
 '인테이크',
 '인테이크',
 '소이',
 '밀크',
 '아임',
 '아임',
 '굿',
 '밸런스',
 '라이스',
 '아임',
 '아임',
 '훈제',
 '가슴살',
 '아임',
 '소시지',
 '곤약',
 '파스타',
 '그릴',
 '가슴살',
 '요즘',
 '유산균',
 '인테이크',
 '판매',
 '주문',
 '인테이크',
 '밀스',
 '괜찮',
 '보통',
 '유산균',
 '알약',
 '개별',
 '포장',
 '가루',
 '물',
 '흔들',
 '마시',
 '타입',
 '포장',
 '마음',
 '처음',
 '물',
 '따라서',
 '흔들',
 '마시',
 '용기',
 '과정',
 '귀찮',
 '분유',
 '맛',
 '맛',
 '마지막',
 '단게',
 '설탕',
 '알갱이',
 '바닥',
 '흔들',
 '마셔야',
 '바뀌',
 '유산균',
 '덕분',
 '모르',
 '화장실',
 '달',
 '괜찮',
 '장복',
 '걸로',
 '인테이크',
 '유산균',
 '유산균',
 '포장',
 '예쁘',
 '효과',
 '인테이크',
 '올해',
 '건강',
 '이러',
 '처먹',
 '식단',
 '일기',
 '마음잡',
 '시작',
 '식단',
 '일기',
 '아침',
 '인테이크',
 '두유',
 '점심',
 '아임',
 '라

In [16]:
# Notebook display of the current_url column of itkblog.df.
itkblog.df['current_url']

0                             https://blog.naver.com/zziyom77/221318741302
1            https://blog.naver.com/ynana1?Redirect=Log&logNo=221295485296
2                           https://blog.naver.com/hansol4511/221317680824
3                            https://blog.naver.com/llhoney_s/221299054824
4                                https://blog.naver.com/mys2k/221250623311
5                             https://blog.naver.com/ckh07301/221294848204
6                             https://blog.naver.com/shop2930/221297043395
7                         https://blog.naver.com/lululunanana/221299115359
8                            https://blog.naver.com/leesu0218/221302062758
9                               https://blog.naver.com/ynana1/221310303563
10                             https://blog.naver.com/rafiuta/221315560764
11                           https://blog.naver.com/nihaoyoga/221301879515
12                            https://blog.naver.com/luvtasha/221306675274
13                       

In [15]:
# Notebook display of the current_url column of lnsblog.df.
lnsblog.df['current_url']

0                   https://blog.naver.com/li_ji_closet/221268964672
1                   https://blog.naver.com/qkrdmswl0421/221301279473
2                   https://blog.naver.com/happy_bolbol/220870990799
3                     https://blog.naver.com/toptop1235/221268201393
4                     https://blog.naver.com/winkhe0208/221279712055
5                       https://blog.naver.com/assarose/221105491707
6                     https://blog.naver.com/dreamsumin/221308110221
7                       https://blog.naver.com/sang-_-v/221265091206
8                     https://blog.naver.com/tngustpwns/221252942457
9                     https://blog.naver.com/dpcyxl1514/221046517361
10                       https://blog.naver.com/plan917/221303475794
11                    https://blog.naver.com/ohsuna7777/221265704530
12                      https://blog.naver.com/oes35oes/220953228201
13                    https://blog.naver.com/dltjwl1417/221306772906
14                         https:/

In [13]:
# Membership test: is the token '독립운동' present in itkinsta.nav_merged?
'독립운동' in itkinsta.nav_merged

In [14]:
# Membership test: is the token '모닝죽' present in itkinsta.nav_merged?
'모닝죽' in itkinsta.nav_merged

In [63]:
# Let pandas print up to 2000 characters per cell before truncating with "...".
pd.set_option('display.max_colwidth', 2000)

In [14]:
# One combined token list per brand: insta tokens, then shop, then blog.
itk_merged = [*itkinsta.nav_merged, *itkshop.nav_merged, *itkblog.nav_merged]
lns_merged = [*lnsinsta.nav_merged, *lnsshop.nav_merged, *lnsblog.nav_merged]
pck_merged = [*pckinsta.nav_merged, *pckshop.nav_merged, *pckblog.nav_merged]
kgc_merged = [*kgcinsta.nav_merged, *kgcshop.nav_merged, *kgcblog.nav_merged]

In [34]:
# Treat the two merged token lists as two documents, compute tf-idf, and print
# the first 20 (term, weight) entries of each document followed by a blank line.
tfidf = SB_Tfidf([yj1.nav_merged, yj2.nav_merged])
tfidf.get_tfidf()
tfidf_of_all = tfidf.tfidf_hangul

for term_weights in tfidf_of_all:
    leading_entries = term_weights[:20]
    pprint(leading_entries)
    print()

[('인테이크', 0.8982288008939535),
 ('홍삼', 0.22617271964955662),
 ('젤리', 0.16801402031109922),
 ('향신료', 0.12924155408546092),
 ('견과', 0.08400701015554961),
 ('홍삼젤리스틱', 0.07431389359914004),
 ('부모', 0.06462077704273046),
 ('포토', 0.06462077704273046),
 ('안주', 0.058158699338457416),
 ('샐러드', 0.054927660486320896),
 ('집들이', 0.054927660486320896),
 ('유산균', 0.05169662163418437),
 ('개선', 0.04846558278204785),
 ('메뉴', 0.04846558278204785),
 ('스틱', 0.045234543929911324),
 ('그래퍼', 0.042003505077774804),
 ('봉지', 0.042003505077774804),
 ('하루시리즈', 0.042003505077774804),
 ('브런치', 0.03554142737350176),
 ('아미노리커버리', 0.03554142737350176)]

[('책상위작은밭', 0.6455040154655234),
 ('영추', 0.18857420676520908),
 ('영행', 0.1740684985525007),
 ('미식당', 0.1668156444461465),
 ('리듬', 0.1595627903397923),
 ('챕스틱', 0.1595627903397923),
 ('카밀', 0.1595627903397923),
 ('기프트', 0.1450570821270839),
 ('플랫바', 0.13055137391437552),
 ('푸드쉐이크', 0.12329851980802133),
 ('랩노쉬플랫', 0.11604566570166713),
 ('이크', 0.11604566570166713),
 ('수강'

In [41]:
# Print the main_text of every yj1 row whose nav_list contains '홍삼',
# each followed by a separator line, then print how many rows matched.
y = 0
for idx in yj1.df.index:
    if '홍삼' not in yj1.df.loc[idx, 'nav_list']:
        continue
    print(yj1.df.loc[idx,'main_text'])
    print('-----------')
    y += 1
print(y)

happysijoo-
야근 하고 집에 오니 선물이 🎀🎀
무슨 맛이 젤 궁금하세요?
저는 꿀홍삼이 젤 맛있을 것 같아요!
종류별로 챙겨갈께요 우리 나눠먹어요🙈❤️
-
파워젤 부스트(🍊,🍇)
운동 전 웜업을 도와주고, 운동 중 에너지와 활력을 주는 제품이에요! 저도 10k 이상 대회 나갈 땐 꼭 1개씩 먹어요:)
풀 마라톤 뛸 땐 4개 먹었어요 😁
-
아미노리커버(🍋,🐝)
운동으로 소모 해버린 에너지들을 보충해주고
몸에 쉬게끔 도와줘요
-
얼른 토요일이 왔으면 좋겠어요!!🙈❤️
벌써 반 이상 신청 하셨던데!!
아직 마감 전이니 많이 많이 신청해주세요
-
협찬해 주신 인테이크 정말 정말 감사드립니다🙏🏻
-
#런자매#자매런#런스타그램#인스타그램#인테이크#프립#파워젤부스트#아미노리커버#셀스타그램#얼스타그램#일상#셀피#운동녀#운동하는여자#운동#다요트#러닝#러너#협찬#running#powergel#diet#runner#l4l#selfie
-----------
momsterz_kr12월16일까지 진행하는
#신타를샀는데홍삼이왔어요 이벤트♥️
#신타6엣지 (사이즈무관) 구매하시는 분들께 #인테이크 #홍삼젤리스틱 30개입을 선물합니다!
보충제로 보충하고 홍삼으로 튼튼해집시다👍🏻
.
문의는 디엠이나 카톡 옐로우아이디 “몸스터즈”
www.momsterz.kr
.
.
#신타 #신타6 #신타6에지 #홍삼젤리스틱 #힘내 #홍삼젤리 #젤리스틱 #몸스터즈 #몸매스타그램 #다이어트 #보충제 #헬스보충제 #단백질보충제 #프로틴 #건강 #홍삼 #솟아라힘
-----------
k09184131#INTAKE#인테이크#모닝죽#뜻바께선물#휴먼바이오틱스#이그므지?#근대좋다.

겁나추운오늘.
코도얼고 귀도얼고 밤12시에 집에 쥐새퀴맹키로
뛰어들어와 오빠야! 한마뒤햇는데 "또 뭐삿노!" "산거없는디 이그므지?@-@;; 얼마전 영감맥이려고산 모닝죽 (검은콩내가묵고있음)
죽판매업체에서 이벤트를 하는구만 3마농치사묜 랜덤선물줌🎁

뜻바께 선물이지만 기분조쿠룡😉

휴먼바이오틱스라는디 인체유래 유산균이라함
집에오니 랜

In [52]:
# Treat the itk and lns merged token lists as two documents, compute tf-idf, and
# print the first 20 (term, weight) entries of each followed by a blank line.
tfidf = SB_Tfidf([itk_merged, lns_merged])
tfidf.get_tfidf()
tfidf_of_all = tfidf.tfidf_hangul

for term_weights in tfidf_of_all:
    leading_entries = term_weights[:20]
    pprint(leading_entries)
    print()

[('프립', 0.22406554605876966),
 ('컴포트', 0.2210376332741917),
 ('슈퍼스무디', 0.21800972048961373),
 ('카페다이어트', 0.19681433099756795),
 ('조미료', 0.19075850542841202),
 ('홍삼구미', 0.1816747670746781),
 ('아해', 0.16653520315178827),
 ('귀리죽', 0.15139563922889843),
 ('모닝바게트', 0.15139563922889843),
 ('시스', 0.14836772644432045),
 ('소백산', 0.1453398136597425),
 ('정모', 0.1453398136597425),
 ('아카시아', 0.1332281625214306),
 ('스크루', 0.13020024973685265),
 ('산행', 0.12717233695227467),
 ('퍼드', 0.12414442416769671),
 ('홍경', 0.12414442416769671),
 ('지미', 0.10900486024480686),
 ('품질', 0.10900486024480686),
 ('시스선', 0.10294903467565093)]

[('오늘의건강', 0.41618124468282736),
 ('위크', 0.31643532653570344),
 ('럭키', 0.2957982400225054),
 ('모링', 0.2545240669961093),
 ('쉬허', 0.18573377861878246),
 ('자정', 0.1547781488489854),
 ('리듬', 0.10318543256599025),
 ('라임', 0.09974591814712391),
 ('엔조', 0.09974591814712391),
 ('미슐랭', 0.09630640372825756),
 ('영행', 0.09630640372825756),
 ('이그니스', 0.08942737489052488),
 ('향수', 0.08942737489

In [16]:
# Notebook display: number of rows in lnsblog.df.
len(lnsblog.df)

388

In [17]:
# Notebook display: number of rows in itkblog.df.
len(itkblog.df)

473

In [72]:
# Print the current_url of every itkinsta row whose nav_list contains '가족',
# each followed by a separator line, then print how many rows matched.
y = 0
for idx in itkinsta.df.index:
    if '가족' not in itkinsta.df.loc[idx, 'nav_list']:
        continue
    print(itkinsta.df.loc[idx,'current_url'])
    print('-----------')
    y += 1
print(y)

https://www.instagram.com/p/BkeWwHenajC/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BjmyK70Ae6L/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BjNaDQUAP2o/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BidvzualY1c/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BiBHJuMF6H_/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/Bg8ms4UgbD9/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BfX62ftHdT0/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/Bej1spCh5Fx/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BePt-i3lPJ5/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BdhwE_AnmFJ/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%

In [15]:
# Print the current_url (plus a separator line) of every itkinsta row whose
# nav_list contains both '선물' and '모닝죽'.
for idx in itkinsta.df.index:
    nav_tokens = itkinsta.df.loc[idx, 'nav_list']
    if '선물' in nav_tokens and '모닝죽' in nav_tokens:
        print(itkinsta.df.loc[idx,'current_url'])
        print('-----------')

https://www.instagram.com/p/BkPiLiDhrtJ/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BjrpFrTlES6/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/Bg6Z79DlJRr/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/Bgkjjuhnhe-/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/Bcrj6asFS_P/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BboMtE1nT_l/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BZ2vCn9DOLH/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BZdBNkmHHMJ/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BYKvDTPjWQX/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%EC%9D%B4%ED%81%AC
-----------
https://www.instagram.com/p/BXN0cz3FmR3/?hl=ko&tagged=%EC%9D%B8%ED%85%8C%

In [97]:
# Fresh, independent empty lists on itkshop: one list for nav_list entries and
# one for main_text entries ("...main") per product group.
itkshop.meals, itkshop.mealsmain = [], []
itkshop.morning, itkshop.morningmain = [], []
itkshop.doctorNuts, itkshop.doctorNutsmain = [], []

In [98]:
# Sort every itkshop row into a product group by its productCode.
# Each entry: (code range, list receiving nav_list, list receiving main_text).
# The first range containing the code wins; codes outside every range are ignored.
product_buckets = (
    (range(11200, 11307), itkshop.meals, itkshop.mealsmain),
    (range(12100,12116), itkshop.morning, itkshop.morningmain),
    (range(14101,14103), itkshop.doctorNuts, itkshop.doctorNutsmain),
)
for idx in itkshop.df.index:
    for code_range, nav_bucket, main_bucket in product_buckets:
        if itkshop.df.loc[idx, 'productCode'] in code_range:
            nav_bucket.append(itkshop.df.loc[idx,'nav_list'])
            main_bucket.append(itkshop.df.loc[idx,'main_text'])
            break

In [99]:
# Notebook display of the main_text entries collected in itkshop.mealsmain.
itkshop.mealsmain

['주변에서 자주 이야기 들어서 한번 사봤는데 좋네요',
 '항상 즐겨구매하는 상품입니다.',
 '뚜껑이 손힘으로 절대 안열려요. 매번 커터칼로열어요.',
 '씨앗이 좀 거슬렷지만 맛 좋아요',
 '양이 다른 것보다 조금 적은 것 같고 알갱이같은게 있는데 저는 원래 맛이 나은것 같아요',
 '고소 맛있음 긋긋긋',
 '이런 대체식품류 중에서 제일 맛있어요',
 '고소하고 오독오독 씹이는게 맛있어요^^',
 '너무 맛있어서 또 주문했습니다.',
 '밀스 2.0이 좋았었는데 이제 단종인가여.',
 '기대를안하고 주문했어요..받자마자 우유에 타서마셨는데 한끼로 충분하네요..\r\n고소하고 단맛까지있어  꿀.설탕은 필요없어요\r\n단점이있다면..걸죽하게타서 떠먹는것이아니고 물.우유같은데 타서 마시려니  건더기때문에 목넘김이  불편합니다  그것외에 건강해질것같긴한데 목넘김이 불편해요 건과류(건더기)꼭꼭씹으면고소한데 \r\n전 갠적으론 다~~~가루로했음바래보내요..',
 '사진 간단하게 휴대하고 타먹을수있어서 좋아요\r\n매우편하네요 맛은 좀 적응이 안되는데\r\n보충제보단 맛있어요',
 '사진 하나씩 담아서오다니..편하다고해야하나.....',
 '간편해서 세일 할때 재주문',
 '사진 좋은데요? 간지나고?ㅋㅋ\r\n제 동생은 비록 허세템샀다고 하지만 이게 왜 허세?  ㅡ.ㅡ',
 '맛과 식감이 취향에 안 맞아 또 사먹지는 않을 것 같네요.',
 '타사 제품을 먹어보고,비슷한 제품 검색해서 주문 했는데 맛도 좋고, 칼로리도 낮아서, 다이어트나 아침 한끼로 마시기에 너무 좋네요~^^',
 '맛있어요~ 맘에들어요',
 '물에 잘 녹아서 좋아요~~',
 '굿굿. 간편하면서도 배가 불러요',
 '밀스 중에서도 맛있어여~ 들어간 내용물도 많고 좋아요',
 '양이 너무많아요.. 한번에먹기는 부담스럽습니다',
 '밀스는 맛은 괜찮은데 자꾸 패키지를 바꾸면서 가격이 올라서 별로예요',
 '좋아요!! 다먹었어요!\r\n\r\n미숫가루 같이 맛있는데....\r\n\r\n전그냥 밥을 먹으려

In [101]:
# Print every text in itkshop.morningmain that contains '생각', each followed
# by a separator line.
for x in itkshop.morningmain:
    if '생각' not in x:
        continue
    print(x)
    print('---------')

생각보다 사이즈가 작네요
---------
간편하게 먹기좋아요. 생각보다 맛이 진해요
---------
고구마 생각보다 안 달아요. 개인적으로 이런류 죽은 단게 좋아서 단팥죽>단호박>고구마 순으로 좋았어요. 목 넘김도 단팥죽이 제일 좋았어요. 제가 밥을 많이 먹는 편이라서 양은 다이어트 용 같은 느낌이었어요.
---------
맛있긴한데ㅡ생각보다 달아요 양이 적어요
---------
맛있긴한데ㅡ생각보다 달아요 양이 적어요
---------
와 정말 건강식이네요
제가 생각했던 맛은 전혀아닙니다 간편히먹을수있다는것말곤 좋은게 없네요 제가 몸에좋은건 안먹어서 그런지 렌지에 15초 돌리고 먹을수있는건 좋은데 먹을때 포장지 데워진냄새?가 나는것같네요 맛도 너무 자연적인것같아서 저는 별로입니다만 자극적인거 안좋아하시는분들은 좋아하실수도있겠어요
---------
단호박/고구마 둘다 먹어봤습니다.
전 단호박도 좋았지만 고구마가 생각보다 맛있네요, 담엔 좀 더 많이 시켜야 할것 같아요
---------
고구마는 생각보다 별로라고 하네요... 딸아이가 먹고싶다고 구입했는데 더이상 안먹겠다고 해서 나머지는 제가 먹어야 할드슈ㅠㅠ 개인적인 기호도 차이가 있을것 같은 제품이에요...
---------
생각한 맛이 아니에요 저는 단호박이 더 맛있더라구요
---------
생각보다 맛이 아주 좋아요^^
---------
정말 누룽지 같아요. 달지 않아서 좋고요, 생각보다 든든합니다.
---------
사진 죽이 너무 짜요. 세상에 공복에 그 죽 먹고, 점심 전에 물을 1리터를 마셨어요. 
호박100프로라고해서 건강식인줄알았는데 정백당,정제염,덱스트린에
분유까지ㅜㅜ 이건 그냥 편의점에서 파는 캔 호박죽수준입니다. 혹시나 저처럼 건강식생각하고 간편죽 주문하시는거면 비추입니다.비싸서 좋은건가했는데 유통기한이 1년이나. ㅜㅜ 이건 그냥
레또르트식품인거같아요.
---------
정말 누룽지 같아요. 달지 않아서 좋고요, 생각보다 든든합니다.
---------
아주 잘 샀습니다. 확실히 

In [55]:
# Apply itkshop.merge_list to each of the three product-group lists, in order.
meals_merged, morning_merged, doctorNuts_merged = (
    itkshop.merge_list(bucket)
    for bucket in (itkshop.meals, itkshop.morning, itkshop.doctorNuts)
)

In [59]:
# Notebook display of the merged list built from itkshop.meals.
meals_merged

['주변',
 '이야기',
 '즐겨',
 '구매',
 '상품',
 '뚜껑',
 '손힘',
 '열려요',
 '커터칼',
 '씨앗',
 '맛',
 '다른',
 '알갱이',
 '있',
 '맛',
 '맛있',
 '체식',
 '품류',
 '맛있',
 '맛있',
 '맛있',
 '주문',
 '단종',
 '기대',
 '주문',
 '우유',
 '마셨',
 '한끼',
 '단맛',
 '있',
 '설탕',
 '필요',
 '없',
 '단점',
 '있',
 '떠먹',
 '물',
 '우유',
 '마시',
 '건더기',
 '때문',
 '넘김',
 '불편',
 '건강',
 '넘김',
 '불편',
 '건과',
 '건더기',
 '가루',
 '바래',
 '보내',
 '사진',
 '휴대',
 '있',
 '편하',
 '맛',
 '적응',
 '보충제',
 '맛있',
 '사진',
 '편하',
 '해야',
 '세일',
 '주문',
 '사진',
 '간지',
 '동생',
 '허세',
 '허세',
 '맛',
 '식감',
 '취향',
 '타사',
 '제품',
 '제품',
 '검색',
 '맛',
 '칼로리',
 '다이어트',
 '아침',
 '한끼',
 '마시',
 '맛있',
 '물',
 '굿',
 '굿',
 '불러요',
 '맛있',
 '들어간',
 '내용물',
 '부담',
 '맛',
 '괜찮',
 '패키지',
 '바꾸',
 '가격',
 '올라서',
 '별로',
 '미숫가루',
 '맛있',
 '밥',
 '이러',
 '생각',
 '아침대용',
 '예정',
 '맛있',
 '기존',
 '나와서',
 '맛있',
 '아침',
 '생각',
 '맛',
 '없',
 '맛있',
 '배송',
 '빠르',
 '상품',
 '빠른',
 '배송',
 '감사',
 '코코넛',
 '씹히',
 '맛',
 '달',
 '봉지',
 '적힌',
 '비율',
 '우유',
 '봉지',
 '정도',
 '맛있',
 '기존',
 '나와서',
 '맛있',
 '아침',
 '건강',
 '맛',
 '다니',
 '편해요',
 '속',
 '아침',
 '식사',
 '해결'

In [56]:
# Treat the three shop product-group token lists as three documents, compute
# tf-idf, and print the first 40 (term, weight) entries of each followed by a
# blank line.
tfidf = SB_Tfidf([meals_merged, morning_merged, doctorNuts_merged])
tfidf.get_tfidf()
tfidf_of_all = tfidf.tfidf_hangul

for term_weights in tfidf_of_all:
    leading_entries = term_weights[:40]
    pprint(leading_entries)
    print()

[('코코넛', 0.4953933559163108),
 ('보틀', 0.380185598726471),
 ('식사', 0.3444100182259241),
 ('핑크', 0.2764986172556153),
 ('하프', 0.19585318722272752),
 ('프레시모닝', 0.14977008434679162),
 ('소이', 0.11480333940864136),
 ('저녁', 0.11480333940864136),
 ('마시', 0.11055136387498797),
 ('라이트', 0.1062993883413346),
 ('오리지널', 0.1062993883413346),
 ('속', 0.09354346174037442),
 ('차지', 0.09216620575187176),
 ('초코', 0.09216620575187176),
 ('대용', 0.08929148620672105),
 ('귀찮', 0.08503951067306767),
 ('점심', 0.08503951067306767),
 ('선식', 0.08078753513941428),
 ('파우치', 0.08078753513941428),
 ('딸기맛', 0.0806454300328878),
 ('영양소', 0.0806454300328878),
 ('말차', 0.06912465431390383),
 ('카카오', 0.06912465431390383),
 ('땅콩', 0.06803160853845414),
 ('그린', 0.06377963300480075),
 ('운동', 0.06377963300480075),
 ('배부르', 0.05952765747114737),
 ('녹차', 0.057603878594919854),
 ('딸기', 0.057603878594919854),
 ('카페라떼', 0.057603878594919854),
 ('텀블러', 0.057603878594919854),
 ('가볍', 0.0510237064038406),
 ('다이어트식', 0.0510237064038406),


In [58]:
# Fresh, independent empty lists on itkinsta, one per product group.
itkinsta.meals, itkinsta.morning, itkinsta.doctorNuts = [], [], []

In [59]:
# Sort every itkinsta row into a product group by which keyword its nav_list
# contains. The checks are ordered, so a row matching several keywords only
# lands in the first matching group; rows matching none are ignored.
# NOTE(review): '밀스' and '모닝죽' are also added to itkinsta.del_list in an
# earlier cell, and the recorded tf-idf output for the first two groups is
# empty — presumably these tokens are already stripped from nav_list, so the
# first two checks never match. Confirm and consider matching on another column.
for idx in itkinsta.df.index:
    nav_tokens = itkinsta.df.loc[idx, 'nav_list']
    if '밀스' in nav_tokens:
        itkinsta.meals.append(nav_tokens)
    elif '모닝죽' in nav_tokens:
        itkinsta.morning.append(nav_tokens)
    elif '견과류' in nav_tokens:
        itkinsta.doctorNuts.append(nav_tokens)

In [60]:
# Apply itkinsta.merge_list to each of the three product-group lists, in order.
insta_meals_merged, insta_morning_merged, insta_doctorNuts_merged = (
    itkinsta.merge_list(bucket)
    for bucket in (itkinsta.meals, itkinsta.morning, itkinsta.doctorNuts)
)

In [61]:
# Treat the three insta product-group token lists as three documents, compute
# tf-idf, and print the first 40 (term, weight) entries of each followed by a
# blank line.
tfidf = SB_Tfidf([insta_meals_merged, insta_morning_merged, insta_doctorNuts_merged])
tfidf.get_tfidf()
tfidf_of_all = tfidf.tfidf_hangul

for term_weights in tfidf_of_all:
    leading_entries = term_weights[:40]
    pprint(leading_entries)
    print()

[]

[]

[('견과류', 0.5530525411473949),
 ('다이어트', 0.43867780096246245),
 ('간식', 0.23743616949783444),
 ('맛', 0.20413719450728451),
 ('아침', 0.19110716081533016),
 ('선물', 0.18676381625134542),
 ('식단', 0.18531603473001715),
 ('건강', 0.1549126227821237),
 ('맛있', 0.15056927821813892),
 ('소통', 0.1303003369195433),
 ('그램', 0.12595699235555852),
 ('스타그램', 0.11582252170626071),
 ('식사', 0.11003139562094769),
 ('저녁', 0.11003139562094769),
 ('다이어터', 0.10713583257829115),
 ('점심', 0.09410579888633683),
 ('식단인증', 0.0912102358436803),
 ('오늘', 0.0912102358436803),
 ('운동', 0.08976245432235205),
 ('있', 0.08976245432235205),
 ('일상', 0.0810757651943825),
 ('일기', 0.06515016845977165),
 ('요거트', 0.06080682389578688),
 ('우유', 0.05791126085313036),
 ('견과', 0.05501569781047384),
 ('푸드', 0.05501569781047384),
 ('아몬드', 0.053567916289145576),
 ('먹스타그램', 0.05067235324648906),
 ('없', 0.05067235324648906),
 ('하루', 0.04922457172516081),
 ('고구마', 0.04777679020383254),
 ('영양', 0.04488122716117603),
 ('추석', 0.044881227161176

## csv에 안쓰고 이런식으로도 추가 가능. 하지만 csv에 자동저장되진 않는다.

단어_태그_T/F(받침여부)_원단어의발음

## m이 None인 경우(매칭 실패)에는 진짜 맛있다는 것.
## -> '없' 또는 '않'이라는 음절이 '있'이라는 음절 뒤로 최대 5글자를 사이에 두고 나타나지 않는다는 뜻이다.
## m이 None이 아닌 경우(매칭 성공)에는 애매한 상황이다. '없' 또는 '않'이라는 음절이 '있' 뒤로 최대 5글자를 사이에 두고 존재한다는 것이다.

In [20]:
import re

# Heuristic for "tasty" vs "not tasty": '맛', then '있' within five characters,
# then a negation syllable ('없' or '않') within five more characters.
# A match means the sentence negates the taste; no match (None) means it does not.
# Compiled once so both example sentences share the same pattern object.
NEGATED_TASTE = re.compile(r'맛.{0,5}있.{0,5}[없않]')

for sentence in (
    '안녕하세요. 맛이 있진 않다.. 다만 아쉬운 것은 포장지가 없다.',
    '안녕하세요. 맛이 있다. 다만 아쉬운 것은 포장지가 없다.',
):
    m = NEGATED_TASTE.search(sentence)
    print(m)
    # Identity comparison is the correct test for None (previously `m == None`).
    print('맛있다는 뜻' if m is None else '맛 없다는 뜻\n원문: ' + m.group())


<_sre.SRE_Match object; span=(7, 14), match='맛이 있진 않'>
맛 없다는 뜻
원문: 맛이 있진 않
None
맛있다는 뜻
