# 프랜차이즈 카페별 대표 음료 찾기

---

In [1]:
# Data processing
import re
import sys
import csv
import nltk
import numpy as np
import pandas as pd
from collections import Counter
from pypapago import Translator
# Shared translator instance; a later cell calls translator.translate(..., source='ko', target='en').
translator = Translator()

In [2]:
# Data crawling
import time
import datetime
import urllib.parse
from bs4 import BeautifulSoup
from urllib.request import urlopen
from tqdm.notebook import tqdm
import GetOldTweets3 as got

In [3]:
# Data analysis
import sentifish
from sentifish import sentTokenizer, wordTokenizer, Sentiment
from afinn import Afinn
# Shared AFINN scorer for English text; afinn.score(...) is used by the sentiment cells below.
afinn = Afinn(language='en')

In [4]:
# Graphs and visualization
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interactive
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
%matplotlib inline

# Look up the family name of the GmarketSansMedium.otf font file (path is relative
# to the working directory) and make it matplotlib's default font at size 20.
# NOTE(review): presumably chosen so Korean labels render in the charts - confirm.
font = font_manager.FontProperties(fname = "GmarketSansMedium.otf").get_name()
rc('font', family=font, size=20)

---

In [5]:
def readDrink(name):
    """Build the per-drink mention table for one cafe and return its top 12 rows.

    Reads ``./common/<name>.csv`` (CP949, no header row: drink name, mention
    count), merges alternative spellings of the same drink, and writes the full
    ranked table to ``./common/<name>_df.csv``.

    Args:
        name: Cafe name; used for both file names and the ``카페명`` column.

    Returns:
        The first 12 rows of the table, sorted by mention count (descending),
        with columns 카페명, 음료명, 언급빈도, 언급률.
    """
    # Alternative spellings / nicknames mapped onto one canonical drink name.
    aliases = {"핫초콜릿":"핫초코","캐모마일":"카모마일","모히토라떼":"모히또라떼","레몬차":"레몬티","카라멜마키아또":"카라멜마끼아또","아포가토":"아포가또","자허블" : "자몽허니블랙티", "카페아메리카노":"아메리카노", "앗메리카노":"아메리카노","빽스라떼":"빽's라떼"}

    with open("./common/"+name+'.csv', 'r',encoding='CP949') as handle:
        table = pd.read_csv(handle, names=["음료명","언급빈도"])

    # Drink names may still carry newline characters from the crawl step.
    table["음료명"] = table["음료명"].str.replace(pat=r'\n', repl=r'', regex=True)
    total_mentions = table['언급빈도'].sum()
    table["언급률"] = pd.Series(table['언급빈도'] / total_mentions)

    table = table.replace(aliases)

    # Rows that now share a name are summed, so both the count and the rate add up.
    table = (
        table.groupby(by=table["음료명"])
        .sum()
        .sort_values(by=["언급빈도"], ascending=[False])
        .reset_index()
    )
    table["카페명"] = name
    table = table[["카페명","음료명","언급빈도","언급률"]]

    table.to_csv("./common/"+name+"_df"+".csv",encoding='CP949',header=True,index=False)

    return table.head(12)

In [6]:
def makePlot(name):
    """Draw a pie chart of mention-rate shares for a cafe's top drinks.

    The rows returned by ``readDrink(name)`` are plotted together with one
    extra "기타" (other) wedge that holds whatever share is left over.

    Args:
        name: Cafe name, passed to ``readDrink`` and shown in the chart title.

    Returns:
        The result of ``plt.show()`` (``None``); the chart is rendered as a
        side effect.
    """
    df = readDrink(name)

    # Share not covered by the listed drinks. Floating-point rounding can push
    # this a hair below zero when the listed drinks already add up to 1, and a
    # negative wedge size makes the pie call fail, so clamp at zero.
    other_rate = max(0.0, 1 - df["언급률"].sum())
    other_row = pd.DataFrame([["기타", other_rate * 100, other_rate]],
                             columns=["음료명","언급빈도","언급률"])
    shares = pd.concat([df, other_row], ignore_index=True)

    plt.figure(figsize= (15,15))
    # A list, not a set: set iteration order is arbitrary, which made the wedge
    # colours change from run to run.
    colors = ["#F3FFE3","#D9E3DA","#C2C2B4","#D7ECD9","#BFD1DF",
              "#CED2C2", "#D1CFC0","#FFFFD1"]
    # rotatelabels is an on/off flag, not an angle.
    shares["언급률"].plot.pie(autopct='%1.2f%%', labels=shares["음료명"], fontsize=13,
                           colors=colors, pctdistance=1.05, labeldistance=1.15,
                           rotatelabels=True)

    plt.title(name+" 음료 언급률", position=(0.1, 1.0), fontsize=25)
    plt.axis('off')

    return plt.show()

In [7]:
# Pick the cafe whose "cafe + drink name" post counts will be looked up

cafe_options = [' ','스타벅스','앤제리너스','파스쿠찌','이디야','탐앤탐스','투썸플레이스','할리스','커피빈','빽다방']
cafedropdown = widgets.Dropdown(
    description = '카페명 : ',
    options = cafe_options,
)
display(cafedropdown)

Dropdown(description='카페명 : ', options=(' ', '스타벅스', '앤제리너스', '파스쿠찌', '이디야', '탐앤탐스', '투썸플레이스', '할리스', '커피빈', '…

In [None]:
# Find each cafe's signature drinks

cafe = str(cafedropdown.value)


def _parse_post_count(tag):
    """Extract the total hit count from the search page's ``title_num`` span.

    The span text looks like ``1-10 / 1,234건``; the number after the slash is
    the total. Taking the part after the last ``/`` also covers result ranges
    other than ``1-10`` (fewer than ten hits).
    """
    if tag is None:
        # NOTE(review): assumes a missing span means "no results" - confirm
        # against the live page before relying on the zeros.
        return 0
    visible = re.sub('<.+?>', '', str(tag))
    digits = re.sub(r'[^0-9]', '', visible.split('/')[-1])
    return int(digits) if digits else 0


# Read the menu first so a missing menu file does not leave an empty CSV behind.
with open("./menu/"+cafe+'.txt', 'r',encoding='utf-8') as menu_file:
    # Strip the trailing newline so it does not leak into the query or the CSV.
    menu_names = [line.strip() for line in menu_file]
menu_names = [menu_name for menu_name in menu_names if menu_name]

with open('./common/'+cafe+'.csv', 'w', encoding='CP949',newline='') as file:
    data = csv.writer(file)

    for menu in tqdm(menu_names):
        # The oquery/tqi parameters are kept exactly as in the original request.
        url= "https://search.naver.com/search.naver?sm=tab_hty.top&where=post&query="+ urllib.parse.quote(cafe)+"+"+ urllib.parse.quote(menu)+"&oquery=%27ascii%27+codec+can%27t+encode+characters+in+position+46-49%3A+ordinal+not+in+range%28128%29&tqi=UWXugwprvhGssltLPCsssssssWw-264951"
        with urlopen(url) as response:
            page = response.read()
        soup = BeautifulSoup(page, "html.parser")

        data.writerow([menu, _parse_post_count(soup.find('span','title_num'))])

outputdrink=widgets.Output()
ouputpie=widgets.Output()

with outputdrink:
    display(readDrink(cafe))
with ouputpie:
    # makePlot renders the chart itself and returns None, so it is not wrapped
    # in display() (that would print a stray "None").
    makePlot(cafe)

widgetchange=widgets.Tab([outputdrink, ouputpie])
widgetchange.set_title(0, cafe+' 대표 음료')
widgetchange.set_title(1, '그래프')

display(widgetchange)


In [8]:
# Merge every cafe's ranked drink table into one CSV.
# pd.read_csv is given the path directly so pandas opens and closes each file
# itself; the previous open(...) handles were never closed.
cafe_names = ['스타벅스', '이디야', '빽다방', '탐앤탐스', '커피빈',
              '투썸플레이스', '할리스', '앤제리너스', '파스쿠찌']
frames = [
    pd.read_csv('./common/' + cafe_name + '_df.csv', encoding='CP949')
    for cafe_name in cafe_names
]

df = pd.concat(frames, ignore_index=True)
df.to_csv('./common/sum_df.csv', encoding='CP949')

In [9]:
def showbarh(dataframe):
    """Draw a horizontal bar chart of one drink's mention rate per cafe.

    Args:
        dataframe: Rows for a single drink with 카페명, 음료명 and 언급률
            columns. The chart title uses the first row's 음료명.

    Returns:
        None. The chart is rendered with ``plt.show()``; nothing is drawn when
        ``dataframe`` has no rows (e.g. the blank dropdown entry).
    """
    if dataframe.empty:
        # Nothing to plot, and there is no first row to take the title from.
        return None

    df = pd.DataFrame(dataframe, columns=['카페명', '언급률'])
    yticks = df["카페명"]
    index = np.arange(len(yticks))

    # The cafe names are shown as y tick labels below; the former
    # label=<Series> argument was not a usable legend label and is dropped.
    df.plot.barh(figsize=(10, 6), color='#FFABAB', width=0.3)
    # .iloc[0] takes the first row by position, whatever the index labels are.
    plt.title(dataframe["음료명"].iloc[0]+", 카페별 언급 비율을 알아보자!", position=(0.5, 1.1))
    plt.gca().invert_yaxis()

    plt.xlabel("언급비율")
    plt.ylabel("카페명")
    plt.yticks(index, yticks)

    plt.box(False)
    plt.show()

In [10]:
# Find the leading cafes for each drink

# Let pandas open and close the file itself instead of managing a handle by hand.
sumDf = pd.read_csv('./common/sum_df.csv', encoding='CP949')

drink = sumDf['음료명'].unique().tolist()
item = [' ']+sorted(drink)

dropdown= widgets.Dropdown(options= item,description = '음료명 : ' )
outputDf= widgets.Output()
outputPlot=widgets.Output()

def dropdown_control(change):
    """Refresh the table and chart tabs for the newly selected drink.

    Args:
        change: ipywidgets change event; ``change.new`` is the selected drink.
    """
    outputDf.clear_output()
    outputPlot.clear_output()

    # Filter and sort once, then reuse the result for both tabs.
    selected = (
        sumDf.loc[sumDf['음료명']==change.new]
        .sort_values(by=["언급률"],ascending=[False])
        .reset_index()
    )

    with outputDf:
        display(selected)

    with outputPlot:
        # showbarh renders the chart itself and returns None, so it is not
        # passed to display(). The blank entry matches no rows: skip the plot.
        if not selected.empty:
            showbarh(selected)


dropdown.observe(dropdown_control, names='value')
display(dropdown)

widgetchanges=widgets.Tab([outputDf, outputPlot])
widgetchanges.set_title(0, ' 맛있는 카페')
widgetchanges.set_title(1, '그래프')

display(widgetchanges)

Dropdown(description='음료명 : ', options=(' ', '1837블랙티', '감귤당근주스', '겐마이차그린', '고구마라떼', '고구마스무디', '고흥유자차', '골드키위주…

Tab(children=(Output(), Output()), _titles={'0': ' 맛있는 카페', '1': '그래프'})

In [11]:
# Choose the cafe to crawl on Naver & Twitter

cafe_options = [' ','스타벅스','앤제리너스','파스쿠찌','이디야','탐앤탐스','투썸플레이스','할리스','커피빈','빽다방']
dropdown1 = widgets.Dropdown(
    description = "크롤링 : ",
    options = cafe_options,
)
display(dropdown1)

Dropdown(description='크롤링 : ', options=(' ', '스타벅스', '앤제리너스', '파스쿠찌', '이디야', '탐앤탐스', '투썸플레이스', '할리스', '커피빈', '…

In [None]:
# Twitter crawling (open source)

tweetList = []
day = []

# Crawl window: first day inclusive, end day exclusive.
window_start = datetime.datetime.strptime("2020-05-06", "%Y-%m-%d")
window_end = datetime.datetime.strptime("2020-06-07", "%Y-%m-%d")
window_length = (window_end - window_start).days

dates = [window_start + datetime.timedelta(days=offset)
         for offset in range(0, window_length)]

for date in tqdm(dates):
    day.append(date.strftime("%Y-%m-%d"))

# Search expression per cafe, including common nicknames / alternative spellings.
dict3={"스타벅스":"스타벅스 OR 스벅", "이디야": "이디야", "투썸플레이스":"투썸", "파스쿠찌": "파스쿠찌 OR 파스쿠치","앤제리너스":"앤제리너스 OR 엔젤리너스", 
       "할리스": "할리스", "탐앤탐스":"탐앤탐스 OR 탐탐", "커피빈": "커피빈", "빽다방":"빽다방"} 

cafedicnum=str(dropdown1.value)
search = str(dict3[cafedicnum])

# The "until" bound is the day after the last collected date.
last_collected = datetime.datetime.strptime(day[-1], "%Y-%m-%d")
until_day = (last_collected + datetime.timedelta(days=1)).strftime("%Y-%m-%d")

runTweet = got.manager.TweetCriteria().setQuerySearch(search)
runTweet = runTweet.setSince(day[0])
runTweet = runTweet.setUntil(until_day)
runTweet = runTweet.setMaxTweets(-1)

tweet = got.manager.TweetManager.getTweets(runTweet)
print("크롤링한 트위터 갯수: " , len(tweet))

In [None]:
# Collect the tweet texts. The list is rebuilt (not appended to) so that
# re-running this cell does not write every tweet twice.
tweetList = [status.text for status in tqdm(tweet)]

# The with-block closes the file even if a write fails part-way through.
with open("./twitter/"+cafedicnum+'_twitter.txt','w',encoding='UTF-8') as tweetfile:
    for contents in tweetList:
        # Keep only Hangul syllables and digits; every other run of characters
        # (spaces included) collapses to a single space, so no further
        # double-space clean-up is needed.
        text = re.sub('[^가-힣0-9]+',' ',"{}".format(contents)).strip()
        tweetfile.write(text)
        tweetfile.write("\n")

In [None]:
# Crawl Naver blog post titles

# Use the dropdown value for the file name as well as the query, so this cell
# does not depend on a variable set by the Twitter cell; the translation cell
# reads the file under the same dropdown-based name.
cafe_name = str(dropdown1.value)
cafe = urllib.parse.quote(cafe_name)

# The oquery/tqi parameters are kept exactly as in the original request.
findurl = "https://search.naver.com/search.naver?sm=tab_hty.top&where=post&query="+cafe+"+"+"&oquery=%27ascii%27+codec+can%27t+encode+characters+in+position+46-49%3A+ordinal+not+in+range%28128%29&tqi=UWXugwprvhGssltLPCsssssssWw-264951"

with open("./blog/"+cafe_name+'_blog.txt','w',encoding='UTF-8') as blogfile:
    for i in tqdm(range (1,500)):
        url= findurl +"&start="+ str(i)
        with urlopen(url) as response:
            page = response.read()
        soup = BeautifulSoup(page, "html.parser")

        title = soup.find('a','sh_blog_title')
        if title is None:
            # The old `page != None` test was always true, so the loop could
            # never stop early and crashed here instead.
            # NOTE(review): assumes a page without a title link means the
            # results are exhausted - confirm against the live page.
            break

        text = str(title.contents)
        text = re.sub('<.+?>','',text).strip()
        # Keep only Hangul syllables and digits; everything else becomes one space.
        text = re.sub('[^가-힣0-9]+',' ',text).strip()
        blogfile.write(text)
        blogfile.write("\n")

In [None]:
# Translate the Naver + Twitter contents into English (open source)

temp = []

# Blog titles first, then tweets; each line loses its newline character.
with open('./blog/'+str(dropdown1.value)+'_blog.txt', 'r', encoding='UTF-8') as blog:
    temp.extend(row.replace("\n",'') for row in blog)

with open('./twitter/'+str(dropdown1.value)+'_twitter.txt', 'r', encoding='UTF-8') as twitter:
    temp.extend(row.replace("\n",'') for row in twitter)

# One translated sentence per output line, in the same order as the input.
with open('./translated/'+str(dropdown1.value)+ '_translated.txt', 'w',  encoding='UTF-8') as file:
    for sentence in tqdm(temp):
        result = translator.translate(sentence, source='ko', target='en', verbose=False)
        file.write(result)
        file.write('\n')

In [12]:
# Choose the cafe to run sentiment analysis on

cafe_options = [' ','스타벅스','앤제리너스','파스쿠찌','이디야','탐앤탐스','투썸플레이스','할리스','커피빈','빽다방']
cafedropdown = widgets.Dropdown(
    description = '분석 카페명 : ',
    options = cafe_options,
)
display(cafedropdown)

Dropdown(description='분석 카페명 : ', options=(' ', '스타벅스', '앤제리너스', '파스쿠찌', '이디야', '탐앤탐스', '투썸플레이스', '할리스', '커피빈'…

In [None]:
# Sentiment analysis (open source)


def _sentiment_label(score):
    """Map an AFINN score to the label stored in the sentiment CSV."""
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"


# Open the input first so a missing translation file does not leave an empty
# sentiment file behind; both files are closed even if scoring fails.
with open("./translated/"+str(cafedropdown.value)+ '_translated.txt', 'r',encoding='UTF-8') as f:
    with open('./sentiment/'+str(cafedropdown.value)+'_sentiment.csv','w',encoding='UTF8',newline='') as wf:
        # csv.writer quotes fields when needed, so a sentence containing a
        # quote character can no longer corrupt the row layout.
        writer = csv.writer(wf, lineterminator='\n')

        for raw_line in f:
            # Commas are still removed so the stored text matches earlier runs.
            line = re.sub(',', '', raw_line.rstrip()).strip()
            score = afinn.score(line)
            # Rows are header-less: sentence, label, score. The sentence and
            # word tokenizer calls were dropped; their results were never used.
            writer.writerow([line, _sentiment_label(score), score])

In [13]:
# The sentiment CSV is written without a header row, so read it with
# header=None; header=0 silently consumed the first sentence as column names.
sentiment = pd.read_csv('./sentiment/'+str(cafedropdown.value)+'_sentiment.csv',
                        header=None, names=['의견','감성분석','점수'])
sentiment = sentiment.sort_values(by="점수", ascending=True)
sentiment

Unnamed: 0,의견,감성분석,점수
852,I'm so upset and there's a lot of people study...,Negative,-12.0
1456,Keithmi Hiroyin Make Smooth Liquid Eyeliner 04...,Negative,-12.0
1378,I don't know what to eat and I'm going to go c...,Negative,-8.0
1245,I'm reminded of Harliss in Stuck Pre-Qancy. Sh...,Negative,-7.0
569,Don't come to the cafe if you can't put syrup ...,Negative,-7.0
...,...,...,...
1168,My friend asked me why I haven't been dating t...,Positive,13.0
1178,There's a lot of people. Wow I don't think we'...,Positive,14.0
1317,Today 9 500 won for lunch 8 won for Halis and ...,Positive,16.0
1323,The combination of melon and coconut is delici...,Positive,16.0


In [14]:
# Merge every cafe's sentiment file into one frame used as the "average" baseline.
# pd.read_csv is given the path directly so pandas opens and closes each file
# itself; the previous open(...) handles were never closed.
sentiment_cafes = ['스타벅스', '파스쿠찌', '탐앤탐스', '빽다방', '할리스',
                   '투썸플레이스', '이디야', '앤제리너스', '커피빈']
sentiment_frames = [
    pd.read_csv('./sentiment/' + cafe_name + '_sentiment.csv', encoding='utf-8', header=None)
    for cafe_name in sentiment_cafes
]

sentdf = pd.concat(sentiment_frames)
sentdf.columns=['의견','감성분석','점수']
sentdf.to_csv('./sentiment/sum_df.csv', encoding='utf-8')


In [15]:
def sentimentPlot(name):
    """Draw a pie chart of the sentiment label distribution.

    The counts come from the module-level ``sentiment`` DataFrame (its
    ``감성분석`` column).

    Args:
        name: Cafe name. Not used by the function body; kept so existing
            callers keep working.

    Returns:
        The result of ``plt.show()`` (``None``); the chart is rendered as a
        side effect.
    """
    counts = sentiment['감성분석'].value_counts()
    colors = ["#B99AFF","#D8C7FF","#E8DEFF"]
    # Create the figure BEFORE plotting. Doing it afterwards left the pie on a
    # default-sized figure and opened a second, empty 10x10 figure.
    plt.figure(figsize= (10,10))
    counts.plot(kind='pie',autopct='%1.1f%%', fontsize=17,colors=colors)

    return plt.show()

In [16]:
def negativeDF(name):
    """Count negative adjectives in the negatively scored sentences.

    Args:
        name: ``'sentdf'`` to use the combined all-cafe frame ``sentdf``; any
            other value uses the currently loaded ``sentiment`` frame.

    Returns:
        DataFrame with columns 단어 (word), 빈도 (count) and 비율 (share of
        all counted words), most frequent first. The frame is empty, but still
        has these columns, when no word qualifies.
    """
    source = sentdf if name == 'sentdf' else sentiment
    negative = source.loc[source["감성분석"]=='Negative', :]
    negative = negative.sort_values(by="점수", ascending=True)

    temp = [str(opinion) for opinion in tqdm(negative["의견"])]

    # The tokenizer receives the string form of the whole list, as before.
    words = wordTokenizer(str(temp))
    words = [word for word in words if len(word)>1]
    tagged = nltk.pos_tag(words)
    # Keep words tagged 'JJ' (adjective), lower-cased...
    words = [word.lower() for word, pos in tagged if pos == 'JJ']
    # ...and of those only the ones AFINN scores as negative.
    word_list = [word for word in words if afinn.score(word) < 0]

    common = Counter(word_list).most_common()
    # Name the columns at construction time: assigning df.columns afterwards
    # raised when `common` was empty (a frame with zero columns).
    df = pd.DataFrame(common, columns=["단어","빈도"])
    df['비율']=df["빈도"]/df["빈도"].sum()
    return df

def negativeBar(name):
    """Plot the cafe's top negative words against the all-cafe average.

    Args:
        name: Cafe name; used as the legend label and in the title.

    Returns:
        The result of ``plt.show()`` (``None``); the chart is rendered as a
        side effect.
    """
    # Top 15 negative words for this cafe.
    cafe_words = negativeDF(name).head(15)
    cafe_words = cafe_words.loc[:,["단어","비율"]].rename(columns={"비율":name})

    # Share of the same words across all cafes.
    overall = negativeDF('sentdf')
    overall = overall.loc[:,["단어","비율"]].rename(columns={"비율":"평균"})

    # The inner merge keeps only the cafe's words, so no isin() pre-filter is needed.
    merged = pd.merge(cafe_words, overall, on="단어")

    # Colours as a list (a set has no defined order, so the two series could
    # swap colours between runs); figure size passed per plot instead of being
    # written into the global rcParams.
    merged.plot.bar(x="단어", rot=0, color=["#C5CAE9","#D7ECD9"], fontsize=11,
                    figsize=(12, 12))
    plt.title(name + " 감성 분석 : 부정적 단어 비율", fontsize=20)
    return plt.show()


def positiveDF(name):
    """Count positive adjectives in the positively scored sentences.

    Args:
        name: ``'sentdf'`` to use the combined all-cafe frame ``sentdf``; any
            other value uses the currently loaded ``sentiment`` frame.

    Returns:
        DataFrame with columns 단어 (word), 빈도 (count) and 비율 (share of
        all counted words), most frequent first. The frame is empty, but still
        has these columns, when no word qualifies.
    """
    source = sentdf if name == 'sentdf' else sentiment
    positive = source.loc[source["감성분석"]=='Positive', :]
    positive = positive.sort_values(by="점수", ascending=False)

    temp = [str(opinion) for opinion in tqdm(positive["의견"])]

    # The tokenizer receives the string form of the whole list, as before.
    words = wordTokenizer(str(temp))
    words = [word for word in words if len(word)>1]
    tagged = nltk.pos_tag(words)
    # Keep words tagged 'JJ' (adjective), lower-cased...
    words = [word.lower() for word, pos in tagged if pos == 'JJ']
    # ...and of those only the ones AFINN scores as positive.
    word_list = [word for word in words if afinn.score(word) > 0]

    common = Counter(word_list).most_common()
    # Name the columns at construction time: assigning df.columns afterwards
    # raised when `common` was empty (a frame with zero columns).
    df = pd.DataFrame(common, columns=["단어","빈도"])
    df['비율']=df["빈도"]/df["빈도"].sum()
    return df

def positiveBar(name):
    """Plot the cafe's top positive words against the all-cafe average.

    Args:
        name: Cafe name; used as the legend label and in the title.

    Returns:
        The result of ``plt.show()`` (``None``); the chart is rendered as a
        side effect.
    """
    # Top 15 positive words for this cafe.
    cafe_words = positiveDF(name).head(15)
    cafe_words = cafe_words.loc[:,["단어","비율"]].rename(columns={"비율":name})

    # Share of the same words across all cafes.
    overall = positiveDF('sentdf')
    overall = overall.loc[:,["단어","비율"]].rename(columns={"비율":"평균"})

    # The inner merge keeps only the cafe's words, so no isin() pre-filter is needed.
    merged = pd.merge(cafe_words, overall, on="단어")

    # Colours as a list (a set has no defined order, so the two series could
    # swap colours between runs); figure size passed per plot instead of being
    # written into the global rcParams.
    merged.plot.bar(x="단어", rot=0, color=["#C5CAE9","#D7ECD9"], fontsize=11,
                    figsize=(12, 12))
    plt.title(name + " 감성 분석 : 긍정적 단어 비율", fontsize=20)
    return plt.show()

In [17]:
# Show the sentiment results for the selected cafe in a tabbed view.
cafe=str(cafedropdown.value)

outputP=widgets.Output()
outputPplot=widgets.Output()
outputN=widgets.Output()
outputNplot=widgets.Output()
outputAll=widgets.Output()

# The plotting helpers render their chart themselves and return None, so they
# are called directly; wrapping them in display() printed a stray "None".
# The two *DF helpers return DataFrames and are still displayed.
with outputAll:
    sentimentPlot(cafe)
with outputN:
    display(negativeDF(cafe))
with outputNplot:
    negativeBar(cafe)
with outputP:
    display(positiveDF(cafe))
with outputPplot:
    positiveBar(cafe)

widgetchange=widgets.Tab([outputAll,outputN, outputNplot,outputP, outputPplot])

tab_titles = [cafe+' 감성분석', cafe+' 부정적 단어', '그래프', cafe+' 긍정적 단어', '그래프']
for position, title in enumerate(tab_titles):
    widgetchange.set_title(position, title)

display(widgetchange)

Tab(children=(Output(), Output(), Output(), Output(), Output()), _titles={'0': '할리스 감성분석', '1': '할리스 부정적 단어', …