## libraries
Google drive mount

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# libraries
import requests
import os
import pickle
import re
import pandas as pd
from tqdm.notebook import tqdm
import ast
from IPython.display import clear_output
import matplotlib.pyplot as plt
import seaborn as sns

## load data

In [None]:
file_path = '/your_path'

In [1]:
# 키워드 추출된 review 데이터셋 로드
# csv로 load시
review = pd.read_csv(file_path+'/file_name.csv')  # csv로 load시
review['keywords_ug'] = review['keywords_ug'].apply(lambda x : ast.literal_eval(x))  # 리스트 형태로 변환해줘야 함
review['keywords_bg'] = review['keywords_bg'].apply(lambda x : ast.literal_eval(x))

In [None]:
# json으로 load시
review = pd.read_json(file_path+'/file_name.json', lines=True)  # 별도 변환 필요 없음

## Extract Topics by openAI API

In [None]:
!pip install openai
from openai import OpenAI
clear_output()

### test

In [None]:
# 1개 케이스 테스트 해보기
keywords = [ 'stars', 'tchopstix', 'dining', 'order', 'sushi', 'vegetables', 'server', 'sushi', 'rice', 'sauces', 'roll', 'street', 'reviews', 'sushi', 'express', 'healthy', 'tea', 'strawberry', 'vanilla']

prompt = f"""Keyword List : {set(keywords)}
        These keywords are related to the review of restaurants or food category.
        You have to work on the following requirements:
        1. Group keywords in Keyword List based on similarity. *Every keyword must belong to a topic*.
        2. Name each group and make it the upper topic.
        3. Consider fixed topics for suggested categories, like 'food', 'service', 'atmosphere'.
         - food : such as food, beverage related to menu, taste and nutritional values.
         - service : such as assistance, benefits, duties, delivery, parking, utility, appropriateness, and courtesy related to cumstomer experiment.
         - atmosphere : overall environment such as dining space, mood, interior furniture, lighting, outdoor dining, terrace etc.
         For any keywords not related to these topics, feel free to categorize them as you see fit.
        4. especially, any keywords that seem related to store names, reviews, or stars are not included in the fixed topics;
           instead, they are included in 'Others'
        5. After naming, return the text in Python dictionary format : key is topic, values are keywords
        """

api_key = "your_key" # API key 발급
client = OpenAI(api_key=api_key)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # 빠른 text 추출을 위해 3.5 turbo 사용(+비용 문제..)
    response_format={ "type": "text" },
    messages=[
        {"role": "user", "content": prompt},])

test = response.choices[0].message.content
print(test)

{
    "food": ["tea", "vegetables", "sushi", "roll", "vanilla", "rice", "sauces", "strawberry"],
    "service": ["server", "order", "express"],
    "atmosphere": ["dining", "street"],
    "others": ["stars", "tchopstix", "healthy", "reviews"]
}


### extract

In [None]:
# 결과 담을 리스트 생성
result = []

In [None]:
# 중간에 끊기면 pickle 파일 다시 load
with open(file_path+'/topics.pkl', 'rb') as file :
  result = pickle.load(file)
len(result)

In [None]:
api_key = 'your_key'

start, end = 0, len(review)  # 필요시 끊어서 설정

with tqdm(total=end-start) as pbar :
  for idx, row in review.loc[start:end, :].iterrows() :
    if idx - start < len(result) : # 이미 추출했으면 pass
      pbar.update(1)
      continue
    else:
      prompt = f"""Keyword List: {(row['keywords_bg'])}
          These keywords are related to the review of restaurants or food category.
          You have to work on the following requirements:
          1. Group keywords in Keyword List based on similarity. *Every keyword must belong to a topic*.
          2. Name each group and make it the upper topic.
          3. Consider fixed topics for suggested categories, like 'food', 'service', 'atmosphere', 'facility', 'price', and 'others'.
            - food: Keywords related to menu items to eat, taste, beverages, and nutritional values.
            - service: Keywords related to customer comport and assistance, delivery(i.e. 'deli', 'uber'), utilities, appropriateness, and courtesy in operation.
            - atmosphere: Keywords related to the overall dining environment and mood, including interior design, furniture, lighting, and outdoor spaces.
            - facility: Keywords related to the physical amenities of the restaurant, such as cleanliness, parking, restrooms and equipment.
            - price: Keywords related to the cost, pricing, value for money, affordability, and overall expense of the dining experience.
            - others: Keywords that don't fit into the above categories, especially those related to store names, reviews, or star ratings.
          4. After naming, return the text in Python dictionary format: keys are topics, values are keywords. *Set values in list type*
          5. If some topic hasn't any keyword, *Don't include that topic key in dictionary*.
          """

      client = OpenAI(api_key=api_key)

      response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_format={ "type": "text" },
        messages=[
            {"role": "user", "content": prompt},])

      temp = response.choices[0].message.content.strip()
      result.append({row['review_id']: temp})

      # 20개씩 저장
      if len(result)%20 == 0 :
        with open(file_path + '/topics.pkl', 'wb') as file :
          pickle.dump(result, file)
      pbar.update(1)
  # 끝나고 저장
  with open(file_path + '/topics.pkl', 'wb') as file :
    pickle.dump(result, file)

## Post Processing
* text형식으로 추출하다보니 형식에 맞지 않는 case들 후처리 필요해짐

In [None]:
# 필요 시 로드
with open(file_path + '/topics.pkl', 'rb') as file :
  topic = pickle.load(file)

# 런타임 살아있으면 변수할당
# topic = result

In [None]:
# 예외처리할 case들 변환해주기
def postprocessing(topic, post_topic) :
  for i in range(len(topic)) :
    key = list(topic[i].keys())[0]
    values = list(topic[i].values())[0]
    values = re.sub('\n','',values)   # 줄바꿈 기호 모두 제거
    try:
      post_topic[key] = ast.literal_eval(values)
    except Exception as e:
      try:
        # key가 문자열로 안 들어가 있어서(''로 안묶임) 에러남
        values = re.sub(r"(\w+\s*\w*):", r"'\1':", values)
        # didn't나 ''처럼 작은따옴표가 중첩되는 경우도 오류남 후...
        values = re.sub(r"(\w)\'(\w)", r"\1\2", values)
        # 이상하게 코드로 들어간 경우
        values = re.sub(r'^```python|```$', '', values)
        # 왜 대괄호로 안 묶여있니..
        if ('{' not in values) or ('}' not in values):
          values = '{'+ values +'}'
        # 콤마로 구분 안 되어 있으면...
        if '],' not in values :
          values = re.sub(r"(\])\s*(\')", r"\1,\2", values)
        post_topic[key] = ast.literal_eval(values)
      # 위 case들 해당 안 되면 error
      except Exception as e:
        post_topic[key] = {'error':e}
        # break
  return post_topic

* 1차 후처리

In [None]:
# 리뷰 id와 매핑할 딕셔너리 생성
post_topic = {}
post_topic = postprocessing(topic, post_topic)
len(post_topic)

30880

In [None]:
post_topic

{1443659: {'service': ['deli uber'], 'others': ['check person']},
 1044648: {'food': ['creme brulee',
   'hh dishes',
   'blueberry mojito',
   'asian nachos',
   'entree'],
  'facility': ['parking sux'],
  'others': ['nice feel',
   'brunch great',
   'bar area',
   'dinner good',
   'place celebrate']},
 5693786: {'others': ['logo restaurant'],
  'facility': ['open 24'],
  'food': ['menu breakfast']},
 1166589: {'service': ['servers watched',
   'group slow',
   'came group',
   'going slooooww'],
  'food': ['running food', 'food okay'],
  'others': ['felt bad']},
 6678286: {'food': ['bags chips', 'tasty good', 'philly club'],
  'service': ['deli', 'come hungry'],
  'others': ['yuge']},
 2462810: {'service': ['order waited', 'worst service'],
  'food': ['going cheddars'],
  'facility': ['cleaning 8pm']},
 2725854: {'food': ['great steaks',
   'steaks rib',
   'enjoyed fries',
   'taste beef',
   'meat prime',
   'steak house'],
  'service': ['service good', 'knows doing', 'beef servi

* 2차 후처리

In [None]:
# 위에서 error 난 case들 따로 살펴보기
error_id = []
for id, tops in post_topic.items() :
  if list(post_topic[id].keys())[0] == 'error':
    error_id.append(id)
print(len(error_id))

In [None]:
error_id

[6503328, [1723154, 4123109, 1739912, 2483163, 401869, 6268559]]

In [None]:
# 1) keyword가 topic에 배정 안 된 경우
sorted([6503328, 4123109,1739912, 2483163, 401869])

# 2) 키워드들이 모두 쪼개진 경우 : 1460566 ~ 475723
error_id[2:22]
# ===> 다시 뽑기

# 3) 앞뒤로 ```로 묶인 경우
# 4) 작은 따옴표 2개 묶인 경우('')
# ===> 정규식으로 처리

In [None]:
re_topic = []

for idx, row in review.iterrows() :
    if row['review_id'] in error_id:
      prompt = f"""Keyword List: {(row['keywords_bg'])}
          These keywords are related to the review of restaurants or food category.
          You have to work on the following requirements:
          1. Group keywords in Keyword List based on similarity. *Every keyword must belong to a topic*.
          2. Name each group and make it the upper topic.
          3. Consider fixed topics for suggested categories, like 'food', 'service', 'atmosphere', 'facility', 'price', and 'others'.
            - food: Keywords related to menu items to eat, taste, beverages, and nutritional values.
            - service: Keywords related to customer comport and assistance, delivery(i.e. 'deli', 'uber'), utilities, appropriateness, and courtesy in operation.
            - atmosphere: Keywords related to the overall dining environment and mood, including interior design, furniture, lighting, and outdoor spaces.
            - facility: Keywords related to the physical amenities of the restaurant, such as cleanliness, parking, restrooms and equipment.
            - price: Keywords related to the cost, pricing, value for money, affordability, and overall expense of the dining experience.
            - others: Keywords that don't fit into the above categories, especially those related to store names, reviews, or star ratings.
          4. After naming, return the text in Python dictionary format: keys are topics, values are keywords. *Set values in list type*
          5. If some topic hasn't any keyword, *Don't include that topic key in dictionary*.
          """
      client = OpenAI(api_key=api_key)

      response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_format={ "type": "text" },
        messages=[
            {"role": "user", "content": prompt},])

      temp = response.choices[0].message.content.strip()
      re_topic.append({row['review_id']: temp})
re_topic

[{1278962: "{\n    'others': ['review ice', 'window workers', 'extremely disappointed'],\n    'service': ['closing early'],\n    'atmosphere': ['sad'],\n    'food': ['minutes close'],\n    'others': ['hope']\n}"},
 {6112977: '{\n    "food": ["ordered calzone", "calzone pretty", "hit spot", "cheese fries", "haven\'t tried"],\n    "service": ["order greeted"],\n    "others": ["asian supermarket", "sure soon"]\n}'},
 {3394205: "{\n    'food': ['fried cod', 'food keeps'],\n    'service': ['staff friendly'],\n    'atmosphere': ['atmosphere cool'],\n    'others': ['honey pit']\n}"},
 {4681402: "{\n    'food': ['sunrise margaritas'],\n    'others': ['place chilis', 'bad experience', 'coming chilis'],\n    'facility': ['bathrooms nice']\n}"},
 {2715842: '{\n    "food": ["roast croissant", "delicious", "cbd options"],\n    "others": ["way town", "barista kind", "local definitely"]\n}'},
 {1306218: "{\n    'food': ['chargrilled oysters'],\n    'others': ['ladies fantastic', 'hour celebrate', 'co

In [None]:
# 다시 추출한 topic도 후처리
post_topic = postprocessing(re_topic, post_topic)
len(post_topic)

30880

* 3차 후처리

In [None]:
# case 1) topic에 keyword 없는 애들(빈 topic)
# case 2) 모든 문자열이 쪼개진 아이들(결과 참고)
post_topic[6112977]

{'food': ['v', 'e', 'n', 'y', 'u', 'p', 'z', 'h', 'l', 'a', 'k'],
 'atmosphere': ['r', 's'],
 'facility': [',', 'f', 'd', 'o'],
 'others': ['[', ']']}

In [None]:
# case 1) 빈 topic 제거
for key, val in post_topic.items():
  for k in list(val.keys()):
    if not val[k] or (isinstance(val[k], list) and '' in val[k]):  # 빈 값이거나 빈 문자열을 가졌을 때
      val.pop(k)

In [None]:
# case2) 후처리할 인덱스(리뷰id) 담을 리스트
filtered_indices = []

# 필터링 조건을 검사하는 함수
def is_invalid_topic(post_topic):
  for key, values in post_topic.items():
    if any(len(value.strip()) <= 1 for value in values):  # 공백이거나 길이가 1 이하인(쪼개진) 항목이 있는지 검사
      return True
    if not values:  # 빈 리스트인 경우
      return True
    if any(char in values for char in ['[', ']', "'", ",", " "]):  # 특수문자가 있는지 검사
      return True
  return False

for key, val in post_topic.items():
  if is_invalid_topic(val) :
    filtered_indices.append(key)

print("Filtered Indices:", filtered_indices)

In [None]:
re_topic = []

for idx, row in review.iterrows() :
    if row['review_id'] in filtered_indices :
      prompt = f"""Keyword List: {(row['keywords_bg'])}
          These keywords are related to the review of restaurants or food category.
          You have to work on the following requirements:
          1. Group keywords in Keyword List based on similarity. *Every keyword must belong to a topic*.
          2. Name each group and make it the upper topic.
          3. Consider fixed topics for suggested categories, like 'food', 'service', 'atmosphere', 'facility', 'price', and 'others'.
            - food: Keywords related to menu items to eat, taste, beverages, and nutritional values.
            - service: Keywords related to customer comport and assistance, delivery(i.e. 'deli', 'uber'), utilities, appropriateness, and courtesy in operation.
            - atmosphere: Keywords related to the overall dining environment and mood, including interior design, furniture, lighting, and outdoor spaces.
            - facility: Keywords related to the physical amenities of the restaurant, such as cleanliness, parking, restrooms and equipment.
            - price: Keywords related to the cost, pricing, value for money, affordability, and overall expense of the dining experience.
            - others: Keywords that don't fit into the above categories, especially those related to store names, reviews, or star ratings.
          4. After naming, return the text in Python dictionary format: keys are topics, values are keywords. *Set values in list type*
          5. If some topic hasn't any keyword, *Don't include that topic key in dictionary*.
          """
      client = OpenAI(api_key=api_key)

      response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_format={ "type": "text" },
        messages=[
            {"role": "user", "content": prompt},])

      temp = response.choices[0].message.content.strip()
      re_topic.append({row['review_id']: temp})
re_topic

In [None]:
# 다시 추출한 topic도 후처리
post_topic = postprocessing(re_topic, post_topic)
len(post_topic)

In [None]:
# pickle로 저장
with open( file_path + '/post_topic.pkl', 'wb') as file :
  pickle.dump(post_topic, file)

### Add Column

In [None]:
# review_id로 매핑하여 컬럼 추가
review['topics_raw'] = review['review_id'].map(topic)  # gpt로 뽑은 raw text
review['topics'] = review['review_id'].map(post_topic) # 정제한 topic 딕셔너리

# topic 순서 조정
ordered_key = ['food','service','atmosphere','facility','price','others']

review['topics'] = review['topics'].map(lambda d : {key: d[key] for key in ordered_key if key in d}  )
review['topics']

## with Sentiment Data
* roberta로 감성분석한 df에 문장별 keyword, topic 매핑하여 컬럼 추가

### Add Columns


In [None]:
roberta = pd.read_csv(file_path + '/file_name.csv')  # encoding = 'ISO-8859-1'
roberta

Unnamed: 0,review_id,stars,sentence,compound_score,positive_score,neutral_score,negative_score,keywords_ug,keywords_bg,topic
0,1443659,5,Found 51st Deli on Uber Eats and we loved it!!!,0.989771,0.991007,0.007758,0.001236,uber,deli uber,service
1,1443659,5,I can't wait to check it out in person!,0.985175,0.986700,0.011776,0.001525,wait,check person,others
2,1044648,5,Love love love this place for HH n dinner is g...,0.989841,0.991794,0.006252,0.001954,dinner,dinner good,others
3,1044648,5,"Its ok for brunch, not as great as what people...",-0.041061,0.229077,0.500784,0.270139,brunch,brunch great,others
4,1044648,5,The Asian Nachos are the dish to get for HH.,0.397284,0.408822,0.579640,0.011538,nachos,asian nachos,food
...,...,...,...,...,...,...,...,...,...,...
224195,5916423,1,Won't go back.,-0.121095,0.092444,0.694016,0.213540,won,won,others
224196,1847631,5,Coma is my go to coffee shop.,0.906624,0.908743,0.089138,0.002119,coffee,coma coffee,atmosphere
224197,1847631,5,Their food and beverages are consistently deli...,0.985733,0.987232,0.011269,0.001499,dairy,vegan vegetarian,food
224198,1847631,5,Online ordering is super easy or you can go in...,0.910056,0.912051,0.085953,0.001995,order,online ordering,service


In [None]:
cols = ['review_id', 'sentence', 'keywords_ug', 'keywords_bg', 'topic']
data = {col: [] for col in cols}

with tqdm(total=len(review)) as pbar :
  for idx, row in review.iterrows():
    row['keywords_ug'] = ast.literal_eval(row['keywords_ug'])
    row['keywords_bg'] = ast.literal_eval(row['keywords_bg'])
    row['sentences'] = ast.literal_eval(row['sentences'])
    for col in cols :
      if col == 'review_id':
        # int는 list로 감싸줘야 extend 가능
        # 문장 수만큼 review_id 반복 추가
        data[col].extend([row[col]]*len(row['sentences']))
      # topics는 dict 타입으로 각 keyword가 포함된 topic을 순서대로 담음
      elif col == 'topics' :
        topics = row[col] # 토픽dict
        for kw in row['keywords_bg'] :
          assigned_topic = None   # 할당된 토픽 초기화
          for key, val in topics.items():
            if kw in val :
              assigned_topic = key
              break # 찾으면 다른 토픽 찾기 멈춤
          if assigned_topic :
              data[col].append(assigned_topic)
          else: # 할당된 토픽 없으면..?
              data[col].append(None)
      else : # id, topic 제외한 컬럼들
        if isinstance(data[col], list) :
          data[col].extend(row[col])
        else :
          data[col].append(row[col])
    pbar.update(1)

kw_topic_df = pd.DataFrame(data)
kw_topic_df

  0%|          | 0/30880 [00:00<?, ?it/s]

Unnamed: 0,review_id,sentences,keywords_ug,keywords_bg,topics
0,1443659,Found 51st Deli on Uber Eats and we loved it!!!,uber,deli uber,service
1,1443659,I can't wait to check it out in person!,wait,check person,others
2,1044648,Love love love this place for HH n dinner is g...,dinner,dinner good,others
3,1044648,"Its ok for brunch, not as great as what people...",brunch,brunch great,others
4,1044648,The Asian Nachos are the dish to get for HH.,nachos,asian nachos,food
...,...,...,...,...,...
228492,5916423,Won't go back.,won,won,others
228493,1847631,Coma is my go to coffee shop.,coffee,coma coffee,atmosphere
228494,1847631,Their food and beverages are consistently deli...,dairy,vegan vegetarian,food
228495,1847631,Online ordering is super easy or you can go in...,order,online ordering,service


In [None]:
review = review[~(review['review_id']==4344482)]  # 중국어 리뷰 삭제
kw_topic_df = kw_topic_df[~(kw_topic_df['review_id']==4344482)]
kw_topic_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 228496 entries, 0 to 228496
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_id    228496 non-null  int64 
 1   sentence     228496 non-null  object
 2   keywords_ug  228496 non-null  object
 3   keywords_bg  228496 non-null  object
 4   topic        219889 non-null  object
dtypes: int64(1), object(4)
memory usage: 10.5+ MB


In [None]:
# roberta df에 합치기
kw_top_filter = kw_topic_df[['keywords_ug','keywords_bg','topic']]

roberta.reset_index(drop=True, inplace=True)
kw_top_filter.reset_index(drop=True, inplace=True)

concat_df = pd.concat([roberta, kw_top_filter], axis=1)
concat_df

Unnamed: 0,review_id,stars,sentence,compound_score,positive_score,neutral_score,negative_score,keywords_ug,keywords_bg,topic
0,1443659,5.0,Found 51st Deli on Uber Eats and we loved it!!!,0.989771,0.991007,0.007758,0.001236,uber,deli uber,service
1,1443659,5.0,I can't wait to check it out in person!,0.985175,0.986700,0.011776,0.001525,wait,check person,others
2,1044648,5.0,Love love love this place for HH n dinner is g...,0.989841,0.991794,0.006252,0.001954,dinner,dinner good,others
3,1044648,5.0,"Its ok for brunch, not as great as what people...",-0.041061,0.229077,0.500784,0.270139,brunch,brunch great,others
4,1044648,5.0,The Asian Nachos are the dish to get for HH.,0.397284,0.408822,0.579640,0.011538,nachos,asian nachos,food
...,...,...,...,...,...,...,...,...,...,...
228491,5916423,1.0,Won't go back.,-0.121095,0.092444,0.694016,0.213540,won,won,others
228492,1847631,5.0,Coma is my go to coffee shop.,0.906624,0.908743,0.089138,0.002119,coffee,coma coffee,atmosphere
228493,1847631,5.0,Their food and beverages are consistently deli...,0.985733,0.987232,0.011269,0.001499,dairy,vegan vegetarian,food
228494,1847631,5.0,Online ordering is super easy or you can go in...,0.910056,0.912051,0.085953,0.001995,order,online ordering,service


In [None]:
concat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224200 entries, 0 to 224199
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   review_id       224200 non-null  int64  
 1   stars           224200 non-null  int64  
 2   sentence        224200 non-null  object 
 3   compound_score  224194 non-null  float64
 4   positive_score  224194 non-null  float64
 5   neutral_score   224194 non-null  float64
 6   negative_score  224174 non-null  float64
 7   keywords_ug     224199 non-null  object 
 8   keywords_bg     224200 non-null  object 
 9   topic           224200 non-null  object 
dtypes: float64(4), int64(2), object(4)
memory usage: 17.1+ MB


### missing topics

In [None]:
# topic이 비어있는 경우 발견
missing = concat_df[concat_df['topic'].isna()]
missing

Unnamed: 0,review_id,stars,sentence,compound_score,positive_score,neutral_score,negative_score,keywords_ug,keywords_bg,topic
14,5693786,5.0,Unless it was three guys putting their money t...,-0.181468,0.031053,0.756426,0.212521,business,business,
151,1564953,4.0,Tough to find good salads in this part of town...,0.967932,0.970039,0.027855,0.002107,salads,salads town,
191,6408608,1.0,We will not be back.,-0.602414,0.029966,0.337654,0.632380,,,
262,6124071,5.0,Made our Friday night experience wonderful.,0.982624,0.983850,0.014924,0.001226,friday,experience wonderful,
298,4158750,5.0,I thought that being in Seminole Heights parki...,-0.323043,0.068373,0.540212,0.391416,parking,seminole heights,
...,...,...,...,...,...,...,...,...,...,...
228361,2358023,5.0,I ordered Hummus and Babaganoush.,0.132698,0.154546,0.823606,0.021848,hummus,ordered hummus,
228362,2358023,5.0,Both came with fresh warm pita wedges in ample...,0.799628,0.801389,0.196849,0.001762,wedges,pita wedges,
228367,6399853,1.0,After never notifying me before I show up to p...,-0.766449,0.009753,0.214046,0.776201,chicken,restaurant hour,
228467,2367318,3.0,Most of the people stand and drink though.,-0.189541,0.042031,0.726397,0.231572,stand,stand drink,


In [None]:
# 다시 추출
result = []
with tqdm(total=len(missing)) as pbar :
  for idx, row in missing.iterrows() :
    if not row['topic']: # topic이 비어있고
      if row['keywords_bg'] : # keyword가 있으면
        prompt = f""" Please categorize this keyword '{row['keywords_bg']}' into one of these 6 topics.
        topics : food, service, atmosphere, facility, price, others
            - food: Keywords related to menu items to eat, taste, beverages, and nutritional values.
            - service: Keywords related to customer comport and assistance, delivery(i.e. 'deli', 'uber'), utilities, appropriateness, and courtesy in operation.
            - atmosphere: Keywords related to the overall dining environment and mood, including interior design, furniture, lighting, and outdoor spaces.
            - facility: Keywords related to the physical amenities of the restaurant, such as cleanliness, parking, restrooms and equipment.
            - price: Keywords related to the cost, pricing, value for money, affordability, and overall expense of the dining experience.
            - others: Keywords that don't fit into the above categories, especially those related to store names, reviews, or star ratings.
        Just say one of them by topic name.
        """
      else: # keyword 없으면
        prompt = f"""1. Please summarize this sentence between 1-gram and 2-gram : {row['sentence']}
        2. Please categorize this summarized keyword into one of these 6 topics.
        topics : food, service, atmosphere, facility, price, others
            - food: Keywords related to menu items to eat, taste, beverages, and nutritional values.
            - service: Keywords related to customer comport and assistance, delivery(i.e. 'deli', 'uber'), utilities, appropriateness, and courtesy in operation.
            - atmosphere: Keywords related to the overall dining environment and mood, including interior design, furniture, lighting, and outdoor spaces.
            - facility: Keywords related to the physical amenities of the restaurant, such as cleanliness, parking, restrooms and equipment.
            - price: Keywords related to the cost, pricing, value for money, affordability, and overall expense of the dining experience.
            - others: Keywords that don't fit into the above categories, especially those related to store names, reviews, or star ratings.
        3. Return keyword and topic name in tuple format : (keyword, topic)
        """

    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
      model="gpt-3.5-turbo",
      response_format={ "type": "text" },
      messages=[
          {"role": "user", "content": prompt},])

    temp = response.choices[0].message.content.strip()

    result.append({idx : [row['review_id'], temp]})
    pbar.update(1)

  0%|          | 0/7545 [00:00<?, ?it/s]

In [None]:
result

[{14: [5693786, 'Others']},
 {151: [1564953, 'food']},
 {191: [6408608,
   '1. Summarized keyword: not back\n2. Category: others\n3. Tuple format: (not back, others)']},
 {262: [6124071, 'atmosphere']},
 {298: [4158750, 'others']},
 {302: [5518616, 'others']},
 {308: [4524098, 'others']},
 {338: [2385094, 'service']},
 {450: [4601575, 'food']},
 {452: [4601575, 'facility']},
 {485: [209152, 'food']},
 {522: [4143501, 'food']},
 {590: [660439, 'atmosphere']},
 {594: [660439,
   '1. Summarized keyword: "Not much"\n2. Categorized topic: Others\n3. Result: ("Not much", "others")']},
 {602: [660439, 'food']},
 {614: [3485250, 'food']},
 {615: [3485250, 'others']},
 {616: [3485250, 'food']},
 {633: [4066113,
   '1. Summarized keyword: We will be back\n2. Topic: others\n3. Tuple format: (We will be back, others)']},
 {711: [3680804,
   '1. Summarized keyword: go back\n2. Topic: others\n3. Tuple: (go back, others)']},
 {714: [3322390, 'others']},
 {763: [4942479, 'others']},
 {764: [4942479, '

In [None]:
def extract_and_correct_tuple(text):
  # 튜플 형식을 찾는 정규식 패턴
  match = re.search(r'\(\s*["\']?(.+?)["\']?\s*,\s*["\']?(.+?)["\']?\s*\)', text)
  if match:
    part1 = match.group(1).strip()
    part2 = match.group(2).strip()

    # 따옴표로 이미 묶여 있는지 확인
    if not (part1.startswith("'") or part1.startswith('"')):
      part1 = f'"{part1}"'
    if not (part2.startswith("'") or part2.startswith('"')):
      part2 = f'"{part2}"'
    # 추출한 부분을 따옴표로 묶어줌
    corrected_tuple_str = f"({part1}, {part2})"

    try:
      return ast.literal_eval(corrected_tuple_str)
    except (ValueError, SyntaxError) as e:
      # print(f"Error parsing tuple: {e}")
      return None

  return None

clear_result = []

for r in result:
  for idx, val in r.items():
    if isinstance(val[1], str):  # 문자열인 경우만 처리
      extract = extract_and_correct_tuple(val[1])
      if extract:
        clear_result.append({idx: [val[0], extract]})
      else:
        clear_result.append({idx: [val[0], val[1]]})
    else:
      clear_result.append({idx: val})
clear_result

[{14: [5693786, 'Others']},
 {151: [1564953, 'food']},
 {191: [6408608, ('not back', 'others')]},
 {262: [6124071, 'atmosphere']},
 {298: [4158750, 'others']},
 {302: [5518616, 'others']},
 {308: [4524098, 'others']},
 {338: [2385094, 'service']},
 {450: [4601575, 'food']},
 {452: [4601575, 'facility']},
 {485: [209152, 'food']},
 {522: [4143501, 'food']},
 {590: [660439, 'atmosphere']},
 {594: [660439, ('Not much', 'others')]},
 {602: [660439, 'food']},
 {614: [3485250, 'food']},
 {615: [3485250, 'others']},
 {616: [3485250, 'food']},
 {633: [4066113, ('We will be back', 'others')]},
 {711: [3680804, ('go back', 'others')]},
 {714: [3322390, 'others']},
 {763: [4942479, 'others']},
 {764: [4942479, 'service']},
 {771: [4942479, 'facility']},
 {862: [1164182, 'food']},
 {882: [1164182, ('$8 each', 'price')]},
 {922: [4619744, ('go back', 'others')]},
 {940: [2900385, 'food']},
 {958: [2900385, 'food']},
 {966: [2900385, 'others']},
 {1072: [6221927, 'food']},
 {1085: [1102848, 'food']}

In [None]:
for res in clear_result:
  for index, value in res.items():
    review_id = value[0]
    if isinstance(value[1], tuple):  # 키워드와 토픽이 모두 주어진 경우
      if len(value[1]) == 2:
        keyword, topic = value[1]
        missing.loc[index, 'keywords_ug'] = keyword  # 적절한 키워드 컬럼 업데이트
        missing.loc[index, 'keywords_bg'] = keyword  # 적절한 키워드 컬럼 업데이트
        missing.loc[index, 'topic'] = topic
      else:
        print(f"Unexpected tuple length at index {index}: {value[1]}")
    else:  # 토픽만 주어진 경우
      topic = value[1]
      missing.loc[index, 'topic'] = topic
missing

Unnamed: 0,review_id,stars,sentence,compound_score,positive_score,neutral_score,negative_score,keywords_ug,keywords_bg,topic
14,5693786,5.0,Unless it was three guys putting their money t...,-0.181468,0.031053,0.756426,0.212521,business,business,Others
151,1564953,4.0,Tough to find good salads in this part of town...,0.967932,0.970039,0.027855,0.002107,salads,salads town,food
191,6408608,1.0,We will not be back.,-0.602414,0.029966,0.337654,0.632380,not back,not back,others
262,6124071,5.0,Made our Friday night experience wonderful.,0.982624,0.983850,0.014924,0.001226,friday,experience wonderful,atmosphere
298,4158750,5.0,I thought that being in Seminole Heights parki...,-0.323043,0.068373,0.540212,0.391416,parking,seminole heights,others
...,...,...,...,...,...,...,...,...,...,...
228361,2358023,5.0,I ordered Hummus and Babaganoush.,0.132698,0.154546,0.823606,0.021848,hummus,ordered hummus,food
228362,2358023,5.0,Both came with fresh warm pita wedges in ample...,0.799628,0.801389,0.196849,0.001762,wedges,pita wedges,food
228367,6399853,1.0,After never notifying me before I show up to p...,-0.766449,0.009753,0.214046,0.776201,chicken,restaurant hour,others
228467,2367318,3.0,Most of the people stand and drink though.,-0.189541,0.042031,0.726397,0.231572,stand,stand drink,food


In [None]:
concat_df.update(missing) # concat_df에 업데이트

In [None]:
concat_df = concat_df[concat_df['sentence'].str.len() >= 10] # 10자 이상 문장만 filter

In [None]:
concat_df['topic'] = concat_df['topic'].apply(lambda x : x.lower())
concat_df['topic'].unique()

array(['service', 'others', 'food', 'facility', 'price', 'atmosphere'],
      dtype=object)

In [None]:
# 저장
concat_df.to_csv(file_path+'/file_name.csv', index=False)

## One-Hot Encoding

In [None]:
review = pd.read_csv(file_path+'file_name.csv')

In [None]:
review['topics'] = review['topics'].apply(lambda x : ast.literal_eval(x) )
review['sentences'] = review['sentences'].apply(lambda x : ast.literal_eval(x) )
review['keywords_bg'] = review['keywords_bg'].apply(lambda x : ast.literal_eval(x) )
review['keywords_ug'] = review['keywords_ug'].apply(lambda x : ast.literal_eval(x) )

In [None]:
%pip install tensorflow



In [None]:
from sklearn.preprocessing import OneHotEncoder

* roberta에서 삭제한 문장 review 테이블에서 drop
* roberta에서 추가한 keyword 및 topic도 추가

In [None]:
roberta_id = roberta.groupby('review_id').agg(
    stars = ('stars', 'mean'),
    sentences = ('sentence', list),
    keywords_ug = ('keywords_ug', list),
    keywords_bg = ('keywords_bg', list),
    topics = ('topic', list),
    compound_score = ('compound_score', "mean"),
    positive_score = ('positive_score',list),
    neutral_score = ('neutral_score',list),
    negagtive_score = ('negative_score', list)
).reset_index()

In [None]:
keywords_bg_data = {}
keywords_ug_data = {}
topic_data = {}
sentnece_data = {}
compound_scores = {}
for idx, row in roberta_id.iterrows():
  review_id = row['review_id']
  topic = row['topics']
  kw_ug = row['keywords_ug']
  kw_bg = row['keywords_bg']
  cpd = row['compound_score']

  sentnece_data[review_id] = row['sentences']
  keywords_ug_data[review_id] = kw_ug
  keywords_bg_data[review_id] = kw_bg
  topic_data[review_id] = topic
  compound_scores[review_id] = cpd

print(keywords_bg_data, compound_scores, sep = '\n')

{21: -0.12010664674999999, 507: 0.35890004659999997, 784: -0.388022717125, 950: 0.9609176500000001, 987: 0.93381697, 1118: 0.7943305475, 1926: 0.5604522192666667, 2032: 0.54132665, 2070: 0.844994815, 2593: -0.5997639033333334, 3231: 0.910299796, 3363: 0.6571870066666666, 3622: 0.4517557239090909, 3729: 0.9783269, 3779: -0.653888876, 4326: 0.86581286, 4395: 0.7761788677777778, 4602: 0.699582595, 5072: 0.94167445, 5637: -0.08841773399999998, 5662: 0.7525010757333334, 5681: -0.41979495333333333, 6093: 0.6628195144285715, 6540: -0.44980176, 6864: -0.16913001057142857, 7199: -0.44596627375, 7384: -0.04750204499999999, 7443: 0.7431502444444446, 7463: -0.38259554966666665, 7529: 0.87897992, 7603: 0.32702578078571426, 7984: 0.9396274299999999, 8015: 0.04663270000000003, 8078: 0.5831613400000001, 8121: 0.66032988, 8242: -0.12728637066666665, 8283: 0.8847315, 8547: 0.2618977212307692, 8783: 0.4560755928333333, 8944: 0.49778758837500003, 9134: 0.4505865725, 9467: 0.8026139933333334, 10178: 0.1818

In [None]:
# roberta df에서 업데이트했던 keywords, topics 반영하여 review df 업데이트
review['topics'] = review['review_id'].map(topic_data)
review['keywords_ug'] = review['review_id'].map(keywords_ug_data)
review['keywords_bg'] = review['review_id'].map(keywords_bg_data)
review['compound_score_mean'] = review['review_id'].map(compound_scores)
review.head()

Unnamed: 0,index,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentences,sentence_counts,keywords_ug,keywords_bg,topics_raw,topics,compound_score_mean
0,184,1443659,607521,108373,5.0,0,0,0,Found 51st Deli on Uber Eats and we loved it!!...,2017-01-14 21:23:10,[Found 51st Deli on Uber Eats and we loved it!...,2,"[deli uber, check person]","[deli uber, check person]","{'service': ['deli uber'], 'others': ['check p...","[service, others]",0.987473
1,207,1044648,182108,134961,5.0,0,0,0,Love love love this place for HH n dinner is g...,2017-08-07 23:48:51,[Love love love this place for HH n dinner is ...,11,"[dinner good, brunch great, asian nachos, entr...","[dinner good, brunch great, asian nachos, entr...","{'food': ['creme brulee', 'hh dishes', 'bluebe...","[others, others, food, food, food, food, food,...",0.509072
2,237,5693786,749220,54049,5.0,0,0,0,Not sure the significance behind the name and ...,2018-09-25 14:49:55,[Not sure the significance behind the name and...,4,"[logo restaurant, business, menu breakfast, op...","[logo restaurant, business, menu breakfast, op...","{'food': ['menu breakfast'], 'facility': ['ope...","[others, others, food, facility]",0.339954
3,277,1166589,572170,9263,2.0,1,0,0,The service is going to be slooooww. And the f...,2017-10-02 01:52:47,"[The service is going to be slooooww., And the...",7,"[going slooooww, food okay, group slow, came g...","[going slooooww, food okay, group slow, came g...","{'food': ['running food', 'food okay'], 'servi...","[service, food, service, service, service, oth...",-0.207153
4,323,6678286,816928,108373,5.0,0,0,0,Much more than a deli! Wife and I had the Phil...,2017-12-29 20:18:59,"[Much more than a deli!, Wife and I had the Ph...",6,"[deli, philly club, tasty good, bags chips, co...","[deli, philly club, tasty good, bags chips, co...","{'food': ['bags chips', 'tasty good', 'philly ...","[service, food, food, food, service]",0.257147


In [None]:
review['sentence_counts'] = review['sentences'].apply(lambda x: len(x))

In [None]:
review.columns

Index(['index', 'review_id', 'user_id', 'business_id', 'stars', 'useful',
       'funny', 'cool', 'text', 'date', 'sentences', 'sentence_counts',
       'keywords_ug', 'keywords_bg', 'topics_raw', 'topics',
       'compound_score_mean'],
      dtype='object')

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# MultiLabelBinarizer 객체 생성
mlb = MultiLabelBinarizer()

# 원핫 인코딩 수행
df_encoded = pd.DataFrame(mlb.fit_transform(review['topics']), columns=mlb.classes_)

# # 원래 데이터프레임과 결합
review = review.join(df_encoded)

# 결과 출력
new_order = ['index', 'review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date', 'sentences', 'sentence_counts',
             'keywords_ug', 'keywords_bg', 'compound_score_mean', 'topics_raw', 'topics']
review = review[ new_order + ['food', 'service', 'atmosphere', 'facility', 'price', 'others']]
review

Unnamed: 0,index,review_id,user_id,business_id,stars,useful,funny,cool,text,date,...,keywords_bg,compound_score_mean,topics_raw,topics,food,service,atmosphere,facility,price,others
0,184,1443659,607521,108373,5.0,0,0,0,Found 51st Deli on Uber Eats and we loved it!!...,2017-01-14 21:23:10,...,"[deli uber, check person]",0.987473,"{'service': ['deli uber'], 'others': ['check p...","[service, others]",0,1,0,0,0,1
1,207,1044648,182108,134961,5.0,0,0,0,Love love love this place for HH n dinner is g...,2017-08-07 23:48:51,...,"[dinner good, brunch great, asian nachos, entr...",0.509072,"{'food': ['creme brulee', 'hh dishes', 'bluebe...","[others, others, food, food, food, food, food,...",1,0,0,1,0,1
2,237,5693786,749220,54049,5.0,0,0,0,Not sure the significance behind the name and ...,2018-09-25 14:49:55,...,"[logo restaurant, business, menu breakfast, op...",0.339954,"{'food': ['menu breakfast'], 'facility': ['ope...","[others, others, food, facility]",1,0,0,1,0,1
3,277,1166589,572170,9263,2.0,1,0,0,The service is going to be slooooww. And the f...,2017-10-02 01:52:47,...,"[going slooooww, food okay, group slow, came g...",-0.207153,"{'food': ['running food', 'food okay'], 'servi...","[service, food, service, service, service, oth...",1,1,0,0,0,1
4,323,6678286,816928,108373,5.0,0,0,0,Much more than a deli! Wife and I had the Phil...,2017-12-29 20:18:59,...,"[deli, philly club, tasty good, bags chips, co...",0.257147,"{'food': ['bags chips', 'tasty good', 'philly ...","[service, food, food, food, service]",1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30874,2679757,5143901,1772582,22595,1.0,0,0,0,"Brand new but filthy. Trash overflowing, dirty...",2021-11-07 23:54:35,...,"[brand new, trash overflowing, counter cleanin...",-0.604357,"{'facility': ['trash overflowing', 'counter cl...","[others, facility, facility, price, others]",0,0,0,1,1,1
30875,2679857,2367318,995712,133866,3.0,8,2,3,Stopped in for dinner and drinks on a Saturday...,2018-03-25 15:30:29,...,"[stopped dinner, parking sparse, smokers ruini...",-0.123799,"{'food': ['stopped dinner', 'boyfriend wings',...","[food, facility, facility, facility, food, atm...",1,1,1,1,0,1
30876,2679863,6085901,1283829,84658,5.0,1,0,2,"Came here for lunch with my girlfriend, place ...",2018-08-10 04:52:50,...,"[pancakes, blue berry, fruity drink]",0.681004,"{'food': ['pancakes', 'blue berry', 'fruity dr...","[food, food, food]",1,0,0,0,0,0
30877,2679972,5916423,1410939,123182,1.0,0,0,0,Used to go here every week. Won't go again. Ma...,2020-06-03 00:49:05,...,"[used week, won, bag leave, didn food, replace...",-0.420128,"{'food': ['replace meal'], 'service': ['asked ...","[others, others, others, others, food, service...",1,1,0,0,0,1


In [None]:
# 최종 저장
review.to_json(file_path+'/file_name.json', orient='records', lines=True)
review.to_csv(file_path+'/file_name.csv',index=False)