In [None]:
import pandas as pd
import re
from tqdm.notebook import tqdm
import pickle

In [None]:
# Mount Google Drive at /content/drive (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### data load

In [None]:
# Directory prefix for the input CSV and the saved outputs.
# NOTE(review): '/your_path' looks like a placeholder — replace with the real folder.
file_path = '/your_path'

In [None]:
# Load the sampled review dataset
review_sample = pd.read_csv(file_path + '/file_name.csv')

## spaCy tokenizer
* nltk로 분리해보니 마침표 외 문장기호로 구분되거나 혹은 마침표가 없는 경우 잘 분리되지 않음
* spaCy의 경우 문장기호에 구애받지 않고 문맥을 고려한 문장 분리가 가능하여 대체 사용

In [None]:
# Download the spaCy English model (en_core_web_sm); the import happens in a later cell
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
from spacy.language import Language

# GPU setting (marked as required by the author).
# NOTE(review): prefer_gpu() only uses a GPU if one is available and otherwise
# continues on CPU — confirm whether a hard requirement was intended.
spacy.prefer_gpu()

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

### test
* 이상 케이스들이 잘 분리되는지 확인

In [None]:
# 1) (?)가 포함된 text
txt = """
Gotta give props to an "actual authentic Asian place" that puts out a great meal! Nothing better than having a high quality, fresh dish cooked the way it should be...original recipe. So many restaurants cater to Americans' lack of ability to accept what they do not find familiar. Serving sweet and sour chicken (?), and beef (?), teriyaki. Break the chains and have a go at this place! It's clean, fresh, friendly, fairly priced and has a great selection and best of all... it's the real deal. You'll stuff your face no matter how much you didn't intend to do so.
"""

# Run the pipeline on the sample and trim whitespace around each detected sentence.
doc = nlp(txt)
senten = list(map(lambda span: span.text.strip(), doc.sents))

# Show the result
senten

['Gotta give props to an "actual authentic Asian place" that puts out a great meal!',
 'Nothing better than having a high quality, fresh dish cooked the way it should be...original recipe.',
 "So many restaurants cater to Americans' lack of ability to accept what they do not find familiar.",
 'Serving sweet and sour chicken (?), and beef (?), teriyaki.',
 'Break the chains and have a go at this place!',
 "It's clean, fresh, friendly, fairly priced and has a great selection and best of all... it's the real deal.",
 "You'll stuff your face no matter how much you didn't intend to do so."]

In [None]:
# 2) 마침표가 여러개 찍힌 경우
txt = """
Brick and Mortar has been bookmarked for a long time, but due to the absurd amount of good restaurants in St Pete it took a while to get here...Big Mistake on our part as even in a great dining town like St Pete this one is a real standout...Located directly across Central from Cycle Brewing it takes up the bottom floor of an office building..the restaurant is not a big place, but when crowded they have a separate "room" they seat people in which is actually the lobby of the office building during business hours...I know the add on room sounds terrible, but that's where we sat and the extra space was actually quite nice...Brick and Mortar definitely has the hot concept in the restaurant business these days which is small plates with a drink menu specializing in Wine and Local Craft Beers...St Pete/Tampa Area has a lot of these type places, but Brick and Mortar does it better than most...went with a group of six who all like to order small plates and share so I was able to sample many dishes that included..Carpaccio with House Ravioli stuffed with a Poached Egg...Veal Meatballs over Creamy Parmesan Polenta...Slow Braised Octopus done 2 different ways...Curry and Spice Rubbed Smoked Pulled Pork Poutine..Carmelized Onion and Cheese Tart... 
and a Charred Romaine Heart....not a misfire was to be found in any of the six dishes with The Poutine Dish being my favorite...I like Poutine so much that The Canadians are close to making me a Honorary Citizen so I tend to order it the rare times I see it on menus...Brick and Mortars may be the best I ever had as it consisted of Truffle Fries covered with Red Curry Gravy..Smoked Pulled Pork..Queso Fresco...Pickled Onions..Cilantro..and Jalapeno...this dish just exploded with flavor...do yourself a favor if you go to Brick and Mortar and order this dish and you won't be sorry...besides the food they also had a fantastic wine and local craft beer selection and I was excited to be able to get a Cycle Cream and Sugar Please which is not available much outside the brewery..server was on point with good recommendations and was always there when we needed her...it was also a nice touch that the owner came by a few times to ask us how we were doing and how we enjoyed everything?..it's easy to see why everything is so good at Brick and Mortar and why it runs so smoothly when the owner is around and obviously cares so much..in a town full of great Restaurants Brick and Mortar is one of the best and I will definitely return sooner than later..this one rates an easy 5 stars on every Restaurant Measurable and deserves all it's success
"""

# Parse the multi-period review stored in `txt` by this cell.
doc = nlp(txt)
senten = [sent.text.strip() for sent in doc.sents]

# Show the result
senten

['Brick and Mortar has been bookmarked for a long time, but due to the absurd amount of good restaurants in St Pete it took a while to get here...',
 'Big Mistake on our part as even in a great dining town like St Pete this one is a real standout...Located directly across Central from Cycle Brewing it takes up the bottom floor of an office building..',
 'the restaurant is not a big place, but when crowded they have a separate "room" they seat people in which is actually the lobby of the office building during business hours...',
 "I know the add on room sounds terrible, but that's where we sat and the extra space was actually quite nice...",
 'Brick and Mortar definitely has the hot concept in the restaurant business these days which is small plates with a drink menu specializing in Wine and Local Craft Beers...',
 'St Pete/Tampa Area has a lot of these type places, but Brick and Mortar does it better than most...went with a group of six who all like to order small plates and share so 

In [None]:
# 3) 마침표 없는 경우
text  = """
I've been going to Primo's Hoagies for many many many years now probably since they've been open and you bring the location up in Bensalem PA before I moved in the Philadelphia I want in your tonight to get dinner yeah once in a while they get my food wrong and sometimes I may not be 100% clear where I forgot but I have not called the store in almost 3 years and complaining about anything I get chicken parm sandwich is there a lot I have some put lettuce tomato and onion on it and no sauce in the last couple years every time I come home not every time but when I come home sometimes there's sauce on there I asked him not to put spices on my sandwiches tonight when I was making my order I said to them I wanted to check in parm sandwich with lettuce tomato and onion on it and it didn't matter what kind of cheese donor spoke up and also he said don't put no red sauce on there he likes buffalo on the side well when I got home the sandwich was 100% wrong which was not a big deal but when I called they are which I feel that you have really awesome food there the owner was not nice hung up on me they offer me 10% off will my next sandwich than the off me 20% off after complaining a little bit then he said okay what will give you a free sandwich at that time my nerves were shot and I was overwhelmed and Ida said freak it are you just want to bring the sandwiches back to get my money back was it just wasn't worth it so I am running back over there and I am dropping the sandwiches off at the front door and leaving them near and I will never return there I am out 40 some dollars because prices went up what's the when I open the buffalo sauce it didn't smell or seem like the regular buffalo sauce I usually get there unless they didn't tell me that they might have changed their brands I'm on 100% sure it was fine last week when I was there and I'm not just saying this to complain I'm being honest I'm a long-term customers are for many years and I can't believe I just got treated 
the way I got treated at and I have enough going on my mind there's too much stuff going on in this world to be treated like crap and have the owner Tommy I always give the wrong order which I admit sometimes I don't give everything a hundred percent right that's why I had not called they are in the last three years and said anything about anything every time I get a turkey Hoagie there's always spices on there salt or whatever I asked him not to put it on it I don't want to call there and complain last time I got a chicken parm sandwich I guess for lettuce tomato and onion on it I did not call and complain and this time I felt you know what if it's going to cost me $47 out for three sandwiches especially when molds for a friend of mine I was going to say something about it this time I will never return to this location ever again they do make really great Hoagies don't get me wrong but how I was treated and how it has me shaking inside and they need to teach her staff to listen to a little bit more and maybe thinks we get done better I understand there's so much going on in the world right now I understand prices are going up and stuff like that I didn't mind paying the extra money I just kind of question is why so much 40 some dollars for 3 sandwich which I feel they owe me my money back and I will head to an email and email the head corporation on this one I thought the owner was very rude why you hanging up on me otherwise he's always been awesome to me and I feel horrible right now cuz I really like the guy and that really sucks I feel like I was treated like crap I understanding if I was treated a little bit different or if they would have said hey this brings if it would have been earlier I would have just brought the sandwich back and have them fix that but they were closing soon but I didn't need to be treated the way I was treated at that was the worst experience I ever had it any kind of restaurant or anything or any kind of sandwich plays if you could 
find something in there that I explain to you in the last 3 years I like you to show it to me because I have not as many times you got my sandwiches wrong I'm not said a bad word or anything I've always been nice polite and even tipped really upset with the owner for treating me that way like I said I will never return to this location never again and I will make it known to my friends and everybody that goes there I think was wrong and I think I owe an apology to me and my money back
"""

# Run the pipeline on the unpunctuated review and trim each detected sentence.
doc = nlp(text)
senten = [piece.strip() for piece in (span.text for span in doc.sents)]

# Show the result
senten

["I've been going to Primo's Hoagies for many many many years now probably since they've been open and you bring the location up in Bensalem PA before I moved in the Philadelphia.",
 'I went in tonight to get dinner.',
 'Yeah once in a while they get my food wrong',
 'and sometimes I may not be 100% clear where I forgot',
 'but I have not called the store in almost 3 years and complaining about anything.',
 'I get chicken parm sandwich is there a lot.',
 "I have some put lettuce tomato and onion on it and no sauce in the last couple years every time I come home not every time but when I come home sometimes there's sauce on there.",
 'I asked him not to put spices on my sandwiches.',
 "Tonight when I was making my order I said to them I wanted to check in parm sandwich with lettuce tomato and onion on it and it didn't matter what kind of cheese donor spoke up and also he said don't put no red sauce on there he likes buffalo on the side.",
 'Well when I got home the sandwich was 100% wro

### Function

In [None]:
from multiprocessing import Pool, cpu_count

# Sentence-splitting function
_NLP_BY_PID = {}  # process id -> spaCy pipeline loaded by that process


def _get_nlp():
  """Return the spaCy pipeline owned by the current process, loading it on first use."""
  import os  # local import so this cell does not depend on the top-level import cell

  pid = os.getpid()
  if pid not in _NLP_BY_PID:
    # Loading the model is expensive, so do it once per process rather than once
    # per review. Keying by pid makes every worker load its own pipeline.
    spacy.prefer_gpu()  # GPU setting
    _NLP_BY_PID[pid] = spacy.load("en_core_web_sm")
  return _NLP_BY_PID[pid]


def process_review(review):
  """Split one review into sentences with spaCy.

  Args:
    review: Review text. A missing value (None or float NaN) is treated as
      having no sentences.

  Returns:
    List of whitespace-stripped, non-empty sentence strings.
  """
  # NaN is the only float that is not equal to itself.
  if review is None or (isinstance(review, float) and review != review):
    return []
  doc = _get_nlp()(review)
  stripped = (sent.text.strip() for sent in doc.sents)
  return [sentence for sentence in stripped if sentence]

def separate_sentence(df):
  """Split every review in ``df`` into sentences using a pool of worker processes.

  Reads the ``text`` and ``review_id`` columns of ``df``. The pool is sized with
  ``cpu_count()`` and a tqdm bar tracks progress.

  Returns:
    dict mapping each ``review_id`` to the list ``process_review`` returned for
    that row's ``text``. If a ``review_id`` repeats, the last row wins.
  """
  n_rows = len(df)
  with Pool(cpu_count()) as workers:
    # imap yields results in input order, so they line up with df['review_id'] below.
    ordered = workers.imap(process_review, df['text'])
    results = list(tqdm(ordered, total=n_rows))
  # Pair each review id with its sentence list.
  return dict(zip(df['review_id'], results))

## Separate Sentences

In [None]:
# Split every sampled review into sentences.
rev_sentences = separate_sentence(review_sample)

# Preview a few entries only; rendering the whole result bloats the notebook output.
dict(list(rev_sentences.items())[:3])

  0%|          | 0/30880 [00:00<?, ?it/s]

{1443659: ['Found 51st Deli on Uber Eats and we loved it!!!',
  "I can't wait to check it out in person!"],
 1044648: ['Love love love this place for HH n dinner is good too!',
  'Its ok for brunch, not as great as what people praise it to be.',
  'The Asian Nachos are the dish to get for HH.',
  'Its an entree itself!',
  'The other HH dishes are great too.',
  'The creme brulee is amazing as with their other desserts.',
  'The blueberry mojito is awesome along with the mango sling.',
  'Great place to celebrate birthdays and such.',
  'Nice feel to it.',
  'Bar area gets crowded as it gets later for HH.',
  'As usual around the mall, finding parking sux!'],
 5693786: ['Not sure the significance behind the name and logo of the restaurant.',
  'Unless it was three guys putting their money together to form this business',
  'The food was great/ staff friendly/ huge menu from breakfast /lunch/dinner to dessert.',
  'Open 24 hours will definitely be back next time in town'],
 1166589: ['T

In [None]:
# Save as a pickle if needed
with open(file_path+'/sentences.pkl', 'wb') as f:
    pickle.dump(rev_sentences, f)

## Add Columns

In [None]:
# Add a `sentences` column (list per row) by looking up each review_id in rev_sentences.
# assign() returns a new DataFrame, so nothing is written into a slice of another
# frame (the cause of SettingWithCopyWarning) and the cell can be re-run safely.
review_sample = review_sample.assign(
    sentences=review_sample['review_id'].map(rev_sentences)
)
review_sample.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_sample['sentences'] = review_sample['review_id'].map(rev_sentences)


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,text_length,sentences
184,1443659,607521,108373,5.0,0,0,0,Found 51st Deli on Uber Eats and we loved it!!...,2017-01-14 21:23:10,2,[Found 51st Deli on Uber Eats and we loved it!...
207,1044648,182108,134961,5.0,0,0,0,Love love love this place for HH n dinner is g...,2017-08-07 23:48:51,11,[Love love love this place for HH n dinner is ...
237,5693786,749220,54049,5.0,0,0,0,Not sure the significance behind the name and ...,2018-09-25 14:49:55,3,[Not sure the significance behind the name and...
277,1166589,572170,9263,2.0,1,0,0,The service is going to be slooooww. And the f...,2017-10-02 01:52:47,7,"[The service is going to be slooooww., And the..."
323,6678286,816928,108373,5.0,0,0,0,Much more than a deli! Wife and I had the Phil...,2017-12-29 20:18:59,6,"[Much more than a deli!, Wife and I had the Ph..."


In [None]:
# Drop the previously added text/sentence length columns and recompute the sentence count.
# errors='ignore' keeps the cell re-runnable when those columns are already gone;
# building a new frame instead of inplace=True avoids mutating a slice.
review_sample = (
    review_sample
    .drop(columns=['text_length', 'sentence_length'], errors='ignore')
    .assign(sentence_counts=lambda frame: frame['sentences'].apply(len))
)
review_sample.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_sample.drop(['text_length','sentence_length'], axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_sample['sentence_counts'] = review_sample['sentences'].apply(len)


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentences,sentence_counts
184,1443659,607521,108373,5.0,0,0,0,Found 51st Deli on Uber Eats and we loved it!!...,2017-01-14 21:23:10,[Found 51st Deli on Uber Eats and we loved it!...,2
207,1044648,182108,134961,5.0,0,0,0,Love love love this place for HH n dinner is g...,2017-08-07 23:48:51,[Love love love this place for HH n dinner is ...,11
237,5693786,749220,54049,5.0,0,0,0,Not sure the significance behind the name and ...,2018-09-25 14:49:55,[Not sure the significance behind the name and...,4
277,1166589,572170,9263,2.0,1,0,0,The service is going to be slooooww. And the f...,2017-10-02 01:52:47,"[The service is going to be slooooww., And the...",7
323,6678286,816928,108373,5.0,0,0,0,Much more than a deli! Wife and I had the Phil...,2017-12-29 20:18:59,"[Much more than a deli!, Wife and I had the Ph...",6


In [None]:
# Save the updated reviews
review_sample.to_csv(path_or_buf=file_path + '/file_name.csv', index=False)
# Save as JSON too if needed (to preserve the list data format)
review_sample.to_json(path_or_buf=file_path + '/file_name.json', orient='records')