<a href="https://colab.research.google.com/github/Cutlets/NaverMovieScraping/blob/main/Naver_Movie_Collector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import requests
import urllib.request
import pandas as pd
import numpy as np
import re
import json
import os
import os.path
import pprint as pp
import unicodedata
import shutil
import csv
import torch

from csv import reader
from collections import OrderedDict
from bs4 import BeautifulSoup
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast
from tokenizers import SentencePieceBPETokenizer
from transformers import BartForConditionalGeneration

In [None]:
# Text summarization with KoBART
# Code from here : https://huggingface.co/spaces/gogamza/kobart-summarization
def tokenizer():
    """Load and return the tokenizer of the 'gogamza/kobart-summarization' checkpoint."""
    return PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-summarization')

def get_model():
    """Load the 'gogamza/kobart-summarization' BART model and put it in eval mode."""
    summarizer = BartForConditionalGeneration.from_pretrained('gogamza/kobart-summarization')
    # Inference only: switch off training-time behaviour before handing it out.
    summarizer.eval()
    return summarizer

# Load the summarization model once at module level; tsum() reads this global.
model = get_model()
# NOTE: this rebinds the name `tokenizer` from the loader function above to the
# loaded tokenizer object, so the loader cannot be called again unless the
# function definition is re-executed first.
tokenizer = tokenizer()

def tsum(txt, max_input_tokens=1024):
  """Summarize *txt* with the module-level `model` and `tokenizer`.

  Args:
    txt: Text to summarize. Any falsy value (``""``, ``None``) is treated as
      "nothing to summarize".
    max_input_tokens: Upper bound on the encoder input length, including the
      BOS/EOS tokens added here. Longer texts are truncated instead of being
      passed to the model unbounded.
      NOTE(review): 1024 is assumed to fit the checkpoint's position table -
      confirm against model.config.max_position_embeddings.

  Returns:
    The decoded summary string, or the placeholder string "NaN" when *txt*
    is empty.
  """
  if not txt:
    return "NaN"

  # Keep two slots free for the BOS/EOS tokens wrapped around the text.
  body_budget = max(max_input_tokens - 2, 0)
  raw_input_ids = tokenizer.encode(txt)[:body_budget]
  input_ids = [tokenizer.bos_token_id] + raw_input_ids + [tokenizer.eos_token_id]

  summary_ids = model.generate(torch.tensor([input_ids]),
                               max_length=256,
                               early_stopping=True,
                               repetition_penalty=2.0)
  # Index the single batch row explicitly; squeeze() would collapse a
  # one-token result to a scalar instead of a list of ids.
  return tokenizer.decode(summary_ids[0].tolist(), skip_special_tokens=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# Run configuration #

# Options
debugMode = False
yearList = [2021, 2020, 2019, 2018, 2017, 2016]
loadMovieData = True
loadMovieData_DebugMode = False
importZip = True

# Debug mode shrinks the crawl: fewer listing pages, fewer review pages, a
# single year, cached data only if explicitly allowed, and no zip export.
if debugMode:
  reduceYearPage, reduceReviewPage = 26, 9
  yRange = 1
  loadMovieData = loadMovieData_DebugMode
  importZip = False
else:
  reduceYearPage = reduceReviewPage = 0
  yRange = len(yearList)

# Start index: movie IDs up to and including this value are dropped by
# collectMovieByYear. 0 = no start index.
minIndex = 142620

# Folder holding the per-movie raw JSON files (created on first run).
rawPath = "./movie_data/"
os.makedirs(rawPath, exist_ok=True)
rawList = os.listdir(rawPath)

# Naver movie URL pieces
nmURL = "https://movie.naver.com/movie/bi/mi/basic.naver?code="
nmReviewURL = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code="
nmReviewURL2 = "&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false"
nmYearURL = "https://movie.naver.com/movie/sdb/browsing/bmovie.naver?open="
nmPage = "&page="

# Exclusive upper bounds for page loops (last page visited is N-1).
maxYearPage = 31 - reduceYearPage  # Max value = 31
maxReviewPage = 11 - reduceReviewPage



In [None]:
# Functions

###########################
########### UNUSED FUNCTION
# Extract the movie ID from raw json lists
def rawToMovieId(rawLst):
  """Turn raw file names such as '12345.json' into a sorted list of int IDs.

  Args:
    rawLst: Iterable of file names (e.g. the result of os.listdir).

  Returns:
    Ascending list of the integer movie IDs found. Names whose stem is not
    an integer (stray folders, editor checkpoints, other files) are skipped
    rather than aborting the whole conversion.
  """
  suffix = '.json'
  mLst = []

  for raw in rawLst:
    # Strip only a trailing '.json'; the stem must be the numeric movie ID.
    stem = raw[:-len(suffix)] if raw.endswith(suffix) else raw
    try:
      mLst.append(int(stem))
    except ValueError:
      continue

  mLst.sort()
  return mLst

# Read raw jsons
def readJson(movieLst):
  """Load the raw JSON file of every movie ID in *movieLst*.

  Args:
    movieLst: Iterable of movie IDs; each maps to '<rawPath><id>.json'.

  Returns:
    List of the decoded JSON objects, in the order of *movieLst*.
  """
  json_collection = []
  for mid in tqdm(movieLst):
    # The scraper writes these files as UTF-8, so read them the same way
    # instead of relying on the platform default encoding.
    with open(rawPath+str(mid)+'.json', 'r', encoding="utf-8") as f:
      json_collection.append(json.load(f))
  print()
  return json_collection
########### UNUSED FUNCTION
###########################

# Scraping
def _firstText(soup, selector):
  """Return the text of the first node matching *selector*, or None if none match."""
  node = soup.select_one(selector)
  return None if node is None else node.text


def getMoviedata(mid):
  """Return the metadata record of movie *mid*, scraping it when not cached.

  If './movie_data/<mid>.json' exists and `loadMovieData` is set, that file
  is returned as-is. Otherwise the movie's detail page is fetched and parsed,
  the record is written to that file and the stored JSON is returned.

  Args:
    mid: Movie ID used in the detail-page URL and the cache file name.

  Returns:
    dict with the keys Title, MovieID, Genre, PG, Summary, Summary_short,
    Rating, RelateMovie and vaildReview. Text fields missing from the page
    hold the placeholder string "NaN"; a missing rating or related-movie
    list is stored as a float NaN.
  """
  filepath = './movie_data/'+str(mid)+'.json'
  if os.path.isfile(filepath) and loadMovieData:
    # Cache files are written as UTF-8 below; read them back the same way.
    with open(filepath, 'r', encoding="utf-8") as f:
      return json.load(f)

  movieDetail = requests.get(nmURL + str(mid))
  soup = BeautifulSoup(movieDetail.text, 'html.parser')

  # Rating
  rank = _firstText(soup, '#pointNetizenPersentBasic')
  if rank is not None:
    rank = re.sub('^관람객 평점','', rank)
    rank = re.sub('점','', rank)
    rank = rank.strip()
  try:
    rating = float(rank)
  except (TypeError, ValueError):
    # No rating element, or text that is not a number.
    rating = float("nan")

  # Genre
  genre = _firstText(soup, '#content > div.article > div.mv_info_area > div.mv_info > dl > dd:nth-of-type(1) > p > span:nth-of-type(1)')
  if genre is None:
    genre = "NaN"
  else:
    genre = re.sub('\r|\t|\n', '', genre).strip()

  # PG
  pg = _firstText(soup, '#content > div.article > div.mv_info_area > div.mv_info > dl > dd:nth-of-type(4) > p > a:nth-of-type(1)')
  pg = "NaN" if pg is None else pg.strip()

  # Summary
  syn = _firstText(soup, '#content > div.article > div.section_group.section_group_frst > div:nth-of-type(1) > div > div > p')
  if syn is None:
    syn = "NaN"
  else:
    syn = unicodedata.normalize("NFKD", syn)
    syn = re.sub('\r', '', syn).strip()

  # Title
  title = _firstText(soup, '#content > div.article > div.wide_info_area > div.mv_info > h3 > a')
  title = "NaN" if title is None else title.strip()

  # Review own check
  hasReview = soup.select_one('#pointNetizenCountBasic > em') is not None

  # Related movie
  related = [mv.text for mv in soup.select('#content > div.article > div:nth-of-type(5) > div:nth-of-type(2) > div > ul > li > a.title_mv')]
  rMovieList = related if related else np.nan

  movie_data = OrderedDict()
  movie_data["Title"] = title
  movie_data["MovieID"] = mid
  movie_data["Genre"] = genre
  movie_data["PG"] = pg
  movie_data["Summary"] = syn
  # Do not run the summarizer on the "NaN" placeholder of a missing synopsis.
  movie_data["Summary_short"] = tsum(syn) if syn != "NaN" else "NaN"
  movie_data["Rating"] = rating
  movie_data["RelateMovie"] = rMovieList

  # Reviews count as valid only for rated, non-restricted movies. The
  # finiteness check comes first because int() of NaN raises ValueError.
  restricted = pg in ('청소년 관람불가', '제한상영가')
  ratedNonZero = bool(np.isfinite(rating)) and int(rating) != 0
  movie_data["vaildReview"] = hasReview and not restricted and ratedNonZero

  os.makedirs('./movie_data/', exist_ok=True)
  with open(filepath, 'w', encoding="utf-8") as make_file:
    json.dump(movie_data, make_file, ensure_ascii=False, indent="\t")

  # Return what was stored so fresh and cached calls yield the same shape.
  with open(filepath, 'r', encoding="utf-8") as f:
    json_data = json.load(f)

  return json_data



# Merge movie data
def readMoviedata(movieLst):
  """Fetch the metadata record of every movie ID in *movieLst*, in order.

  Each ID is passed to getMoviedata while a tqdm progress bar tracks the
  loop; a blank line is printed once the loop is done.
  """
  records = [getMoviedata(movieId) for movieId in tqdm(movieLst)]
  print()
  return records



# Collect Movie Data by year
def collectMovieByYear(yearLst):
  """Return the sorted movie IDs released in the given years.

  If './movie_list/movieIdList.csv' exists, `loadMovieData` is set and the
  file contains IDs, those are returned without any network access.
  Otherwise listing pages 1..maxYearPage-1 of every year are scraped, IDs
  not greater than `minIndex` are dropped, and the result is written to
  that CSV file.

  Args:
    yearLst: Iterable of release years to crawl.

  Returns:
    Ascending list of int movie IDs (ints on both the cached and the
    freshly scraped path).
  """
  filepath = './movie_list/movieIdList.csv'
  if os.path.isfile(filepath) and loadMovieData:
    with open(filepath, 'r', encoding='utf-8', newline='') as csv_file:
      # The csv reader yields strings; convert so callers get ints either way.
      cached = [int(cell) for row in reader(csv_file) for cell in row if cell.strip()]
    if cached:
      return cached

  hrefPrefix = r'/movie/bi/mi/basic.naver?code='
  found = set()

  for year in tqdm(yearLst):
    baseURL = nmYearURL + str(year)
    for page in range(1, maxYearPage):
      listing = requests.get(baseURL + nmPage + str(page))
      soup = BeautifulSoup(listing.text, 'html.parser')

      for item in soup.select('#old_content > ul > li'):
        anchor = item.find('a')
        # Skip list items that carry no link instead of crashing on them.
        if anchor is None or not anchor.get('href'):
          continue
        found.add(int(anchor['href'].replace(hrefPrefix, '')))
  print()

  # Keep only IDs above the configured start index, in ascending order.
  movieID = sorted(ID for ID in found if ID > minIndex)

  listPath = './movie_list/'
  os.makedirs(listPath, exist_ok=True)
  # newline='' is what the csv module expects for files it writes.
  with open(listPath+"movieIdList.csv", 'w', encoding='utf-8', newline='') as file:
    csv.writer(file).writerow(movieID)
  return movieID

In [None]:
# Collect Movie Reviews from mid
def getReviewByID(mid):
  """Return the reviews of movie *mid*, scraping them when not cached.

  If './review_data/<mid>.json' exists and `loadMovieData` is set, that file
  is returned. Otherwise review pages are fetched one by one (at most 20
  pages of up to 10 reviews each), parsed, written to that file and returned.

  Args:
    mid: Movie ID used in the review URL and the cache file name.

  Returns:
    List of records with the keys review, date, rating, author, review_id
    and movie_id (movie_id as a string). Fields missing on the page hold
    the placeholder string "NaN".
  """
  os.makedirs('./review_data/', exist_ok=True)
  filepath = './review_data/'+str(mid)+'.json'
  if os.path.isfile(filepath) and loadMovieData:
    # Cache files are written as UTF-8 below; read them back the same way.
    with open(filepath, 'r', encoding="utf-8") as f:
      return json.load(f)

  reviewURL = nmReviewURL + str(mid) + nmReviewURL2 + nmPage
  reviewNum = -1  # -1 = total review count not read yet
  rPage = 1       # next page to fetch; 0 ends the loop
  reviews = []

  while rPage != 0:
    reviewPage = requests.get(reviewURL + str(rPage))
    rPage += 1
    soup = BeautifulSoup(reviewPage.text, 'html.parser')

    # The total count is read once, from the first page.
    if reviewNum == -1:
      total = soup.select_one('body > div > div > div.score_total > strong > em')
      reviewNum = int(total.text.replace(",", "")) if total is not None else 0

    # NOTE(review): the page cap is hard-coded (rPage == 21 means page 20 was
    # just fetched); the module-level maxReviewPage is not consulted here.
    if reviewNum < 10 or rPage == 21:
      rPage = 0
      # Last page: take what is left, but never more than a full page. The
      # former `reviewNum % 10` dropped reviews when the page cap was hit
      # with ten or more still remaining.
      mentMax = min(reviewNum, 10)
    else:
      reviewNum -= 10
      mentMax = 10

    # Retrieve data
    for mentNum in range(mentMax):
      item = f'body > div > div > div.score_result > ul > li:nth-of-type({mentNum+1})'
      header = f'{item} > div.score_reple > dl > dt'

      # Review text
      node = soup.select_one(f'#_filtered_ment_{mentNum}')
      text = node.text.strip() if node is not None else "NaN"

      # Rating
      node = soup.select_one(f'{item} > div.star_score > em')
      score = node.text if node is not None else "NaN"

      # Author: drop a leading 'xxx(' run and the trailing ')'.
      node = soup.select_one(f'{header} > em:nth-of-type(1) > a > span')
      nid = re.sub(r'(^\S*\()|(\)$)', "", node.text) if node is not None else "NaN"

      # Date: drop the first two digits and a trailing ' HH:MM'.
      node = soup.select_one(f'{header} > em:nth-of-type(2)')
      rDate = re.sub(r'(^\d\d)|(\s\d\d:\d\d$)', "", node.text) if node is not None else "NaN"

      # review_id: the digits found in the link's onclick attribute.
      node = soup.select_one(f'{header} > em:nth-of-type(1) > a')
      onclick = node.get('onclick') if node is not None else None
      rid = re.sub(r'\D', "", onclick) if onclick else "NaN"

      r = OrderedDict()
      r["review"] = text
      r["date"] = rDate
      r["rating"] = score
      r["author"] = nid
      r["review_id"] = rid
      r["movie_id"] = str(mid)
      reviews.append(r)

  with open(filepath, 'w', encoding="utf-8") as make_file:
    json.dump(reviews, make_file, ensure_ascii=False, indent="\t")

  return reviews

def getReviews(movieData):
  """Collect the review list of every movie record in *movieData*.

  Each record's 'MovieID' is handed to getReviewByID; the per-movie results
  are returned in the same order, with tqdm showing progress.
  """
  return [getReviewByID(entry['MovieID']) for entry in tqdm(movieData)]

In [None]:
if __name__=="__main__":

  # Debug runs crawl only the first yRange years.
  movieList = collectMovieByYear(yearList[:yRange] if debugMode else yearList)

  # Debug runs also keep just the first few movies.
  reduceList = 5 if debugMode else len(movieList)
  if debugMode:
    movieList = movieList[:reduceList]

  movieData = readMoviedata(movieList)
  reviewData = getReviews(movieData)

  # Show one sample of each result while debugging.
  if debugMode:
    for sample in (movieData[0], reviewData[0]):
      pp.pprint(sample)

  # Pack the scraped folders into movie_data.zip / review_data.zip.
  if importZip:
    shutil.make_archive('./movie_data', 'zip', './movie_data/')
    shutil.make_archive('./review_data', 'zip', './review_data/')

