# 변경 X

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
import time

In [2]:
# Root of the Naver Movie site; the path constants below are appended to it.
BASE_URL = "https://movie.naver.com"
# Path and query prefix of the search-result page; the search term is appended.
SEARCH = "/movie/search/result.naver?query="
# Path and query prefix of a movie's rating ("point") page; the movie code is appended.
POINT_SEARCH = "/movie/bi/mi/point.naver?code="

In [3]:
# Empty frame that accumulates one row per crawled movie; the column names
# match the keys of the dict returned by get_movie_info.
movie_df = pd.DataFrame(columns=['code', 'korean_title', 'english_title', 'genre_list', 'country', 'released_date', 'director', 'actors'])

In [4]:
def html_parser(url, timeout=10):
    """Download *url* and return the response body as text.

    Despite its name this function does not parse anything; it only performs
    the HTTP GET.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would freeze a long crawl.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code
        is not checked, so error pages are returned as-is.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

In [5]:
def html_selector(html, query):
    """Return every element of *html* that matches the CSS selector *query*.

    Args:
        html: HTML document as a string.
        query: CSS selector passed to BeautifulSoup's ``select``.

    Returns:
        The list of matching tags (empty when nothing matches). The document
        is parsed from scratch on every call.
    """
    return BeautifulSoup(html, 'html.parser').select(query)

In [6]:
def get_point_url(code):
    """Fetch the rating ("point") page of the movie identified by *code*.

    Note that, despite the name, the return value is the page's HTML text
    (whatever ``html_parser`` returns), not a URL.
    """
    point_page_url = f"{BASE_URL}{POINT_SEARCH}{code!s}"
    return html_parser(point_page_url)

# 영화 정보 크롤링

수집 항목: code, 한국어 제목(korean_title), 영어 제목(english_title, 검색어 그대로 저장), 장르, 국가, 개봉일(null 값 있음), director, actors(의미 없는 값 있음)

개봉일과 actors는 1.5만 데이터에서도 확인할 수 있으므로 더 이상 디테일은 잡지 않았습니다.

ENG_TITLE_LIST에 영화 제목들을 담아주시면 되는데, 띄어쓰기는 모두 replace 또는 strip으로 없애주세요!

대소문자는 상관없습니다.

In [7]:
def get_movie_info(search_title):
    """Search Naver Movie for *search_title* and scrape the first hit.

    Args:
        search_title: Search term appended verbatim to the search URL. It is
            also stored unchanged as ``english_title`` in the result.

    Returns:
        A dict with the keys ``code``, ``korean_title``, ``english_title``,
        ``genre_list``, ``country``, ``released_date``, ``director`` and
        ``actors``, or the string ``'error'`` when the search has no usable
        hit or the detail page lacks the expected elements. Callers compare
        the result against ``'error'``.

        ``released_date`` is the plain concatenation of the release-date link
        texts, and ``actors`` is the raw cast text split on commas; neither
        is cleaned up further.
    """
    try:
        search_html = html_parser(BASE_URL + SEARCH + search_title)
        search_result = html_selector(search_html, '.search_list_1 > li')
        # Only the first search hit is used. Read the href attribute directly
        # instead of splitting the stringified tag, so a hit without a link
        # is reported as 'error' rather than raising IndexError.
        movie_url = search_result[0].select('dl > dt > a')[0]['href']
        code = movie_url.split('=')[1]

        basic_html = html_parser(BASE_URL + movie_url)

        # get_text() returns the unescaped title; slicing str(tag) would keep
        # HTML entities such as "&amp;".
        title = html_selector(basic_html, 'h3.h_movie > a')[0].get_text()

        movie_result = html_selector(basic_html, 'dl.info_spec > dd')
        temp_summary = movie_result[0].select('a')
        director = movie_result[1].text
        actors = movie_result[2].text.split(',')
    except Exception:
        # Best-effort crawl: any fetch or parse failure is reported to the
        # caller as 'error'. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt stop a long crawl.
        return 'error'

    genre_list = []
    nation_list = []
    released = []
    # The query-parameter name of each summary link says what it describes.
    buckets = {'genre': genre_list, 'nation': nation_list, 'open': released}
    for summary in temp_summary:
        href = summary.get('href', '')
        c = href.partition('.naver?')[2].split('=')[0]
        bucket = buckets.get(c)
        if bucket is None:
            # Unknown link category: print it so it can be inspected.
            print(c or href)
        else:
            bucket.append(summary.text)

    return {
        'code': code,
        'korean_title': title,
        'english_title': search_title,
        'genre_list': genre_list,
        'country': nation_list,
        'released_date': ''.join(released),
        'director': director,
        'actors': actors,
    }

In [9]:
# Example input: titles to look up, written without spaces (case does not matter).
ENG_TITLE_LIST = ['kingsman', 'spiderman', 'HOWTOBUILDAGIRL', 'Centigrade', 'LetsFightGhost', 'Moxie']

In [10]:
'''
print로 나온 친구들은 크롤링에 실패한 아이들입니다. 무슨 이유인지 간혹 있더라구요.
목록 다시 넘겨주시면 이유를 제가 다시 한 번 확인해보겠습니다!
'''
# Crawl every title. Titles that could not be crawled are printed so they can
# be re-checked; successful results are collected and added to movie_df once.
rows = []
for eng_title in ENG_TITLE_LIST:
    result = get_movie_info(eng_title)
    if result == 'error':
        print(eng_title)
    else:
        rows.append(result)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; a single concat gives the same rows.
if rows:
    movie_df = pd.concat([movie_df, pd.DataFrame(rows)], ignore_index=True)

LetsFightGhost


In [11]:
movie_df

Unnamed: 0,code,korean_title,english_title,genre_list,country,released_date,director,actors
0,159893,킹스맨: 퍼스트 에이전트,kingsman,[액션],"[영국, 미국]",2021.12.22,매튜 본,"[랄프 파인즈(옥스포드 공작), 해리스 딕킨슨(콘래드)더보기]"
1,31307,스파이더맨,spiderman,"[액션, SF, 모험, 스릴러]",[미국],2002.05.03,샘 레이미,"[토비 맥과이어(스파이더맨/피터 파커), 윌렘 대포(그린 고블린/노먼 오스본), ..."
2,188910,하우 투 빌드 어 걸,HOWTOBUILDAGIRL,[코미디],[영국],,코키 지드로익,"[비니 펠드스타인, 패디 콘시딘, 사라 솔매니더보기]"
3,201848,센티그레이드,Centigrade,"[스릴러, 드라마]",[캐나다],,브렌던 월쉬,[\n\r\n\t\t\t\t\t\t[국내] \r\n\t\t\t\t\t\t\r\n\t...
4,203096,걸스 오브 막시,Moxie,[드라마],[미국],,에이미 포엘러,"[조세핀 랭포드, 패트릭 슈왈제네거더보기]"


# 네티즌 평점 크롤링

movie_df에서 가져온 code를 기준으로 다시 평점 데이터를 받아옴

아직 미개봉 영화에 대해서는 모두 NaN으로 처리

`code`: 영화코드(str)

`num_of_netizen`: 평가에 참여한 네티즌의 수(str)

`key_review`: 한 줄 리뷰(str)

`score_male`: 남자 평점(str)

`score_female`: 여자 평점(str)

`scores_per_age`: 나이별 평점(float list) (10대, 20대, 30대, 40대, 50대 이상 순)

In [12]:
def get_review_info(code, classification):
    """Scrape the rating summary of one movie from its rating page.

    Args:
        code: Movie code used to build the rating-page URL.
        classification: ``'netizen'`` or ``'audience'``; selects the
            ``.grade_<classification>`` section of the page.

    Returns:
        The string ``'error'`` when the page has no star score for the
        requested section. Otherwise a dict with ``code``,
        ``num_of_<classification>``, ``key_review`` (``None`` when the page
        has no review summary), ``score_male``, ``score_female`` and
        ``scores_per_age`` (list of floats). For ``'audience'`` the dict also
        contains ``num_per_ages``.
    """
    # Gender and age graph points are indexed by this offset: 0 for netizen,
    # 1 for audience (presumably the page lists netizen values first).
    idx = 1 if classification == 'audience' else 0

    # Parse the page once and reuse it for every selector below.
    soup = BeautifulSoup(get_point_url(code), 'html.parser')
    header = f'.grade_{classification} > .title_area.grade_tit > .sc_area'

    try:
        elements = soup.select(f'{header} > .star_score > .st_off > .st_on')
        # The score itself is not returned; the lookup only verifies that the
        # section has a star score at all.
        _ = elements[0].attrs['style'].split(':')[1]
    except (IndexError, KeyError):
        return 'error'

    num = soup.select(f'{header} > .user_count > em')[0].text

    # Some pages have no one-line review summary.
    reviews = soup.select('.grp_review')
    key_review = reviews[0].text if reviews else None

    male = soup.select('.grp_gender > .grp_box > .grp_male > .graph_point')[idx].text
    female = soup.select('.grp_gender > .grp_box > .grp_female > .graph_point')[idx].text
    age_points = soup.select('.grp_age > .grp_box > .graph_point')
    score_list = [float(element.text) for element in age_points[idx * 5:(idx + 1) * 5]]

    result = {
        'code': code,
        f'num_of_{classification}': num,
        'key_review': key_review,
        'score_male': male,
        'score_female': female,
        'scores_per_age': score_list,
    }

    if classification == 'audience':
        # The bar widths in the inline style attribute carry the percentages.
        bars = soup.select('.bar_graph > .graph_box > .graph > span')[:5]
        result['num_per_ages'] = [element.attrs['style'].split(': ')[1] for element in bars]

    return result

In [13]:
# Start the netizen table from the two identifying columns only.
netizen_df = movie_df.loc[:, ['code', 'korean_title']].copy()
netizen_df

Unnamed: 0,code,korean_title
0,159893,킹스맨: 퍼스트 에이전트
1,31307,스파이더맨
2,188910,하우 투 빌드 어 걸
3,201848,센티그레이드
4,203096,걸스 오브 막시


In [14]:
# Fetch the netizen ratings once per movie and drop the failures. The earlier
# lambda called get_review_info twice per code, doubling the HTTP requests.
data = [
    info
    for info in (get_review_info(code, 'netizen') for code in netizen_df['code'])
    if info != 'error'
]
temp_df = pd.DataFrame(data, columns=['code', 'num_of_netizen', 'key_review', 'score_male', 'score_female', 'scores_per_age'])
# Outer merge keeps movies without ratings; their rating columns become NaN.
netizen_df = pd.merge(netizen_df, temp_df, how='outer', on='code')
netizen_df

Unnamed: 0,code,korean_title,num_of_netizen,key_review,score_male,score_female,scores_per_age
0,159893,킹스맨: 퍼스트 에이전트,2928.0,이 영화는 30대 남성이 좋아하는 연기가 뛰어난 영화입니다.,8.06,8.69,"[8.25, 8.16, 8.4, 8.1, 8.73]"
1,31307,스파이더맨,1754.0,이 영화는 10대 남성이 좋아하는 스토리가 뛰어난 영화입니다.,8.96,9.03,"[9.04, 8.93, 9.1, 8.78, 8.73]"
2,188910,하우 투 빌드 어 걸,,,,,
3,201848,센티그레이드,,,,,
4,203096,걸스 오브 막시,140.0,이 영화는 20대 여성이 좋아하는 스토리가 뛰어난 영화입니다.,6.12,9.86,"[9.76, 9.83, 8.92, 9.38, 10.0]"


# 관람객 평점

네티즌 평점과 동일하고, 추가사항이 있음

`num_per_ages`: 나이대 별 관람객 수 비율(10대, 20대, 30대, 40대, 50대 이상 순)

`num_per_genders`: 성별 관람객 수 비율 (남자, 여자 순). 이 값은 `get_review_info`가 아니라 아래의 Selenium 크롤링 셀에서 별도로 수집합니다.

In [15]:
# Start the audience table from the two identifying columns only.
audience_df = movie_df.loc[:, ['code', 'korean_title']].copy()
audience_df

Unnamed: 0,code,korean_title
0,159893,킹스맨: 퍼스트 에이전트
1,31307,스파이더맨
2,188910,하우 투 빌드 어 걸
3,201848,센티그레이드
4,203096,걸스 오브 막시


In [16]:
# Fetch the audience ratings once per movie and drop the failures. The earlier
# lambda called get_review_info twice per code, doubling the HTTP requests.
data = [
    info
    for info in (get_review_info(code, 'audience') for code in audience_df['code'])
    if info != 'error'
]
temp_df = pd.DataFrame(data, columns=['code', 'num_of_audience', 'key_review', 'score_male', 'score_female', 'scores_per_age', 'num_per_ages'])
# Outer merge keeps movies without ratings; their rating columns become NaN.
audience_df = pd.merge(audience_df, temp_df, how='outer', on='code')
audience_df

Unnamed: 0,code,korean_title,num_of_audience,key_review,score_male,score_female,scores_per_age,num_per_ages
0,159893,킹스맨: 퍼스트 에이전트,675.0,이 영화는 30대 남성이 좋아하는 연기가 뛰어난 영화입니다.,7.43,7.64,"[9.0, 7.71, 7.29, 7.47, 7.42]","[1%, 40%, 36%, 16%, 6%]"
1,31307,스파이더맨,,,,,,
2,188910,하우 투 빌드 어 걸,,,,,,
3,201848,센티그레이드,,,,,,
4,203096,걸스 오브 막시,,,,,,


In [40]:
# Probe one movie's rating page for the ".viewing_graph" chart.
point_url = get_point_url(208077)
is_graph = html_selector(point_url, '.viewing_graph')
if is_graph:
    percent_list = html_selector(point_url, '.viewing_graph > .graph_wrap > .bar_graph > .graph_box > .graph_percent')
    # Convert every "NN%" label to an int, then show the first five
    # (presumably the five age groups).
    print([int(label.text.replace('%', '')) for label in percent_list][:5])

[5, 45, 35, 12, 3]


In [85]:
code_list = audience_df['code'].tolist()

In [66]:
code_list = [130267, 159146, 145376, 160720, 156930, 159785, 72058, 35901, 109905, 74027, 181030, 144271, 129012, 103331, 124212, 67769, 130041, 136007, 118917, 109955, 193839, 104478, 127865, 130983, 129331, 129049, 122469, 192121, 106557, 129333, 95541, 172819, 117802, 122596, 118980, 90596, 15899, 127335, 69956, 107352, 118950, 19303, 18781, 103708, 32686, 101961, 94187, 73318, 112039, 101236, 117790, 105521, 88426, 122535, 120123, 93739, 108225, 112079, 93028, 99096, 61101, 109778, 87309, 114249, 190727, 122473, 127321, 146212, 109982, 195970, 125841, 45290, 118909, 72524, 178654, 164115, 113377, 109095, 16792, 149248, 109169, 100676, 94829, 137908, 113348, 118397, 99740, 191929, 92075, 52555, 90589, 66158, 118367, 12787, 125438, 137327, 155411, 160749, 162824, 125494, 97631, 72054, 18878, 91192, 103765, 97858, 97612, 189150, 100691, 103324, 95203, 78726, 124290, 86335, 32667, 10546, 11031, 119457, 51777, 25670, 98279, 95334, 107923, 102648, 119412, 12918, 31565, 76020, 66464, 78845, 31940, 47321, 96031, 64686, 89757, 74610, 52515, 169643, 150688, 39809, 66091, 82283, 62586, 37439, 85141, 104426, 97692, 123630, 17159, 109193, 132626, 10002, 59075, 10003, 80219, 118966, 92069, 142201, 76651, 100643, 17173, 37937, 49336, 87556, 86843, 52553, 34225, 38227]

In [86]:
len(code_list)

1824

In [93]:
# Gender-ratio labels keyed by movie code; NaN when the page has no <tspan>.
audience_genders = {}

driver = webdriver.Chrome(executable_path='./chromedriver')

# Only the codes from index 1235 onward are crawled in this run.
for code in code_list[1235:]:
    driver.get(f"{BASE_URL}{POINT_SEARCH}{code!s}")
    # Pause before reading the DOM (presumably so the chart can render).
    time.sleep(1)
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    temp_genders = bs.find_all('tspan')
    if temp_genders:
        # The first two <tspan> texts are taken as the (male, female) labels.
        num_per_genders = [element.text for element in temp_genders[:2]]
    else:
        num_per_genders = np.nan
    audience_genders[code] = num_per_genders

In [94]:
audience_genders

{72058: ['12%', '88%'],
 10440: ['32%', '27%'],
 35901: ['65%', '35%'],
 150829: nan,
 152700: nan,
 109905: ['58%', '42%'],
 38420: ['27%', '21%'],
 31283: ['21%', '26%'],
 49857: ['34%', '20%'],
 137982: ['28%', '28%'],
 74027: ['52%', '48%'],
 40098: ['20%', '46%'],
 181030: ['58%', '42%'],
 19066: ['22%', '41%'],
 91107: nan,
 146184: nan,
 134584: nan,
 142370: nan,
 144271: ['67%', '33%'],
 119720: nan,
 68535: nan,
 83001: nan,
 80866: ['22%', '35%'],
 158347: ['0%', '0%'],
 154491: nan,
 61865: ['7%', '45%'],
 37758: ['29%', '17%'],
 129012: ['48%', '52%'],
 103331: ['30%', '70%'],
 124212: ['55%', '45%'],
 171549: nan,
 67769: ['58%', '42%'],
 99794: ['26%', '35%'],
 87232: ['24%', '37%'],
 73096: ['17%', '40%'],
 153343: nan,
 129824: nan,
 151867: nan,
 130041: ['29%', '71%'],
 152696: nan,
 136007: ['45%', '55%'],
 144141: nan,
 149382: nan,
 118917: ['46%', '54%'],
 154195: nan,
 103271: ['26%', '30%'],
 133443: nan,
 109955: ['46%', '54%'],
 10392: ['36%', '36%'],
 31683:

In [92]:
# crawled = audience_genders
crawled

{203096: ['23%', '19%'],
 17997: ['20%', '23%'],
 181925: ['58%', '42%'],
 195979: nan,
 138113: nan,
 197517: ['100%', '21%'],
 155356: ['36%', '64%'],
 168668: ['44%', '56%'],
 156023: nan,
 151155: ['25%', '22%'],
 143776: ['38%', '62%'],
 168735: ['22%', '78%'],
 142683: ['25%', '75%'],
 199742: nan,
 167675: ['100%', '24%'],
 151241: ['61%', '39%'],
 154667: ['36%', '64%'],
 164269: ['38%', '62%'],
 159366: ['26%', '74%'],
 162874: ['28%', '72%'],
 158180: ['45%', '55%'],
 167638: ['42%', '58%'],
 180209: ['37%', '63%'],
 161834: ['50%', '50%'],
 172454: ['54%', '46%'],
 158112: ['43%', '57%'],
 167651: ['45%', '55%'],
 164192: ['55%', '45%'],
 171744: ['100%', '0%'],
 143422: ['25%', '75%'],
 130811: ['100%', '9%'],
 195980: nan,
 168749: ['41%', '59%'],
 142210: ['33%', '67%'],
 159892: ['44%', '56%'],
 164150: ['100%', '19%'],
 158191: ['50%', '50%'],
 194581: nan,
 142255: ['44%', '56%'],
 199262: nan,
 194416: ['40%', '20%'],
 190313: nan,
 179338: ['100%', '30%'],
 190244: [

In [96]:
# Fold the latest batch into the accumulated results; entries in
# audience_genders overwrite existing keys in crawled.
crawled.update(audience_genders)
crawled

{203096: ['23%', '19%'],
 17997: ['20%', '23%'],
 181925: ['58%', '42%'],
 195979: nan,
 138113: nan,
 197517: ['100%', '21%'],
 155356: ['36%', '64%'],
 168668: ['44%', '56%'],
 156023: nan,
 151155: ['25%', '22%'],
 143776: ['38%', '62%'],
 168735: ['22%', '78%'],
 142683: ['25%', '75%'],
 199742: nan,
 167675: ['100%', '24%'],
 151241: ['61%', '39%'],
 154667: ['36%', '64%'],
 164269: ['38%', '62%'],
 159366: ['26%', '74%'],
 162874: ['28%', '72%'],
 158180: ['45%', '55%'],
 167638: ['42%', '58%'],
 180209: ['37%', '63%'],
 161834: ['50%', '50%'],
 172454: ['54%', '46%'],
 158112: ['43%', '57%'],
 167651: ['45%', '55%'],
 164192: ['55%', '45%'],
 171744: ['100%', '0%'],
 143422: ['25%', '75%'],
 130811: ['100%', '9%'],
 195980: nan,
 168749: ['41%', '59%'],
 142210: ['33%', '67%'],
 159892: ['44%', '56%'],
 164150: ['100%', '19%'],
 158191: ['50%', '50%'],
 194581: nan,
 142255: ['44%', '56%'],
 199262: nan,
 194416: ['40%', '20%'],
 190313: nan,
 179338: ['100%', '30%'],
 190244: [

In [99]:
crawled[312662] = np.nan

In [100]:
len(crawled)

1824

In [90]:
code_list[1234]

312662

In [91]:
audience_df[audience_df.code == 312662]

Unnamed: 0,code,korean_title,num_of_audience,key_review,score_male,score_female,scores_per_age,num_per_genders
1234,312662,강대호,,,,,,


In [73]:
code = 159146
driver.get(BASE_URL + POINT_SEARCH + str(code))
time.sleep(1)
html = driver.page_source
bs = BeautifulSoup(html, 'html.parser')
temp_genders = bs.find_all('tspan')

In [74]:
temp_genders

[<tspan dy="4.666665842868824" style="-webkit-tap-highlight-color: rgba(0, 0, 0, 0);">38%</tspan>,
 <tspan dy="4.666664311550107" style="-webkit-tap-highlight-color: rgba(0, 0, 0, 0);">62%</tspan>]

In [104]:
audience_df = pd.read_csv('../../data/crawling_audience_review.csv')
audience_df

Unnamed: 0,code,korean_title,num_of_audience,key_review,score_male,score_female,scores_per_age,num_per_genders
0,203096,걸스 오브 막시,,,,,,
1,17997,첨밀밀,,,,,,
2,181925,클로젯,387,이 영화는 20대 여성이 좋아하는 연기가 뛰어난 영화입니다.,7.93,8.41,"[8.75, 7.75, 8.17, 8.79, 8.63]","['58%', '42%']"
3,195979,비밀경찰: 랍스터 캅,,,,,,
4,138113,매직 아워,,,,,,
...,...,...,...,...,...,...,...,...
1819,72043,쓰리 데이즈,,,,,,
1820,77566,브레이킹 던 part2,,,,,,
1821,38227,첫 키스만 50번째,177,이 영화는 20대 남성이 좋아하는 스토리가 뛰어난 영화입니다.,9.35,9.08,"[9.78, 9.08, 9.7, 9.0, 9.12]",
1822,74567,브레이킹 던 part1,,,,,,


In [52]:
code_list = audience_df.code.tolist()

In [54]:
len(code_list)

1824

In [60]:
# Gender-ratio labels keyed by movie code; NaN when the chart looks absent.
audience_genders = {}

driver = webdriver.Chrome(executable_path='./chromedriver')

# Trial run over the first ten codes only.
for code in code_list[:10]:
    driver.get(f"{BASE_URL}{POINT_SEARCH}{code!s}")
    # Pause before reading the DOM (presumably so the chart can render).
    time.sleep(1)
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    temp_genders = bs.find_all('tspan')
    if len(temp_genders) > 5:
        # The first two <tspan> texts are taken as the (male, female) labels.
        num_per_genders = [element.text for element in temp_genders[:2]]
    else:
        # Five or fewer <tspan> elements: print the code and record NaN.
        print(code)
        num_per_genders = np.nan
    audience_genders[code] = num_per_genders

203096
17997
195979
138113
156023
151155


In [82]:
audience_genders

{130267: ['100%'],
 159146: ['38%', '62%'],
 145376: ['28%', '72%'],
 160720: ['18%', '82%'],
 156930: ['50%', '50%'],
 159785: ['67%', '33%'],
 72058: ['12%', '88%'],
 35901: ['65%', '35%'],
 109905: ['58%', '42%'],
 74027: ['52%', '48%'],
 181030: ['58%', '42%'],
 144271: ['67%', '33%'],
 129012: ['48%', '52%'],
 103331: ['30%', '70%'],
 124212: ['55%', '45%'],
 67769: ['58%', '42%'],
 130041: ['29%', '71%'],
 136007: ['45%', '55%'],
 118917: ['46%', '54%'],
 109955: ['46%', '54%'],
 193839: ['63%', '37%'],
 104478: ['60%', '40%'],
 127865: ['54%', '46%'],
 130983: ['36%', '64%'],
 129331: ['42%', '58%'],
 129049: ['56%', '44%'],
 122469: ['66%', '34%'],
 192121: ['50%', '50%'],
 106557: ['34%', '66%'],
 129333: ['59%', '41%'],
 95541: ['50%', '50%'],
 172819: ['30%', '70%'],
 117802: ['60%', '40%'],
 122596: ['22%', '78%'],
 118980: ['33%', '67%'],
 90596: ['58%', '42%'],
 15899: ['36%', '64%'],
 127335: ['73%', '27%'],
 69956: ['64%', '36%'],
 107352: ['44%', '56%'],
 118950: ['51%

In [83]:
audience_df

Unnamed: 0,code,korean_title,num_of_audience,key_review,score_male,score_female,scores_per_age,num_per_genders
0,203096,걸스 오브 막시,,,,,,
1,17997,첨밀밀,,,,,,
2,181925,클로젯,387,이 영화는 20대 여성이 좋아하는 연기가 뛰어난 영화입니다.,7.93,8.41,"[8.75, 7.75, 8.17, 8.79, 8.63]","['58%', '42%']"
3,195979,비밀경찰: 랍스터 캅,,,,,,
4,138113,매직 아워,,,,,,
...,...,...,...,...,...,...,...,...
1819,72043,쓰리 데이즈,,,,,,
1820,77566,브레이킹 던 part2,,,,,,
1821,38227,첫 키스만 50번째,177,이 영화는 20대 남성이 좋아하는 스토리가 뛰어난 영화입니다.,9.35,9.08,"[9.78, 9.08, 9.7, 9.0, 9.12]",
1822,74567,브레이킹 던 part1,,,,,,


In [102]:
temp_df

Unnamed: 0,code,num_per_genders
0,72058,
1,10440,
2,35901,
3,150829,
4,152700,
...,...,...
584,72043,
585,77566,
586,38227,
587,74567,


In [110]:
# Turn the {code: gender labels} mapping into a two-column frame in one step.
# The values mix lists and NaN, so the column is built with an explicit
# object dtype rather than letting pandas/numpy infer an array from a ragged
# sequence.
temp_df = pd.DataFrame({
    'code': list(crawled.keys()),
    'num_per_genders': pd.Series(list(crawled.values()), dtype=object),
})

# Outer merge keeps every movie already in audience_df.
audience_df = pd.merge(audience_df, temp_df, how='outer', on='code')
audience_df

  return array(a, dtype, copy=False, order=order)


Unnamed: 0,code,korean_title,num_of_audience,key_review,score_male,score_female,scores_per_age,num_per_genders
0,203096,걸스 오브 막시,,,,,,"[23%, 19%]"
1,17997,첨밀밀,,,,,,"[20%, 23%]"
2,181925,클로젯,387,이 영화는 20대 여성이 좋아하는 연기가 뛰어난 영화입니다.,7.93,8.41,"[8.75, 7.75, 8.17, 8.79, 8.63]","[58%, 42%]"
3,195979,비밀경찰: 랍스터 캅,,,,,,
4,138113,매직 아워,,,,,,
...,...,...,...,...,...,...,...,...
1819,72043,쓰리 데이즈,,,,,,"[27%, 30%]"
1820,77566,브레이킹 던 part2,,,,,,"[20%, 18%]"
1821,38227,첫 키스만 50번째,177,이 영화는 20대 남성이 좋아하는 스토리가 뛰어난 영화입니다.,9.35,9.08,"[9.78, 9.08, 9.7, 9.0, 9.12]","[48%, 52%]"
1822,74567,브레이킹 던 part1,,,,,,"[18%, 22%]"


In [112]:
audience_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1824 entries, 0 to 1823
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   code             1824 non-null   int64  
 1   korean_title     1824 non-null   object 
 2   num_of_audience  594 non-null    object 
 3   key_review       594 non-null    object 
 4   score_male       594 non-null    float64
 5   score_female     594 non-null    float64
 6   scores_per_age   594 non-null    object 
 7   num_per_genders  1198 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 128.2+ KB


In [116]:
audience_df.to_csv('./crawled.csv', index=False)

In [117]:
df = pd.read_csv('./crawled.csv')

In [119]:
df.tail(30)

Unnamed: 0,code,korean_title,num_of_audience,key_review,score_male,score_female,scores_per_age,num_per_genders
1794,29102,에린 브로코비치,,,,,,"['20%', '36%']"
1795,46752,스텝 브라더스,,,,,,
1796,11300,비틀쥬스,,,,,,
1797,26677,경찰서를 털어라,,,,,,"['20%', '35%']"
1798,46753,하우스 버니,,,,,,
1799,87556,올드보이,16.0,이 영화는 30대 남성이 좋아하는 연기가 뛰어난 영화입니다.,6.82,5.0,"[0.0, 7.0, 5.67, 0.0, 0.0]","['69%', '31%']"
1800,30786,브링 잇 온,,,,,,"['18%', '20%']"
1801,74451,배드 티처,,,,,,"['14%', '59%']"
1802,82014,S.W.A.T.: 파이어 파이트,,,,,,
1803,40211,영화 이누야샤: 시대를 넘어선 마음,,,,,,


# 연습 공간

In [43]:
point_url = get_point_url(130267)
html_selector(point_url, '')

TypeError: html_selector() missing 1 required positional argument: 'query'

In [278]:
point_url = get_point_url(CODE)
netizen_elements = html_selector(point_url, '.grade_audience > .title_area.grade_tit > .sc_area > .star_score > .st_off > .st_on')
score = netizen_elements[0].attrs['style'].split(':')[1]
num_netizen = html_selector(point_url, '.grade_audience > .title_area.grade_tit > .sc_area > .user_count > em')[0].text

In [281]:
netizen_male = html_selector(point_url, '.grp_gender > .grp_box > .grp_male > .graph_point')[1].text
netizen_female = html_selector(point_url, '.grp_gender > .grp_box > .grp_female > .graph_point')[1].text

In [291]:
score_list = [float(element.text) for element in html_selector(point_url, '.grp_age > .grp_box > .graph_point')[5:]]

In [288]:
def get_netizen_info(code):
    """Scrape the netizen rating summary of one movie from its rating page.

    Args:
        code: Movie code used to build the rating-page URL.

    Returns:
        The string ``'error'`` when the page cannot be fetched or has no
        netizen star score. Otherwise a dict with ``code``,
        ``num_of_netizen``, ``key_review`` (``None`` when the page has no
        review summary), ``score_male``, ``score_female`` and
        ``scores_per_age`` (list of floats).
    """
    try:
        # Parse the page once and reuse it for every selector below.
        soup = BeautifulSoup(get_point_url(code), 'html.parser')
        netizen_elements = soup.select('.grade_netizen > .title_area.grade_tit > .sc_area > .star_score > .st_off > .st_on')
        # The score itself is not returned; the lookup only verifies that a
        # netizen star score exists.
        _ = netizen_elements[0].attrs['style'].split(':')[1]
    except Exception:
        # Fetch and parse failures are both reported as 'error'. Unlike a
        # bare ``except:``, this does not swallow KeyboardInterrupt.
        return 'error'

    num_netizen = soup.select('.grade_netizen > .title_area.grade_tit > .sc_area > .user_count > em')[0].text

    # Pages without a review summary used to raise IndexError here.
    reviews = soup.select('.grp_review')
    key_review = reviews[0].text if reviews else None

    netizen_male = soup.select('.grp_gender > .grp_box > .grp_male > .graph_point')[0].text
    netizen_female = soup.select('.grp_gender > .grp_box > .grp_female > .graph_point')[0].text
    score_list = [float(element.text) for element in soup.select('.grp_age > .grp_box > .graph_point')[:5]]

    return {
        'code': code,
        'num_of_netizen': num_netizen,
        'key_review': key_review,
        'score_male': netizen_male,
        'score_female': netizen_female,
        'scores_per_age': score_list,
    }

In [66]:
point_url = html_parser(BASE_URL + POINT_SEARCH + str(31307))
netizen_elements = html_selector(point_url, '.grade_netizen > .title_area.grade_tit > .sc_area > .star_score > .st_off > .st_on')
# netizen_score = html_selector(netizen_elements, '.star_score > .st_off > .st_on')
score = netizen_elements[0].attrs['style'].split(':')[1]
num_netizen = html_selector(point_url, '.grade_netizen > .title_area.grade_tit > .sc_area > .user_count > em')[0].text

'1,747'

In [70]:
key_review = html_selector(point_url, '.grp_review')[0].text

In [84]:
netizen_male = html_selector(point_url, '.grp_gender > .grp_box > .grp_male > .graph_point')[0].text
netizen_female = html_selector(point_url, '.grp_gender > .grp_box > .grp_female > .graph_point')[0].text

In [88]:
score_list = [float(element.text) for element in html_selector(point_url, '.grp_age > .grp_box > .graph_point')]
score_list

[9.04, 8.93, 9.07, 8.78, 8.73]

In [18]:
# example: google search
# driver.get("http://www.google.com")
# input_element = driver.find_element_by_name("q")
# input_element.send_keys("python")
# input_element.submit()

In [42]:
search_title = 'kingsman'
search_html = html_parser(BASE_URL + SEARCH + search_title)
search_result = html_selector(search_html, '.search_list_1 > li')
result = str(search_result[0].select('dl > dt > a'))
movie_url = result.split("href=")[1].split(">")[0].replace('"', '')
code = movie_url.split('=')[1]

basic_html = html_parser(BASE_URL + movie_url)

search_result = html_selector(basic_html, 'h3.h_movie > a')
title = str(search_result[0]).split('>')[1].split('<')[0]

movie_result = html_selector(basic_html, 'dl.info_spec > dd')
temp_summary = movie_result[0].select('a')
director = movie_result[1].text
actors = movie_result[2].text.split(',')

In [43]:
print(code, title, temp_summary, director, actors)

159893 킹스맨: 퍼스트 에이전트 [<a href="/movie/sdb/browsing/bmovie.naver?genre=19">액션</a>, <a href="/movie/sdb/browsing/bmovie.naver?nation=GB">영국</a>, <a href="/movie/sdb/browsing/bmovie.naver?nation=US">미국</a>, <a href="/movie/sdb/browsing/bmovie.naver?open=2021"> 2021</a>, <a href="/movie/sdb/browsing/bmovie.naver?open=20211222">.12.22</a>] 매튜 본 ['랄프 파인즈(옥스포드 공작)', ' 해리스 딕킨슨(콘래드)더보기']


In [37]:
genre = list()
nation = list()
released = list()

In [45]:
# Sort each summary link into genre / nation / release-date lists according to
# the query-parameter name in its href.
# NOTE(review): genre, nation and released are created in a separate cell and
# are not reset here, so re-running this cell appends duplicates.
for summary in temp_summary:
    c = summary.attrs['href'].split(".naver?")[1].split('=')[0]
    if c == 'genre':
        genre.append(summary.text)
    elif c == 'nation':
        nation.append(summary.text)
    elif c == 'open':
        released.append(summary.text)
    else:
        # Unknown category: print it so it can be inspected.
        print(c)

In [46]:
genre, nation, released

(['액션', '모험', 'SF', '액션', '액션'],
 ['미국', '영국', '미국', '영국', '미국'],
 [' 2021', '.12.15', ' 2021', '.12.22', ' 2021', '.12.22'])

In [57]:
''.join(released)

' 2021.12.15 2021.12.22 2021.12.22'

In [41]:
movie_df

Unnamed: 0,code,korean_title,english_title,genre_list,country,runtime,released_date,director,actors


In [68]:
# Scratch: fetch the search-result page for a fixed query and stringify the
# links of the first hit.
# NOTE(review): this rebinds the module-level BASE_URL, which get_point_url and
# get_movie_info concatenate paths onto; re-run the constants cell before
# calling them again.
BASE_URL = "https://movie.naver.com/movie/search/result.naver?query=스파이더맨"
req = requests.get(BASE_URL)
html = req.text
soup = BeautifulSoup(html, 'html.parser')
# Find the HTML elements with a CSS selector.
search_result_list = soup.select(
    '.search_list_1 > li'
    )
temp = str(search_result_list[0].select('dl > dt > a'))

In [79]:
temp.split("href=")[1].split(">")[0].replace('"', '')

'/movie/bi/mi/basic.naver?code=208077'

In [None]:
def search_naver_movie(title):
    """Search for *title* and download the detail page of the first hit.

    Draft helper: the detail page is fetched but nothing is extracted from
    it, and the function returns ``None``. Raises ``IndexError`` when the
    search has no hit or the first hit has no link.
    """
    search_page = BeautifulSoup(requests.get(BASE_URL + SEARCH + title).text, 'html.parser')
    first_hit = search_page.select('.search_list_1 > li')[0]
    # Pull the href out of the stringified anchor list of the first hit.
    anchors = str(first_hit.select('dl > dt > a'))
    movie_url = anchors.split("href=")[1].split(">")[0].replace('"', '')

    # Downloaded but not used yet.
    detail_html = requests.get(BASE_URL + movie_url).text

In [97]:
# movie_data_set = {'code': 0, 'title': '', 'genre_list': [], 'country': [], 'runtime': '', 'released_date': '', 'director': '', 'actors': []}

In [125]:
movie_df # -> korean title, english title

Unnamed: 0,code,korean_title,english_title,genre_list,country,runtime,released_date,director,actors


In [127]:
netizen_df

Unnamed: 0,code,korean_title,num_of_netizen,key-review,score_male,score_female,scores_per_age


In [128]:
audience_df # -> 영민님이 말씀하신 부분 추가

Unnamed: 0,code,korean_title,num_of_netizen,key-review,score_male,score_female,scores_per_age


In [174]:
temp_summary[0]

<a href="/movie/sdb/browsing/bmovie.naver?genre=19">액션</a>

In [162]:
list(map(lambda x: x.text, list(temp_summary)))

['액션', '모험', 'SF', '미국', ' 2021', '.12.15']

In [59]:
# Scratch: show the title link of the first search hit for a fixed query.
# NOTE(review): this rebinds the module-level BASE_URL, which get_point_url and
# get_movie_info concatenate paths onto; re-run the constants cell before
# calling them again.
BASE_URL = "https://movie.naver.com/movie/search/result.naver?query=트랜스포머"
req = requests.get(BASE_URL)
html = req.text
soup = BeautifulSoup(html, 'html.parser')
# Find the HTML elements with a CSS selector.
my_titles = soup.select(
    '.search_list_1 > li'
    )
my_titles[0].select('dl > dt > a')

[<a href="/movie/bi/mi/basic.naver?code=123630"><strong>트랜스포머</strong>: 최후의 기사 (Transformers: The Last Knight)</a>]

In [60]:
# Scratch: show the title link of the first search hit for a fixed query.
# NOTE(review): this rebinds the module-level BASE_URL, which get_point_url and
# get_movie_info concatenate paths onto; re-run the constants cell before
# calling them again.
BASE_URL = "https://movie.naver.com/movie/search/result.naver?query=시월애"
req = requests.get(BASE_URL)
html = req.text
soup = BeautifulSoup(html, 'html.parser')
# Find the HTML elements with a CSS selector.
my_titles = soup.select(
    '.search_list_1 > li'
    )
my_titles[0].select('dl > dt > a')

[<a href="/movie/bi/mi/basic.naver?code=30306"><strong>시월애</strong> (時越愛)</a>]

In [35]:
driver = webdriver.Chrome(executable_path='./chromedriver')
driver.get(BASE_URL)
input_element = driver.find_element_by_class_name("search_list_1")
print(input_element.text)

스파이더맨: 노 웨이 홈 (Spider-Man: No Way Home)
9.02 (참여 14650명)
액션, 모험, SF| 미국|148분 |2021
감독 : 존 왓츠|출연 : 톰 홀랜드, 젠데이아 콜먼, 베네딕트 컴버배치, 존 파브로, 제이콥 배덜런, 마리사 토메이, 알프리드 몰리나
스파이더맨: 뉴 유니버스 (Spider-Man: Into the Spider-Verse)
9.21 (참여 5785명)
애니메이션, 액션, 가족| 미국|117분 |2018
감독 : 밥 퍼시케티, 피터 램지, 로드니 로스맨|출연 : 샤메익 무어, 헤일리 스테인펠드, 니콜라스 케이지, 제이크 존슨, 리브 슈라이버, 마허샬라 알리, 브라이언 타이리 헨리
스파이더맨 (Spider-Man)
8.95 (참여 1741명)
액션, SF, 모험, 스릴러| 미국|121분 |2002
감독 : 샘 레이미|출연 : 토비 맥과이어, 윌렘 대포, 커스틴 던스트, 제임스 프랭코, 클리프 로버트슨, 로즈마리 해리스, J.K. 시몬스
스파이더맨 2 (Spider-Man 2)
8.80 (참여 2600명)
액션, 모험, 범죄, 멜로/로맨스, SF| 미국|126분 |2004
감독 : 샘 레이미|출연 : 토비 맥과이어, 커스틴 던스트
스파이더맨: 홈커밍 (Spider-Man: Homecoming)
8.36 (참여 18814명)
액션, 모험, SF| 미국|133분 |2017
감독 : 존 왓츠|출연 : 톰 홀랜드, 마이클 키튼


In [None]:
def search_naver_movie(title):
    """Fetch a fixed search-result page and touch its first hit.

    Draft helper: *title* is currently ignored (the query is hard-coded) and
    the function returns ``None``. Raises ``IndexError`` when the search
    returns no hit.
    """
    search_url = "https://movie.naver.com/movie/search/result.naver?query=스파이더맨"
    page = BeautifulSoup(requests.get(search_url).text, 'html.parser')
    # Find the result items with a CSS selector.
    hits = page.select('.search_list_1 > li')
    # Kept from the draft: indexing the first hit has no effect besides
    # raising when there are no results.
    hits[0]