In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pypapago import Translator

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KDH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
cur_movie = pd.read_csv('../data/current_movie.csv',index_col =0)
cur_movie_review = pd.read_csv('../data/current_movie_review.csv', index_col=0)

In [15]:
def get_tags_to_txt(text):
    if type(text) != str:
        return ''
    text = word_tokenize(text)
    tags_dump = nltk.pos_tag(text)
    tags = []
    
    for word in tags_dump:
        if (word[1] == 'JJ') and (word[0] not in stop_words):
            tags.append(delete_punctuation_marks_tag(word[0]))
            
    tags = ','.join(tags)
    
    return tags

def delete_punctuation_marks_tag(word):
    word = word.replace(",","")
    word = word.replace(".","")
    word = word.replace(",","")
    word = word.replace("!","")
    word = word.replace(":","")
    word = word.replace("?","")
    word = word.replace("~","")
    word = word.replace("\\","")
    word = word.replace("\"","")
    word = word.replace(";","")
    return word.lower()

In [4]:
cur_movie_review.reset_index(drop=True,inplace=True)
ts = Translator()
cur_movie_review['review_eng'] = ''

#translating review
for id in tqdm(cur_movie_review.index):
    review = cur_movie_review.loc[id,'review']
    
    if type(review)==float:
        continue

    review_pre = delete_punctuation_marks_review(review)

    try:
        review_ts = ts.translate(review_pre, source='ko', target='en', verbose=False)
    except:
        review_ts = ''
        
    cur_movie_review.loc[id,'review_eng'] = review_ts

100%|██████████████████████████████████████████████████████████████████████████| 11836/11836 [1:52:09<00:00,  1.76it/s]


In [5]:
cur_movie_review.head()

Unnamed: 0,mid,user,rate,review,review_eng
0,0,허현(heoh****),6,평점 알바 특. 내용얘기보다 배우 연기 좋단말만 주구장창함...,a special grade part-time job He keeps saying...
1,0,황금마차(goga****),1,평점 알바풀었나...,Did you finish your part-time job?
2,0,케구리(jhw6****),2,배우들의 연기만 좋았음 하지만 단점이 확실히 보인다. 영화 초반부 사건이 터지고 주...,"I just hope the actors' acting is good, but I ..."
3,0,깁스(volk****),1,실망 뻔한스토리에 억지스럼,"A story that could have been disappointing, so..."
4,0,달고나(mjh5****),1,뻔한 신파극 평점알바풀었나 배우들 연기만 좋음,I'm not sure if I'm not sure if I'm talking ab...


In [16]:
for i in tqdm(cur_movie_review.index):
    cur_movie_review.loc[i,'tags'] = get_tags_to_txt(cur_movie_review.loc[i,'review_eng'])

100%|███████████████████████████████████████████████████████████████████████████| 11836/11836 [00:50<00:00, 233.80it/s]


In [28]:
cur_movie_review.head()

Unnamed: 0,mid,user,rate,review,review_eng,tags
0,0,허현(heoh****),6,평점 알바 특. 내용얘기보다 배우 연기 좋단말만 주구장창함...,a special grade part-time job He keeps saying...,"special,part-time"
1,0,황금마차(goga****),1,평점 알바풀었나...,Did you finish your part-time job?,part-time
2,0,케구리(jhw6****),2,배우들의 연기만 좋았음 하지만 단점이 확실히 보인다. 영화 초반부 사건이 터지고 주...,"I just hope the actors' acting is good, but I ...","good,main,forgotten,great,sin-pa,whole,short,w..."
3,0,깁스(volk****),1,실망 뻔한스토리에 억지스럼,"A story that could have been disappointing, so...","disappointing,forced"
4,0,달고나(mjh5****),1,뻔한 신파극 평점알바풀었나 배우들 연기만 좋음,I'm not sure if I'm not sure if I'm talking ab...,"sure,sure,new"


In [18]:
cur_movie

Unnamed: 0,name,url
0,결백,https://movie.naver.com/movie/bi/mi/basic.nhn?...
1,온워드: 단 하루의 기적,https://movie.naver.com/movie/bi/mi/basic.nhn?...
2,사라진 시간,https://movie.naver.com/movie/bi/mi/basic.nhn?...
3,침입자,https://movie.naver.com/movie/bi/mi/basic.nhn?...
4,슈퍼스타 뚜루,https://movie.naver.com/movie/bi/mi/basic.nhn?...
...,...,...
149,창문넘어 도망친 100세 노인,https://movie.naver.com/movie/bi/mi/basic.nhn?...
150,첫사랑,https://movie.naver.com/movie/bi/mi/basic.nhn?...
151,캐리,https://movie.naver.com/movie/bi/mi/basic.nhn?...
152,프랭크,https://movie.naver.com/movie/bi/mi/basic.nhn?...


In [42]:
movies = pd.read_csv('../data/movie_watcha_info.csv', index_col=0)

In [43]:
movies

Unnamed: 0,id,title,title_ko,rate,year,country,genre,running_time,contents
0,0,김씨 표류기,김씨 표류기,4,2009,한국,드라마/코미디,1시간 56분,자살시도가 실패로 끝나 한강의 밤섬에 불시착한 남자. 죽는 것도 쉽지 않자 일단 섬...
1,1,The Girl on the Train,걸 온 더 트레인,2.5,2016,미국,범죄/드라마/미스터리/스릴러,1시간 52분,이혼 후 알코올중독에 빠져 심각한 블랙아웃 (음주 후 필름 끊기는 현상) 을 겪고 ...
2,2,Ένας Άλλος Κόσμος,"나의 사랑, 그리스",4,2015,그리스,드라마/로맨스,1시간 54분,그리스를 배경으로 펼쳐지는 각기 다른 세대 세 커플의 사랑 이야기. [부메랑] 여대...
3,3,LA CARA OCULTA,히든 페이스,3,2011,"콜롬비아,스페인",코미디/드라마/미스터리/스릴러,1시간 37분,오케스트라 지휘자 아드리안은 1년간 콜롬비아의 보고타 필하모닉의 지휘자로 일하게되어...
4,4,Deepwater Horizon,딥워터 호라이즌,3.5,2016,"홍콩,미국",액션/드라마/역사/스릴러/재난,1시간 47분,"2010년 4월 20일, 미국 루이지애나주 앞바다 멕시코만 석유 시추선 ‘딥워터 호..."
...,...,...,...,...,...,...,...,...,...
666,666,Pirates of the Caribbean: The Curse of the Bla...,캐리비안의 해적 - 블랙 펄의 저주,3,2003,미국,액션/판타지/모험,2시간 23분,매력 넘치는 해적 캡틴 잭 스패로우(죠니 뎁 분)에게 수정처럼 맑고 투명한 카리브 ...
667,667,Thor,토르: 천둥의 신,3,2011,미국,액션/모험/판타지/SF,1시간 52분,신의 세계 ‘아스가르드’의 후계자로 강력한 파워를 지닌 천둥의 신 ‘토르’. 평소 ...
668,668,不能説的・秘密,말할 수 없는 비밀,3,2007,대만,로맨스/판타지/드라마,1시간 41분,예술학교로 전학 온 상륜(주걸륜)은 아버지의 영향을 받아 피아노에 천부적인 소질을 ...
669,669,Harry Potter and the Prisoner of Azkaban,해리포터와 아즈카반의 죄수,4,2004,"영국,미국",판타지/미스터리/가족/모험/액션,2시간 21분,13세가 된 해리 포터(다니엘 래드클리프)는 또 한번의 여름 방학을 이모 가족인 더...


In [44]:
tmp = []
for cm in cur_movie['name']:
    if cm in set(movies['title_ko']):
        continue
    else:
        tmp.append(cm)

In [45]:
len(tmp)

142

In [46]:
new_movie = pd.DataFrame({"title_ko":tmp})
movies = movies.append(new_movie, ignore_index=True)

In [47]:
movies['id'] = movies.index
movies

Unnamed: 0,contents,country,genre,id,rate,running_time,title,title_ko,year
0,자살시도가 실패로 끝나 한강의 밤섬에 불시착한 남자. 죽는 것도 쉽지 않자 일단 섬...,한국,드라마/코미디,0,4,1시간 56분,김씨 표류기,김씨 표류기,2009.0
1,이혼 후 알코올중독에 빠져 심각한 블랙아웃 (음주 후 필름 끊기는 현상) 을 겪고 ...,미국,범죄/드라마/미스터리/스릴러,1,2.5,1시간 52분,The Girl on the Train,걸 온 더 트레인,2016.0
2,그리스를 배경으로 펼쳐지는 각기 다른 세대 세 커플의 사랑 이야기. [부메랑] 여대...,그리스,드라마/로맨스,2,4,1시간 54분,Ένας Άλλος Κόσμος,"나의 사랑, 그리스",2015.0
3,오케스트라 지휘자 아드리안은 1년간 콜롬비아의 보고타 필하모닉의 지휘자로 일하게되어...,"콜롬비아,스페인",코미디/드라마/미스터리/스릴러,3,3,1시간 37분,LA CARA OCULTA,히든 페이스,2011.0
4,"2010년 4월 20일, 미국 루이지애나주 앞바다 멕시코만 석유 시추선 ‘딥워터 호...","홍콩,미국",액션/드라마/역사/스릴러/재난,4,3.5,1시간 47분,Deepwater Horizon,딥워터 호라이즌,2016.0
...,...,...,...,...,...,...,...,...,...
807,,,,807,,,,창문넘어 도망친 100세 노인,
808,,,,808,,,,첫사랑,
809,,,,809,,,,캐리,
810,,,,810,,,,프랭크,


In [52]:
cur_movie['tags'] = ''
for num in cur_movie_review['mid'].unique():
    tags = ''
    tmp = cur_movie_review[cur_movie_review['mid']==num]
    for tag in tmp['tags']:
        tags+=tag +','
    cur_movie.loc[num,'tags'] = tags

In [90]:
tmp = movies[movies['title'].isna()][['id','title_ko']]
tmp

Unnamed: 0,id,title_ko
670,670,결백
671,671,온워드: 단 하루의 기적
672,672,사라진 시간
673,673,침입자
674,674,슈퍼스타 뚜루
...,...,...
807,807,창문넘어 도망친 100세 노인
808,808,첫사랑
809,809,캐리
810,810,프랭크


In [91]:
tmp = tmp.rename(columns = {'title_ko':'name'})

In [92]:
tmp

Unnamed: 0,id,name
670,670,결백
671,671,온워드: 단 하루의 기적
672,672,사라진 시간
673,673,침입자
674,674,슈퍼스타 뚜루
...,...,...
807,807,창문넘어 도망친 100세 노인
808,808,첫사랑
809,809,캐리
810,810,프랭크


In [93]:
cur_movie

Unnamed: 0,name,url,tags
0,결백,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"special,part-time,part-time,good,main,forgotte..."
1,온워드: 단 하루의 기적,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"brilliant,much,emotional,alive,visual,unsympat..."
2,사라진 시간,https://movie.naver.com/movie/bi/mi/basic.nhn?...,",precious,precious,main,,,next,,curious,compli..."
3,침입자,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"weird,good,good,interesting,obvious,running,lo..."
4,슈퍼스타 뚜루,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"worth,healthy,good,,funny,exciting,awesome,ele..."
...,...,...,...
149,창문넘어 도망친 100세 노인,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"much,funny,non-serious,funny,charming,turbulen..."
150,첫사랑,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"precious,personal,first,unforgettable,i,sad,lo..."
151,캐리,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"fast-paced,first,intense,blatant,miserable,goo..."
152,프랭크,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"wrong,,long,real,good,popular,,,real,music-mad..."


In [94]:
tmp = pd.merge(cur_movie, tmp, on='name').reset_index(drop=True)
tmp['id'] = [i for i in range(670,812)]
tmp

Unnamed: 0,name,url,tags,id
0,결백,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"special,part-time,part-time,good,main,forgotte...",670
1,온워드: 단 하루의 기적,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"brilliant,much,emotional,alive,visual,unsympat...",671
2,사라진 시간,https://movie.naver.com/movie/bi/mi/basic.nhn?...,",precious,precious,main,,,next,,curious,compli...",672
3,침입자,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"weird,good,good,interesting,obvious,running,lo...",673
4,슈퍼스타 뚜루,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"worth,healthy,good,,funny,exciting,awesome,ele...",674
...,...,...,...,...
137,창문넘어 도망친 100세 노인,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"much,funny,non-serious,funny,charming,turbulen...",807
138,첫사랑,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"precious,personal,first,unforgettable,i,sad,lo...",808
139,캐리,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"fast-paced,first,intense,blatant,miserable,goo...",809
140,프랭크,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"wrong,,long,real,good,popular,,,real,music-mad...",810


In [95]:
tmp.set_index('id',drop=True, inplace=True)
tmp

Unnamed: 0_level_0,name,url,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
670,결백,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"special,part-time,part-time,good,main,forgotte..."
671,온워드: 단 하루의 기적,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"brilliant,much,emotional,alive,visual,unsympat..."
672,사라진 시간,https://movie.naver.com/movie/bi/mi/basic.nhn?...,",precious,precious,main,,,next,,curious,compli..."
673,침입자,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"weird,good,good,interesting,obvious,running,lo..."
674,슈퍼스타 뚜루,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"worth,healthy,good,,funny,exciting,awesome,ele..."
...,...,...,...
807,창문넘어 도망친 100세 노인,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"much,funny,non-serious,funny,charming,turbulen..."
808,첫사랑,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"precious,personal,first,unforgettable,i,sad,lo..."
809,캐리,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"fast-paced,first,intense,blatant,miserable,goo..."
810,프랭크,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"wrong,,long,real,good,popular,,,real,music-mad..."


In [97]:
tmp.to_csv('../data/current_movie.csv')

In [118]:
clusters = pd.read_csv('../data/movie_cluster.csv')
movie_matrix = pd.read_csv('../data/movie_matrix.csv',index_col=0)

In [119]:
movie_matrix

Unnamed: 0_level_0,group_74,group_0,group_1,group_2,group_3,group_4,group_5,group_6,group_7,group_8,...,group_90,group_91,group_92,group_93,group_94,group_95,group_96,group_97,group_98,group_99
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,0,0,1,0,1,2,1,7,1,...,0,0,2,0,0,3,0,0,2,1
1,2,0,1,3,0,1,4,4,12,0,...,0,0,0,0,0,5,0,1,2,0
2,4,1,0,1,2,0,4,1,12,0,...,0,0,0,2,0,0,0,1,2,0
3,3,0,0,2,0,0,3,2,2,1,...,0,0,0,1,0,3,0,0,1,0
4,1,0,1,0,0,0,5,0,10,1,...,0,0,0,0,0,1,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,2,0,0,1,0,0,3,2,9,0,...,0,0,1,1,0,1,0,0,1,0
666,3,0,0,0,0,0,1,0,1,0,...,0,0,0,5,0,3,0,0,3,1
667,0,0,0,2,0,1,2,5,6,0,...,0,0,0,2,0,0,0,0,1,0
668,2,0,2,0,0,0,0,0,13,2,...,0,0,1,1,0,2,0,0,2,0


In [120]:
for i in range(len(tmp)):
    movie_matrix = movie_matrix.append(pd.Series(), ignore_index=True)
movie_matrix.fillna(0, inplace=True)

In [121]:
from collections import Counter

for id in tqdm(tmp.index):
    tags = Counter(tmp.loc[id,'tags'].split(','))
    for tag in tags.keys():
        for num in range(0,100):
            if tag in set(clusters['group_'+str(num)]):
                movie_matrix.loc[id,'group_'+str(num)] += tags[tag]


  0%|                                                                                          | 0/142 [00:00<?, ?it/s]
  1%|▌                                                                                 | 1/142 [00:02<06:44,  2.87s/it]
  1%|█▏                                                                                | 2/142 [00:03<05:19,  2.28s/it]
  2%|█▋                                                                                | 3/142 [00:04<04:25,  1.91s/it]
  3%|██▎                                                                               | 4/142 [00:05<03:26,  1.50s/it]
  4%|██▉                                                                               | 5/142 [00:05<02:27,  1.08s/it]
  4%|███▍                                                                              | 6/142 [00:05<01:47,  1.27it/s]
  5%|████                                                                              | 7/142 [00:06<01:40,  1.35it/s]
  6%|████▌                             

In [122]:
movie_matrix

Unnamed: 0,group_74,group_0,group_1,group_2,group_3,group_4,group_5,group_6,group_7,group_8,...,group_90,group_91,group_92,group_93,group_94,group_95,group_96,group_97,group_98,group_99
0,3.0,0.0,0.0,1.0,0.0,1.0,2.0,1.0,7.0,1.0,...,0.0,0.0,2.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0
1,2.0,0.0,1.0,3.0,0.0,1.0,4.0,4.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,2.0,0.0
2,4.0,1.0,0.0,1.0,2.0,0.0,4.0,1.0,12.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0
3,3.0,0.0,0.0,2.0,0.0,0.0,3.0,2.0,2.0,1.0,...,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,10.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,1.0,0.0,0.0,0.0,0.0,0.0,4.0,6.0,6.0,2.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0
808,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
809,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,14.0,0.0,...,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,3.0,0.0
810,4.0,0.0,2.0,1.0,0.0,0.0,6.0,1.0,15.0,0.0,...,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,4.0,0.0


In [123]:
movie_matrix.to_csv('../data/movie_matrix.csv')

In [9]:
score_board = pd.read_csv('../data/score_board.csv')
raw_cur_movie = pd.read_csv('../data/current_movie_raw.csv',index_col=0)
score_board

Unnamed: 0,u_id,p_id,rating
0,*2,121,4.5
1,-채여니영화평-,287,4.0
2,-채여니영화평-,414,3.5
3,-채여니영화평-,271,3.5
4,-채여니영화평-,264,5.0
...,...,...,...
29793,dpwls258,286,
29794,쥬,286,
29795,쥬,205,
29796,쥬,166,


In [10]:
raw_cur_movie

Unnamed: 0,name,url
0,결백,https://movie.naver.com/movie/bi/mi/basic.nhn?...
1,온워드: 단 하루의 기적,https://movie.naver.com/movie/bi/mi/basic.nhn?...
2,사라진 시간,https://movie.naver.com/movie/bi/mi/basic.nhn?...
3,침입자,https://movie.naver.com/movie/bi/mi/basic.nhn?...
4,슈퍼스타 뚜루,https://movie.naver.com/movie/bi/mi/basic.nhn?...
...,...,...
149,창문넘어 도망친 100세 노인,https://movie.naver.com/movie/bi/mi/basic.nhn?...
150,첫사랑,https://movie.naver.com/movie/bi/mi/basic.nhn?...
151,캐리,https://movie.naver.com/movie/bi/mi/basic.nhn?...
152,프랭크,https://movie.naver.com/movie/bi/mi/basic.nhn?...


In [15]:
del raw_cur_movie['url']
raw_cur_movie['raw_id'] = raw_cur_movie.index

In [17]:
new_df = pd.merge(cur_movie, raw_cur_movie, on='name', how='inner')
new_df.index = [i for i in range(670,812)]
new_df

Unnamed: 0,name,url,tags,raw_id
670,결백,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"special,part-time,part-time,good,main,forgotte...",0
671,온워드: 단 하루의 기적,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"brilliant,much,emotional,alive,visual,unsympat...",1
672,사라진 시간,https://movie.naver.com/movie/bi/mi/basic.nhn?...,",precious,precious,main,,,next,,curious,compli...",2
673,침입자,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"weird,good,good,interesting,obvious,running,lo...",3
674,슈퍼스타 뚜루,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"worth,healthy,good,,funny,exciting,awesome,ele...",4
...,...,...,...,...
807,창문넘어 도망친 100세 노인,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"much,funny,non-serious,funny,charming,turbulen...",149
808,첫사랑,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"precious,personal,first,unforgettable,i,sad,lo...",150
809,캐리,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"fast-paced,first,intense,blatant,miserable,goo...",151
810,프랭크,https://movie.naver.com/movie/bi/mi/basic.nhn?...,"wrong,,long,real,good,popular,,,real,music-mad...",152


In [19]:
change = dict()
for idx in new_df.index:
    change[new_df.loc[idx,'raw_id']] = idx

In [5]:
cur_movie_review

Unnamed: 0,mid,user,rate,review
0,0,허현(heoh****),6,평점 알바 특. 내용얘기보다 배우 연기 좋단말만 주구장창함...
1,0,황금마차(goga****),1,평점 알바풀었나...
2,0,케구리(jhw6****),2,배우들의 연기만 좋았음 하지만 단점이 확실히 보인다. 영화 초반부 사건이 터지고 주...
3,0,깁스(volk****),1,실망 뻔한스토리에 억지스럼
4,0,달고나(mjh5****),1,뻔한 신파극 평점알바풀었나 배우들 연기만 좋음
...,...,...,...,...
5,153,안경잡이(ayan****),8,초등학교때 이 영화 보고 1주일을 앓아 누웠던 기억이..ㅎㄷㄷ 그리고 시간이 많이 ...
6,153,개럭(jtot****),9,처음에는 얼굴 가리면서 봤지만 어느새 적응하고 있는 내 모습.
7,153,뉴존(newz****),10,지금 생각해도 당시 획기적인 아이디어에 과학의 신선함과 공포를 한번에 느끼게해준 ...
8,153,무탈 Watanabe(yaso****),8,감독 특유의 기괴한 상상. 그래도 요즘 웬만한 스릴러 영화보다 훨씬 낫다.


In [21]:
tmp = []
for i in cur_movie_review['mid']:
    try:
        tmp.append(change[i])
    except:
        tmp.append(999)
cur_movie_review['new_id'] = tmp

In [22]:
cur_movie_review

Unnamed: 0,mid,user,rate,review,new_id
0,0,허현(heoh****),6,평점 알바 특. 내용얘기보다 배우 연기 좋단말만 주구장창함...,670
1,0,황금마차(goga****),1,평점 알바풀었나...,670
2,0,케구리(jhw6****),2,배우들의 연기만 좋았음 하지만 단점이 확실히 보인다. 영화 초반부 사건이 터지고 주...,670
3,0,깁스(volk****),1,실망 뻔한스토리에 억지스럼,670
4,0,달고나(mjh5****),1,뻔한 신파극 평점알바풀었나 배우들 연기만 좋음,670
...,...,...,...,...,...
5,153,안경잡이(ayan****),8,초등학교때 이 영화 보고 1주일을 앓아 누웠던 기억이..ㅎㄷㄷ 그리고 시간이 많이 ...,811
6,153,개럭(jtot****),9,처음에는 얼굴 가리면서 봤지만 어느새 적응하고 있는 내 모습.,811
7,153,뉴존(newz****),10,지금 생각해도 당시 획기적인 아이디어에 과학의 신선함과 공포를 한번에 느끼게해준 ...,811
8,153,무탈 Watanabe(yaso****),8,감독 특유의 기괴한 상상. 그래도 요즘 웬만한 스릴러 영화보다 훨씬 낫다.,811


In [23]:
cur_movie_review = cur_movie_review[cur_movie_review['new_id']<999]
cur_movie_review

Unnamed: 0,mid,user,rate,review,new_id
0,0,허현(heoh****),6,평점 알바 특. 내용얘기보다 배우 연기 좋단말만 주구장창함...,670
1,0,황금마차(goga****),1,평점 알바풀었나...,670
2,0,케구리(jhw6****),2,배우들의 연기만 좋았음 하지만 단점이 확실히 보인다. 영화 초반부 사건이 터지고 주...,670
3,0,깁스(volk****),1,실망 뻔한스토리에 억지스럼,670
4,0,달고나(mjh5****),1,뻔한 신파극 평점알바풀었나 배우들 연기만 좋음,670
...,...,...,...,...,...
5,153,안경잡이(ayan****),8,초등학교때 이 영화 보고 1주일을 앓아 누웠던 기억이..ㅎㄷㄷ 그리고 시간이 많이 ...,811
6,153,개럭(jtot****),9,처음에는 얼굴 가리면서 봤지만 어느새 적응하고 있는 내 모습.,811
7,153,뉴존(newz****),10,지금 생각해도 당시 획기적인 아이디어에 과학의 신선함과 공포를 한번에 느끼게해준 ...,811
8,153,무탈 Watanabe(yaso****),8,감독 특유의 기괴한 상상. 그래도 요즘 웬만한 스릴러 영화보다 훨씬 낫다.,811


In [24]:
del cur_movie_review['mid']

In [25]:
cur_movie_review.rename(columns={'new_id':'p_id', 'user':'u_id'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [26]:
cur_movie_review

Unnamed: 0,u_id,rate,review,p_id
0,허현(heoh****),6,평점 알바 특. 내용얘기보다 배우 연기 좋단말만 주구장창함...,670
1,황금마차(goga****),1,평점 알바풀었나...,670
2,케구리(jhw6****),2,배우들의 연기만 좋았음 하지만 단점이 확실히 보인다. 영화 초반부 사건이 터지고 주...,670
3,깁스(volk****),1,실망 뻔한스토리에 억지스럼,670
4,달고나(mjh5****),1,뻔한 신파극 평점알바풀었나 배우들 연기만 좋음,670
...,...,...,...,...
5,안경잡이(ayan****),8,초등학교때 이 영화 보고 1주일을 앓아 누웠던 기억이..ㅎㄷㄷ 그리고 시간이 많이 ...,811
6,개럭(jtot****),9,처음에는 얼굴 가리면서 봤지만 어느새 적응하고 있는 내 모습.,811
7,뉴존(newz****),10,지금 생각해도 당시 획기적인 아이디어에 과학의 신선함과 공포를 한번에 느끼게해준 ...,811
8,무탈 Watanabe(yaso****),8,감독 특유의 기괴한 상상. 그래도 요즘 웬만한 스릴러 영화보다 훨씬 낫다.,811


In [27]:
cur_movie_review.reset_index(drop=True, inplace=True)

In [28]:
cur_movie_review['rate'] /= 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
cur_movie_review

Unnamed: 0,u_id,rate,review,p_id
0,허현(heoh****),3.0,평점 알바 특. 내용얘기보다 배우 연기 좋단말만 주구장창함...,670
1,황금마차(goga****),0.5,평점 알바풀었나...,670
2,케구리(jhw6****),1.0,배우들의 연기만 좋았음 하지만 단점이 확실히 보인다. 영화 초반부 사건이 터지고 주...,670
3,깁스(volk****),0.5,실망 뻔한스토리에 억지스럼,670
4,달고나(mjh5****),0.5,뻔한 신파극 평점알바풀었나 배우들 연기만 좋음,670
...,...,...,...,...
10631,안경잡이(ayan****),4.0,초등학교때 이 영화 보고 1주일을 앓아 누웠던 기억이..ㅎㄷㄷ 그리고 시간이 많이 ...,811
10632,개럭(jtot****),4.5,처음에는 얼굴 가리면서 봤지만 어느새 적응하고 있는 내 모습.,811
10633,뉴존(newz****),5.0,지금 생각해도 당시 획기적인 아이디어에 과학의 신선함과 공포를 한번에 느끼게해준 ...,811
10634,무탈 Watanabe(yaso****),4.0,감독 특유의 기괴한 상상. 그래도 요즘 웬만한 스릴러 영화보다 훨씬 낫다.,811


In [39]:
cur_movie_review['u_id'] = [i[0] for i in cur_movie_review['u_id'].str.split('(')]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [40]:
cur_movie_review

Unnamed: 0,u_id,rate,review,p_id
0,허현,3.0,평점 알바 특. 내용얘기보다 배우 연기 좋단말만 주구장창함...,670
1,황금마차,0.5,평점 알바풀었나...,670
2,케구리,1.0,배우들의 연기만 좋았음 하지만 단점이 확실히 보인다. 영화 초반부 사건이 터지고 주...,670
3,깁스,0.5,실망 뻔한스토리에 억지스럼,670
4,달고나,0.5,뻔한 신파극 평점알바풀었나 배우들 연기만 좋음,670
...,...,...,...,...
10631,안경잡이,4.0,초등학교때 이 영화 보고 1주일을 앓아 누웠던 기억이..ㅎㄷㄷ 그리고 시간이 많이 ...,811
10632,개럭,4.5,처음에는 얼굴 가리면서 봤지만 어느새 적응하고 있는 내 모습.,811
10633,뉴존,5.0,지금 생각해도 당시 획기적인 아이디어에 과학의 신선함과 공포를 한번에 느끼게해준 ...,811
10634,무탈 Watanabe,4.0,감독 특유의 기괴한 상상. 그래도 요즘 웬만한 스릴러 영화보다 훨씬 낫다.,811


In [41]:
cur_movie_review.u_id.value_counts()

RED         17
라이너군        14
LP          11
라즈알굴        10
푸른불          9
            ..
ami2****     1
pong         1
jsyh****     1
괄            1
동글동글이        1
Name: u_id, Length: 9014, dtype: int64

In [44]:
data={
    'u_id':cur_movie_review['u_id'],
    'p_id':cur_movie_review['p_id'],
    'rating':cur_movie_review['rate']
}
new_score_board=pd.DataFrame(data, columns=['u_id','p_id','rating'])
new_score_board.sort_values(by=['u_id'], axis=0,inplace=True)
new_score_board.reset_index(drop = True, inplace=True)
new_score_board

Unnamed: 0,u_id,p_id,rating
0,00,709,2.0
1,00,755,5.0
2,000,780,3.5
3,000,720,4.0
4,000,695,0.5
...,...,...,...
10631,힐리우스,770,5.0
10632,힐리우스,745,4.5
10633,힘내라힘,740,0.5
10634,힘멜,749,4.0


In [46]:
score_board = score_board.append(new_score_board, ignore_index = True)
score_board

Unnamed: 0,u_id,p_id,rating
0,*2,121,4.5
1,-채여니영화평-,287,4.0
2,-채여니영화평-,414,3.5
3,-채여니영화평-,271,3.5
4,-채여니영화평-,264,5.0
...,...,...,...
40429,힐리우스,770,5
40430,힐리우스,745,4.5
40431,힘내라힘,740,0.5
40432,힘멜,749,4


In [47]:
score_board.to_csv('../data/score_board.csv', index=False)