# 영화 관객 수 데이터를 활용한 데이터 분석
- 감독, 이름, 상영등급, 스태프 수 등의 정보로 영화 관객 수를 예측하는 모델 개발

## 데이터
- title : 영화의 제목
- distributor : 배급사
- genre : 장르
- release_time : 개봉일
- time : 상영시간(분)
- screening_rat : 상영등급
- director : 감독이름
- dir_prev_bfnum : 해당 감독이 이 영화를 만들기 전 제작에 참여한 영화에서의 평균 관객수(단 관객수가 알려지지 않은 영화 제외)
- dir_prev_num : 해당 감독이 이 영화를 만들기 전 제작에 참여한 영화의 개수(단 관객수가 알려지지 않은 영화 제외)
- num_staff : 스텝수
- num_actor : 주연배우수
- box_off_num : 관객수

## 데이터 로딩

In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('data/movies_train.csv')
test = pd.read_csv('data/movies_test.csv')

print(train.shape, test.shape)

(600, 12) (243, 11)


## 데이터 확인

In [2]:
# 결측값 확인
print(train.isna().sum())
print(test.isna().sum())

##################
# dir_prev_bfnum만 결측값 존재

title               0
distributor         0
genre               0
release_time        0
time                0
screening_rat       0
director            0
dir_prev_bfnum    330
dir_prev_num        0
num_staff           0
num_actor           0
box_off_num         0
dtype: int64
title               0
distributor         0
genre               0
release_time        0
time                0
screening_rat       0
director            0
dir_prev_bfnum    136
dir_prev_num        0
num_staff           0
num_actor           0
dtype: int64


In [3]:
# dir_prev_bfnum 값 확인
df = train[train['dir_prev_bfnum'].isna()]
print(df.shape)
print(df['director'].unique().shape)
print(df['director'].describe())

(330, 12)
(270,)
count     330
unique    270
top       홍상수
freq        6
Name: director, dtype: object


In [4]:
df['director'].unique()

array(['조병옥', '이창재', '정성복', '변성현', '조조 히데오', '양병간', '윤학렬', '홍상수', '부지영',
       '안국진', '동지견', '김재한', '손승웅', '류훈', '김철한', '한상희', '장진', '김규민', '최윤석',
       '김관철', '장희선', '김조광수', '김기덕', '오멸', '최원섭', '진승현', '도창훈', '류승완',
       '신연식', '하기호', '방형우', '김상철', '배성상', '김기영', '우민호', '최용석', '이지형',
       '이영미', '이정범', '손석', '에밀리오 에스테베즈', '김태용', '신수원', '정연식', '지민', '한동호',
       '방현준', '최시형', '엄태화', '연상호', '김진무', '임흥순', '신재영', '김경만', '타케 마사하루',
       '배창호', '권혁만', '오인천', '장희철', '양우석', '이창규', '구수환', '허은희', '김석윤',
       '고석진', '최경석', '하마구치 류스케', '홍재희', '조원희', '박형우', '곽경택', '김정환', '서호빈',
       '박헌수', '박범훈', '박준기', '임진순', '신춘수', '조성형', '한철수', '지율스님', '문승욱',
       '이규만', '박석영', '임성구', '박선욱', '김광식', '양윤호', '이대희', '장건재', '박상현',
       '정지영', '문제대', '박범수', '강의석', '이종현', 'New Pistol', '박찬경', '권오광',
       '이상호', '안재훈', '민환기', '강우석', '권우정', '노진수', '박배일', '윤태식', '구혜선',
       '박철수', '태미 추', '김종철', '김동후', '이상우', '황철민', '김혜정', '김영진', '김명서',
       '황욱', '민복기', '정흠문', '정용택', '장률', '지하진', '문정윤', '이숭환', '황병국'

In [5]:
train[train['director'] == '이창재']

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
6,길위에서,백두대간,다큐멘터리,2013-05-23,104,전체 관람가,이창재,,0,32,5,53526
557,목숨,필라멘트 픽쳐스,다큐멘터리,2014-12-04,95,12세 관람가,이창재,,0,139,4,38771


In [6]:
train.corr()

Unnamed: 0,time,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
time,1.0,0.264675,0.306727,0.623205,0.114153,0.441452
dir_prev_bfnum,0.264675,1.0,0.131822,0.323521,0.083818,0.283184
dir_prev_num,0.306727,0.131822,1.0,0.450706,0.014006,0.259674
num_staff,0.623205,0.323521,0.450706,1.0,0.077871,0.544265
num_actor,0.114153,0.083818,0.014006,0.077871,1.0,0.111179
box_off_num,0.441452,0.283184,0.259674,0.544265,0.111179,1.0


In [7]:
bfnum_mean = train[train['dir_prev_bfnum'].isna() == False]['dir_prev_bfnum'].mean()
bfnum_mode = train[train['dir_prev_bfnum'].isna() == False]['dir_prev_bfnum'].mode()
bfnum_median = train[train['dir_prev_bfnum'].isna() == False]['dir_prev_bfnum'].median()

print(bfnum_median)

478423.625


## 결측값 제거 및 변수 선택

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           600 non-null    object 
 1   distributor     600 non-null    object 
 2   genre           600 non-null    object 
 3   release_time    600 non-null    object 
 4   time            600 non-null    int64  
 5   screening_rat   600 non-null    object 
 6   director        600 non-null    object 
 7   dir_prev_bfnum  270 non-null    float64
 8   dir_prev_num    600 non-null    int64  
 9   num_staff       600 non-null    int64  
 10  num_actor       600 non-null    int64  
 11  box_off_num     600 non-null    int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 56.4+ KB


In [9]:
# 명목형 변수
print(train.title.unique().shape) # 제목
print(train.distributor.unique().shape) # 배급사
print(train.genre.unique().shape) # 장르
print(train.release_time.unique().shape) # 개봉일
print(train.screening_rat.unique().shape) # 상영등급
print(train.director.unique().shape) # 감독

# 감독 컬럼은 제거
# 장르, 상영등급은 원핫 인코딩
# 개봉일은 계절로 나누기
# 배급사랑 감독은 라벨 인코딩(보류) 

(600,)
(169,)
(12,)
(330,)
(4,)
(472,)


In [10]:
train.describe()

Unnamed: 0,time,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
count,600.0,270.0,600.0,600.0,600.0,600.0
mean,100.863333,1050443.0,0.876667,151.118333,3.706667,708181.8
std,18.097528,1791408.0,1.183409,165.654671,2.446889,1828006.0
min,45.0,1.0,0.0,0.0,0.0,1.0
25%,89.0,20380.0,0.0,17.0,2.0,1297.25
50%,100.0,478423.6,0.0,82.5,3.0,12591.0
75%,114.0,1286569.0,2.0,264.0,4.0,479886.8
max,180.0,17615310.0,5.0,869.0,25.0,14262770.0


In [11]:
# 연속형 변수
# dir_prev_bfnum(결측값 많음), dir_prev_num(0값(알수 없는 값)이 많음) 제거

# 명목형 변수
# 감독 컬럼은 제거
# 장르, 상영등급은 원핫 인코딩
# 개봉일은 계절로 나누기
# 배급사랑 감독은 라벨 인코딩(보류) 

In [12]:
train.columns

Index(['title', 'distributor', 'genre', 'release_time', 'time',
       'screening_rat', 'director', 'dir_prev_bfnum', 'dir_prev_num',
       'num_staff', 'num_actor', 'box_off_num'],
      dtype='object')

In [13]:
# 개봉일 계절로 변환
# 개봉일의 월 추출
train['release_time'] = pd.to_datetime(train['release_time'])
test['release_time'] = pd.to_datetime(test['release_time'])

train['month'] = train['release_time'].dt.month
test['month'] = test['release_time'].dt.month
# print(train.head())

In [14]:
# 함수로 계절 표현

def season(month):
    if month in [3, 4, 5]:
        return '봄'
    elif month in [6, 7, 8]:
        return '여름'
    elif month in [8, 9, 10]:
        return '가을'
    else:
        return '겨울'
    
train['season'] = train['month'].apply(lambda x : season(x))
test['season'] = test['month'].apply(lambda x : season(x))

train.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num,month,season
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398,11,겨울
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501,11,겨울
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083,6,여름
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866,7,여름
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387,11,겨울


## 변수 구분

In [15]:
DEL_COL = ['title', 'release_time', 'dir_prev_bfnum', 'dir_prev_num', 'month', 'distributor', 'director']
COL_CAT = ['genre', 'screening_rat', 'season']
COL_NUM = ['time', 'num_staff', 'num_actor']
COL_Y = ['box_off_num']

X_train = train[COL_CAT + COL_NUM]
X_test = test[COL_CAT + COL_NUM]
y_train = train[COL_Y]
print(X_train.head())
print(X_test.head())

  genre screening_rat season  time  num_staff  num_actor
0    액션      청소년 관람불가     겨울    96         91          2
1   느와르      청소년 관람불가     겨울   130        387          3
2    액션       15세 관람가     여름   123        343          4
3   코미디        전체 관람가     여름   101         20          6
4   코미디       15세 관람가     겨울   108        251          2
    genre screening_rat season  time  num_staff  num_actor
0     느와르      청소년 관람불가     겨울   125        304          3
1  멜로/로맨스       12세 관람가     겨울   113        275          3
2     드라마       12세 관람가     겨울   115        419          7
3      액션       15세 관람가     겨울   116        408          2
4      공포       15세 관람가     겨울   110        380          1


## 범주형 변수 원핫 인코딩

In [16]:
X = pd.concat([X_train, X_test])

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X[COL_CAT])
X_train_res = ohe.transform(X_train[COL_CAT])
X_test_res = ohe.transform(X_test[COL_CAT])

In [17]:
X_train_ohe = pd.DataFrame(X_train_res.todense(), columns = ohe.get_feature_names_out())
X_test_ohe = pd.DataFrame(X_test_res.todense(), columns = ohe.get_feature_names_out())

X_train = pd.concat([X_train[COL_NUM], X_train_ohe], axis = 1)
X_test = pd.concat([X_test[COL_NUM], X_test_ohe], axis = 1)

print(X_train.head())

   time  num_staff  num_actor  genre_SF  genre_공포  genre_느와르  genre_다큐멘터리  \
0    96         91          2       0.0       0.0        0.0          0.0   
1   130        387          3       0.0       0.0        1.0          0.0   
2   123        343          4       0.0       0.0        0.0          0.0   
3   101         20          6       0.0       0.0        0.0          0.0   
4   108        251          2       0.0       0.0        0.0          0.0   

   genre_드라마  genre_멜로/로맨스  genre_뮤지컬  ...  genre_액션  genre_코미디  \
0        0.0           0.0        0.0  ...       1.0        0.0   
1        0.0           0.0        0.0  ...       0.0        0.0   
2        0.0           0.0        0.0  ...       1.0        0.0   
3        0.0           0.0        0.0  ...       0.0        1.0   
4        0.0           0.0        0.0  ...       0.0        1.0   

   screening_rat_12세 관람가  screening_rat_15세 관람가  screening_rat_전체 관람가  \
0                    0.0                    0.0              

## 데이터 분할

In [18]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train.values.ravel(), test_size = 0.3)

print(X_tr.head())
print(X_tr.shape, X_val.shape)

     time  num_staff  num_actor  genre_SF  genre_공포  genre_느와르  genre_다큐멘터리  \
435   107        367          4       1.0       0.0        0.0          0.0   
246    96          4         14       0.0       0.0        0.0          1.0   
21     90        228          1       0.0       0.0        0.0          0.0   
448    86          0          4       0.0       0.0        0.0          0.0   
397   111        369          6       0.0       0.0        0.0          0.0   

     genre_드라마  genre_멜로/로맨스  genre_뮤지컬  ...  genre_액션  genre_코미디  \
435        0.0           0.0        0.0  ...       0.0        0.0   
246        0.0           0.0        0.0  ...       0.0        0.0   
21         1.0           0.0        0.0  ...       0.0        0.0   
448        1.0           0.0        0.0  ...       0.0        0.0   
397        0.0           0.0        0.0  ...       0.0        1.0   

     screening_rat_12세 관람가  screening_rat_15세 관람가  screening_rat_전체 관람가  \
435                    0.0         

## 데이터 스케일링 및 인코딩
- 수치형 변수는 스케일링 필요

In [19]:
# 스케일링
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()
X_tr[COL_NUM] = scaler.fit_transform(X_tr[COL_NUM])
X_val[COL_NUM] = scaler.transform(X_val[COL_NUM])
X_test[COL_NUM] = scaler.transform(X_test[COL_NUM])

X_tr[COL_NUM].head()

Unnamed: 0,time,num_staff,num_actor
435,0.459259,0.422811,0.181818
246,0.377778,0.004608,0.636364
21,0.333333,0.262673,0.045455
448,0.303704,0.0,0.181818
397,0.488889,0.425115,0.272727


## RandomForestRegressor

In [20]:
from sklearn.ensemble import RandomForestRegressor

modelRF = RandomForestRegressor()
modelRF.fit(X_tr, y_tr)

y_tr_pred = modelRF.predict(X_tr)
y_val_pred = modelRF.predict(X_val)


In [21]:
# MSE:4.340, RMSE: 2.083, MAE: 1.617, r2: 0.949
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_tr = mean_squared_error(y_tr, y_tr_pred)
rmse_tr = mean_squared_error(y_tr, y_tr_pred, squared = False)
mae_tr = mean_absolute_error(y_tr, y_tr_pred)
r2_tr = r2_score(y_tr, y_tr_pred)

mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = mean_squared_error(y_val, y_val_pred, squared = False)
mae_val = mean_absolute_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

print('MSE')
print('MSE(tr) : ', mse_tr)
print('MSE(val) : ', mse_val)
print('='*40)
print('RMSE')
print('RMSE(tr) : ', rmse_tr)
print('RMSE(val) : ', rmse_val)
print('='*40)
print('MAE')
print('MAE(tr) : ', mae_tr)
print('MAE(val) : ', mae_val)
print('='*40)
print('r2_score')
print('r2_score(tr) : ', r2_tr)
print('r2_score(val) : ', r2_val)
print('='*40)


MSE
MSE(tr) :  369705023558.48395
MSE(val) :  2522193034933.355
RMSE
RMSE(tr) :  608033.7355430897
RMSE(val) :  1588141.3775018125
MAE
MAE(tr) :  257263.00919047618
MAE(val) :  727995.5421111112
r2_score
r2_score(tr) :  0.8952627367388097
r2_score(val) :  0.12262910273278715


In [22]:
importance = pd.DataFrame({'feature' : X_tr.columns, 'feature_importance' : modelRF.feature_importances_})
importance = importance.sort_values('feature_importance')

importance

Unnamed: 0,feature,feature_importance
11,genre_서스펜스,0.0
9,genre_뮤지컬,3e-06
10,genre_미스터리,7.8e-05
6,genre_다큐멘터리,0.0001
12,genre_애니메이션,0.000671
4,genre_공포,0.00078
17,screening_rat_전체 관람가,0.001476
8,genre_멜로/로맨스,0.005261
15,screening_rat_12세 관람가,0.007608
13,genre_액션,0.008364


### 하이퍼파라미터 튜닝

In [23]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : range(100, 601, 50),
    'max_depth' : range(2, 11, 2)
}

gs = GridSearchCV(RandomForestRegressor(),
                        params,
                        scoring = 'neg_mean_squared_error',
                        cv = 4,
                        n_jobs = -1
                        )
gs.fit(X_tr, y_tr)

GridSearchCV(cv=4, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': range(2, 11, 2),
                         'n_estimators': range(100, 601, 50)},
             scoring='neg_mean_squared_error')

In [24]:
gs.best_params_

{'max_depth': 2, 'n_estimators': 200}

In [25]:
from sklearn.ensemble import RandomForestRegressor

modelRF = RandomForestRegressor(n_estimators = 200, max_depth = 2)
modelRF.fit(X_tr, y_tr)

y_tr_pred = modelRF.predict(X_tr)
y_val_pred = modelRF.predict(X_val)

In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_tr = mean_squared_error(y_tr, y_tr_pred)
rmse_tr = mean_squared_error(y_tr, y_tr_pred, squared = False)
mae_tr = mean_absolute_error(y_tr, y_tr_pred)
r2_tr = r2_score(y_tr, y_tr_pred)

mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = mean_squared_error(y_val, y_val_pred, squared = False)
mae_val = mean_absolute_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

print('MSE')
print('MSE(tr) : ', mse_tr)
print('MSE(val) : ', mse_val)
print('='*40)
print('RMSE')
print('RMSE(tr) : ', rmse_tr)
print('RMSE(val) : ', rmse_val)
print('='*40)
print('MAE')
print('MAE(tr) : ', mae_tr)
print('MAE(val) : ', mae_val)
print('='*40)
print('r2_score')
print('r2_score(tr) : ', r2_tr)
print('r2_score(val) : ', r2_val)
print('='*40)

MSE
MSE(tr) :  1727463904805.8154
MSE(val) :  2333127056065.561
RMSE
RMSE(tr) :  1314330.2114787651
RMSE(val) :  1527457.7100743446
MAE
MAE(tr) :  590602.9917829393
MAE(val) :  725996.6535216025
r2_score
r2_score(tr) :  0.5106102696945667
r2_score(val) :  0.18839765621954407


## XGBRegressor

In [28]:
from xgboost import XGBRegressor

model_xgb = XGBRegressor()
model_xgb.fit(X_tr, y_tr)

y_tr_pred_xgb = model_xgb.predict(X_tr)
y_val_pred_xgb = model_xgb.predict(X_val)

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_tr = mean_squared_error(y_tr, y_tr_pred_xgb)
rmse_tr = mean_squared_error(y_tr, y_tr_pred_xgb, squared = False)
mae_tr = mean_absolute_error(y_tr, y_tr_pred_xgb)
r2_tr = r2_score(y_tr, y_tr_pred_xgb)

mse_val = mean_squared_error(y_val, y_val_pred_xgb)
rmse_val = mean_squared_error(y_val, y_val_pred_xgb, squared = False)
mae_val = mean_absolute_error(y_val, y_val_pred_xgb)
r2_val = r2_score(y_val, y_val_pred_xgb)

print('MSE')
print('MSE(tr) : ', mse_tr)
print('MSE(val) : ', mse_val)
print('='*40)
print('RMSE')
print('RMSE(tr) : ', rmse_tr)
print('RMSE(val) : ', rmse_val)
print('='*40)
print('MAE')
print('MAE(tr) : ', mae_tr)
print('MAE(val) : ', mae_val)
print('='*40)
print('r2_score')
print('r2_score(tr) : ', r2_tr)
print('r2_score(val) : ', r2_val)
print('='*40)

MSE
MSE(tr) :  59825314.56681081
MSE(val) :  3848475932932.3237
RMSE
RMSE(tr) :  7734.682576991173
RMSE(val) :  1961753.2803419302
MAE
MAE(tr) :  5349.935772269113
MAE(val) :  826973.4790108575
r2_score
r2_score(tr) :  0.9999830515158784
r2_score(val) :  -0.33873210163603784


### 하이퍼파라미터 튜닝

In [30]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : range(100, 601, 50),
    'max_depth' : range(2, 11, 2),
    'learning_rate' : [0.1, 0.05, 0.01, 0.005, 0.001]
}

gs = GridSearchCV(XGBRegressor(),
                        params,
                        cv = 4,
                        n_jobs = -1
                        )
gs.fit(X_tr, y_tr)

GridSearchCV(cv=4,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, m...
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estim

In [31]:
gs.best_params_

{'learning_rate': 0.005, 'max_depth': 2, 'n_estimators': 300}

In [32]:
model_xgb = XGBRegressor(n_estimators = 300, max_depth = 2, learning_rate = 0.005)
model_xgb.fit(X_tr, y_tr)

y_tr_pred_xgb = model_xgb.predict(X_tr)
y_val_pred_xgb = model_xgb.predict(X_val)

In [33]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_tr = mean_squared_error(y_tr, y_tr_pred_xgb)
rmse_tr = mean_squared_error(y_tr, y_tr_pred_xgb, squared = False)
mae_tr = mean_absolute_error(y_tr, y_tr_pred_xgb)
r2_tr = r2_score(y_tr, y_tr_pred_xgb)

mse_val = mean_squared_error(y_val, y_val_pred_xgb)
rmse_val = mean_squared_error(y_val, y_val_pred_xgb, squared = False)
mae_val = mean_absolute_error(y_val, y_val_pred_xgb)
r2_val = r2_score(y_val, y_val_pred_xgb)

print('MSE')
print('MSE(tr) : ', mse_tr)
print('MSE(val) : ', mse_val)
print('='*40)
print('RMSE')
print('RMSE(tr) : ', rmse_tr)
print('RMSE(val) : ', rmse_val)
print('='*40)
print('MAE')
print('MAE(tr) : ', mae_tr)
print('MAE(val) : ', mae_val)
print('='*40)
print('r2_score')
print('r2_score(tr) : ', r2_tr)
print('r2_score(val) : ', r2_val)
print('='*40)

MSE
MSE(tr) :  1831112240907.9062
MSE(val) :  2334211884313.1963
RMSE
RMSE(tr) :  1353185.9594704292
RMSE(val) :  1527812.7778995684
MAE
MAE(tr) :  543725.5032366072
MAE(val) :  647252.8234157986
r2_score
r2_score(tr) :  0.4812467437126382
r2_score(val) :  0.1880202875091299


## 최종 예측

In [36]:
y_test_pred = modelRF.predict(X_test)
y_test_pred_xgb = model_xgb.predict(X_test)
print(test.shape)
print(y_test_pred.shape)
print(y_test_pred_xgb.shape)

(243, 13)
(243,)
(243,)


In [40]:
submission = pd.read_csv('data/submission.csv')

pd.DataFrame({'title' : submission.title, 'box_off_num(RF)' : y_test_pred, 'box_off_num(XGB)' : y_test_pred_xgb}).to_csv('result/submission.csv', index = False)
pd.DataFrame({'title' : submission.title, 'box_off_num' : y_test_pred}).to_csv('result/submission_RandomForest.csv', index = False)
pd.DataFrame({'title' : submission.title, 'box_off_num' : y_test_pred_xgb}).to_csv('result/submission_XGB.csv', index = False)

In [41]:
result = pd.read_csv('result/submission.csv')

result

Unnamed: 0,title,box_off_num(RF),box_off_num(XGB)
0,용서는 없다,3.394953e+06,2906236.200
1,아빠가 여자를 좋아해,1.020210e+06,722972.800
2,하모니,1.077590e+06,808678.250
3,의형제,1.077590e+06,808678.250
4,평행 이론,1.060314e+06,808678.250
...,...,...,...
238,해에게서 소년에게,9.004940e+04,62935.680
239,울보 권투부,9.004940e+04,62935.680
240,어떤살인,1.146406e+05,122958.625
241,말하지 못한 비밀,9.004940e+04,78492.930
