In [174]:
import pandas as pd
import numpy as np
import ast

# Load the raw movie table from the CSV file in the working directory.
movie_df = pd.read_csv(filepath_or_buffer='movies_1980_2025.csv')


def safe_literal_eval_clean(x):
    """Normalize one raw cell value into a list of clean, non-empty strings.

    Handling by input type:
      * ``None`` or a float NaN -> ``[]``.
      * ``str``: doubled double-quotes (``""``) are collapsed to ``"`` and the
        text is stripped. If the result starts with ``[`` and ends with ``]``
        it is parsed with ``ast.literal_eval``; otherwise the string itself is
        treated as a single value.
      * ``list``: used as-is.
      * anything else: wrapped in a one-element list.

    When the intermediate result is a list, every element is converted with
    ``str`` and stripped; ``None`` elements and elements that are empty after
    stripping are dropped. A non-list parse result (e.g. a tuple) is returned
    as a single stringified element.

    Returns:
        list[str]: the cleaned values, or ``[]`` when a list-looking string
        cannot be parsed as a Python literal.
    """
    # Filter out None / NaN first.
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []

    if isinstance(x, str):
        x = x.replace('""', '"').strip()
        if x.startswith('[') and x.endswith(']'):
            # Only the parse is best-effort. Catching just the exceptions that
            # ast.literal_eval is documented to raise keeps KeyboardInterrupt,
            # SystemExit and unrelated bugs from being silently swallowed.
            try:
                result = ast.literal_eval(x)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return []
        else:
            result = [x]
    elif isinstance(x, list):
        result = x
    else:
        result = [x]

    if isinstance(result, list):
        stripped = (str(item).strip() for item in result if item is not None)
        return [text for text in stripped if text != '']
    return [str(result).strip()]



# Columns that are normalized into Python lists of clean strings.
target_cols = [
    'writers', 'directors', 'stars',
    'production_company', 'Languages', 'countries_origin',
    'genres_imdb',
]

# Convert each listed column element-wise, overwriting it in place.
for column_name in target_cols:
    raw_values = movie_df[column_name]
    movie_df[column_name] = raw_values.apply(safe_literal_eval_clean)

In [175]:
# Sanity check: the first genres_imdb entry should now be a Python list.
type(movie_df['genres_imdb'].iat[0])

list

In [176]:
from collections import Counter

# Tally how many movies carry each genre label (a movie may have several).
genre_counts = Counter()
for movie_genres in movie_df['genres_imdb']:
    genre_counts.update(movie_genres)

print(sorted(genre_counts.items()))

[('Action', 4136), ('Adult', 173), ('Adventure', 2674), ('Animation', 863), ('Biography', 1421), ('Comedy', 7740), ('Crime', 3459), ('Documentary', 1798), ('Drama', 11985), ('Family', 1496), ('Fantasy', 1918), ('Game Show', 1), ('History', 984), ('Horror', 2708), ('Music', 1438), ('Musical', 489), ('Mystery', 2109), ('News', 51), ('Romance', 4543), ('Sci-Fi', 1869), ('Sport', 673), ('Thriller', 5394), ('War', 787), ('Western', 283)]


## 다중 장르 One-Hot encoding

In [177]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the multi-label genre lists: one 0/1 column per genre.
mlb = MultiLabelBinarizer()
genres_one_hot = mlb.fit_transform(movie_df['genres_imdb'])

# Reuse movie_df's index so the column-wise concat below lines up row for row
# even if movie_df does not have a default RangeIndex (e.g. after filtering).
genres_df = pd.DataFrame(genres_one_hot, columns=mlb.classes_, index=movie_df.index)

# Attach the indicator columns to movie_df. Genre columns left over from an
# earlier run of this cell are dropped first, so re-running the cell does not
# append duplicate columns.
movie_df = pd.concat(
    [movie_df.drop(columns=genres_df.columns, errors='ignore'), genres_df],
    axis=1,
)

# Genre classes that were encoded.
print(sorted(mlb.classes_))

['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


In [178]:
# Spot-check the encoding result on row 0 for a handful of genre columns.
sample_genre_cols = ['Action', 'Adventure', 'Fantasy', 'Horror', 'Drama', 'Game Show', 'History']
movie_df.loc[0:0, sample_genre_cols]

Unnamed: 0,Action,Adventure,Fantasy,Horror,Drama,Game Show,History
0,1,1,1,0,0,0,0


## 상영등급 One-Hot Encoding

In [None]:
# One-hot encode the MPA rating into integer indicator columns named 'MPA_<value>'.
mpa_ohe = pd.get_dummies(movie_df['MPA'], prefix='MPA', dtype=int)

# Merge into movie_df. Indicator columns from an earlier run of this cell are
# dropped first, so re-running the cell does not create duplicate 'MPA_*' columns.
movie_df = pd.concat(
    [movie_df.drop(columns=mpa_ohe.columns, errors='ignore'), mpa_ohe],
    axis=1,
)

movie_df.columns

Index(['Title', 'Year', 'MPA', 'Rating', 'Votes', 'writers', 'directors',
       'stars', 'countries_origin', 'production_company', 'Languages',
       'Duration_minute', 'budget_usd', 'genres_imdb', 'popularity_score',
       'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game Show',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance',
       'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western', 'MPA_13+', 'MPA_16+',
       'MPA_18+', 'MPA_Approved', 'MPA_G', 'MPA_M', 'MPA_MA-17', 'MPA_NC-17',
       'MPA_Not Rated', 'MPA_PG', 'MPA_PG-13', 'MPA_R', 'MPA_TV-14',
       'MPA_TV-G', 'MPA_TV-MA', 'MPA_TV-PG', 'MPA_TV-Y', 'MPA_TV-Y7',
       'MPA_TV-Y7-FV', 'MPA_Unrated', 'MPA_X'],
      dtype='object')

# 아무것도 안하고 그냥 돌려보기

In [190]:
from sklearn.model_selection import train_test_split

# Columns that are not model inputs: identifiers, raw list/text columns,
# the un-encoded MPA column, and the target itself.
non_feature_cols = [
    'Title', 'Year', 'genres_imdb', 'MPA', 'Rating', 'Votes',
    'writers', 'directors', 'stars', 'countries_origin',
    'production_company', 'Languages', 'budget_usd', 'popularity_score',
]
# A later cell adds 'popularity_score_log' to movie_df. If this cell is re-run
# after that one, the log-transformed target must not leak into the features.
if 'popularity_score_log' in movie_df.columns:
    non_feature_cols.append('popularity_score_log')

X = movie_df.drop(columns=non_feature_cols)
y = movie_df['popularity_score']

# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [191]:
from sklearn.svm import SVR

# Baseline: RBF-kernel SVR with otherwise default hyperparameters.
# NOTE(review): the features are not scaled before fitting, and RBF SVR is
# sensitive to feature scale — consider a StandardScaler step; confirm impact.
svr_rbf = SVR(kernel='rbf').fit(X_train, y_train)

# Predictions on both splits, used by the evaluation cell.
y_pred_train, y_pred_test = svr_rbf.predict(X_train), svr_rbf.predict(X_test)

In [193]:
from sklearn.metrics import mean_squared_error          # MSE (평균 제곱 오차)
from sklearn.metrics import root_mean_squared_error     # RMSE (제곱 평균 제곱근 오차)
from sklearn.metrics import mean_absolute_error         # MAE (평균 절대 오차)
from sklearn.metrics import mean_squared_log_error      # MSLE (평균 제곱 로그 오차)
from sklearn.metrics import root_mean_squared_log_error # RMSLE (제곱 평균 제곱근 로그 오차)
from sklearn.metrics import r2_score                    # R^2 결정계수

def eval(y_test, y_pred):
    """Print MSE, RMSE, MAE, MSLE, RMSLE and R2 for a set of predictions.

    Each metric is printed on its own line as ``<LABEL>: <value>``, in the
    order listed above. Nothing is returned.

    Note: this name shadows the builtin ``eval``; it is kept because the
    notebook's other cells call it under this name.

    Args:
        y_test: true target values.
        y_pred: predicted target values.
    """
    labelled_metrics = (
        ("MSE:", mean_squared_error),
        ("RMSE:", root_mean_squared_error),
        ("MAE:", mean_absolute_error),
        ("MSLE:", mean_squared_log_error),
        ("RMSLE:", root_mean_squared_log_error),
        ("R2:", r2_score),
    )
    for label, metric_fn in labelled_metrics:
        print(label, metric_fn(y_test, y_pred))

# Report the metrics for the training split, then for the held-out split.
print("<훈련 데이터 점수>")
eval(y_train, y_pred_train)

print()

# Header uses the same '<...>' format as the training header.
print("<평가 데이터 점수>")
eval(y_test, y_pred_test)

<훈련 데이터 점수>
MSE: 1116116399361.1191
RMSE: 1056464.1022586233
MAE: 303404.12735620415
MSLE: 4.9232041017616135
RMSLE: 2.218829444045128
R2: -0.07532818226409321

<평가 데이터 점수
MSE: 1381715263213.3066
RMSE: 1175463.8502367083
MAE: 317057.94153826626
MSLE: 4.924627568162761
RMSLE: 2.2191501905375315
R2: -0.06619324734756082


# popularity_score 로그 스케일로 변환 
- EDA 과정에서 로그 변환 후에는 인기도 점수가 중심을 기준으로 대칭적인 종형 곡선을 보임

In [194]:
# Add a log-transformed target column: log1p(x) = log(1 + x).
movie_df['popularity_score_log'] = np.log1p(movie_df['popularity_score'])


from sklearn.model_selection import train_test_split

# Features: everything except identifiers, raw list/text columns, the
# un-encoded MPA column, and both the raw and the log-transformed target.
X = movie_df.drop(
    columns=[
        'Title', 'Year', 'genres_imdb', 'MPA', 'Rating', 'Votes',
        'writers', 'directors', 'stars', 'countries_origin',
        'production_company', 'Languages', 'budget_usd',
        'popularity_score', 'popularity_score_log',
    ]
)
y = movie_df['popularity_score_log']

# Same random_state as the baseline split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### 엄청나게 성능이 올라감 1
- R2: -0.07 => 0.18

In [195]:
# Refit the RBF-kernel SVR, this time on the log-transformed target.
svr_rbf = SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)

y_pred_train = svr_rbf.predict(X_train)
y_pred_test = svr_rbf.predict(X_test)

# Metrics below are computed on the log-transformed target.
print("<훈련 데이터 점수>")
eval(y_train, y_pred_train)

print()

# Header uses the same '<...>' format as the training header.
print("<평가 데이터 점수>")
eval(y_test, y_pred_test)

<훈련 데이터 점수>
MSE: 3.9968004966888944
RMSE: 1.9991999641578864
MAE: 1.6347207998150806
MSLE: 0.03262831666702482
RMSLE: 0.18063309958871
R2: 0.18921734756072472

<평가 데이터 점수
MSE: 4.0312106823909
RMSE: 2.0077875092725574
MAE: 1.64788819314736
MSLE: 0.03266159502647777
RMSLE: 0.18072519200841308
R2: 0.18268145808177572


# GridSearchCV 적용해보기

In [196]:
from sklearn.model_selection import GridSearchCV

svr = SVR()

# Search grid: 3 C values x 4 kernels x 3 epsilon values = 36 combinations.
# NOTE(review): C tops out at 1, so a best C of 1 sits on the edge of the
# grid — consider extending the range upward.
params = {
    'C': [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'epsilon': [0.01, 0.1, 1]
}
# 5-fold cross-validation on the training split; scoring is left at its default.
grid = GridSearchCV(svr, params, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

print("<훈련 데이터 점수>")
eval(y_train, y_pred_train)

print()

# Header uses the same '<...>' format as the training header.
print("<평가 데이터 점수>")
eval(y_test, y_pred_test)

<훈련 데이터 점수>
MSE: 3.9968004966888944
RMSE: 1.9991999641578864
MAE: 1.6347207998150806
MSLE: 0.03262831666702482
RMSLE: 0.18063309958871
R2: 0.18921734756072472

<평가 데이터 점수
MSE: 4.0312106823909
RMSE: 2.0077875092725574
MAE: 1.64788819314736
MSLE: 0.03266159502647777
RMSLE: 0.18072519200841308
R2: 0.18268145808177572


In [197]:
# Display the hyperparameter combination selected by the grid search.
grid.best_params_

{'C': 1, 'epsilon': 0.1, 'kernel': 'rbf'}

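최적의 하이퍼파라미터를 찾는 방식과 Cross-Validation을 적용해도 성능이 더 올라가지 않음. 선택된 최적 조합(C=1, epsilon=0.1, kernel='rbf')이 SVR의 기본값과 동일하기 때문에 결과가 이전 모델과 같음. 또한 C=1은 탐색 범위의 상한이므로, 더 큰 C 값까지 범위를 넓혀 볼 필요가 있음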

# Duration_minute 빼보기
- EDA 과정에서 popularity_score와 상관계수가 매우 낮았으므로

In [None]:
# Remove Duration_minute from the features and redo the split.
# errors='ignore' makes the cell safe to re-run: X is reassigned in place, so
# on a second run the column is already gone and a plain drop raises KeyError.
X = X.drop(columns='Duration_minute', errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [203]:
# Refit the RBF-kernel SVR on the features without Duration_minute.
svr_rbf = SVR(kernel='rbf').fit(X_train, y_train)

# Predictions on both splits.
y_pred_train, y_pred_test = svr_rbf.predict(X_train), svr_rbf.predict(X_test)

# Training-split metrics, a blank separator line, then held-out metrics.
print("<훈련 데이터 점수>")
eval(y_train, y_pred_train)
print()
print("<평가 데이터 점수>")
eval(y_test, y_pred_test)

<훈련 데이터 점수>
MSE: 3.3896675916201846
RMSE: 1.8411049920143567
MAE: 1.438979026580366
MSLE: 0.027696276756333356
RMSLE: 0.16642198399350175
R2: 0.31237906843285057

<평가 데이터 점수>
MSE: 3.7140039753292458
RMSE: 1.9271751283495866
MAE: 1.5399126433515333
MSLE: 0.029953681446301235
RMSLE: 0.1730713189592696
R2: 0.2469943763906115


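**결과**: Overfitting이 약간 있지만, 확실히 상관성이 적은 feature를 빼니 성능이 좋아짐 (평가 데이터 R2: 0.18 → 0.25). 단, RBF 커널 SVR은 feature 스케일에 민감하므로, 이 개선이 낮은 상관성 때문인지 스케일링되지 않은 Duration_minute가 제거된 효과인지는 스케일링(예: StandardScaler) 적용 후 다시 비교해 확인할 필요가 있음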