# 랜덤포레스트

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('movies_1980_2025.csv')

In [3]:
df1 = df[['Title','countries_origin', 'Languages', 'Duration_minute',
       'budget_usd', 'genres_imdb', 'popularity_score']]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

X = df1.drop(columns=['Title', 'countries_origin', 'Languages', 'genres_imdb'])
y = df1['popularity_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))



RMSE: 741588613.5059042
R²: 0.9994279260683159


In [5]:
df2 = pd.read_csv('movies_1980_2025.csv')

In [6]:
df2.columns

Index(['Title', 'Year', 'MPA', 'Rating', 'Votes', 'writers', 'directors',
       'stars', 'countries_origin', 'production_company', 'Languages',
       'Duration_minute', 'budget_usd', 'genres_imdb', 'popularity_score'],
      dtype='object')

# XGBoost

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

In [8]:
df2.columns

Index(['Title', 'Year', 'MPA', 'Rating', 'Votes', 'writers', 'directors',
       'stars', 'countries_origin', 'production_company', 'Languages',
       'Duration_minute', 'budget_usd', 'genres_imdb', 'popularity_score'],
      dtype='object')

In [9]:
df2 = df2[[ 'MPA','Duration_minute', 'genres_imdb', 'popularity_score']]
df2

Unnamed: 0,MPA,Duration_minute,genres_imdb,popularity_score
0,PG,124.0,"['Action', 'Adventure', 'Fantasy']",12180000.0
1,PG,127.0,"['Action', 'Adventure']",802400.0
2,PG,109.0,['Comedy'],282900.0
3,R,111.0,"['Comedy', 'Crime']",207700.0
4,PG,88.0,['Comedy'],2094400.0
...,...,...,...,...
21515,TV-14,122.0,"['Comedy', 'Drama', 'Romance']",3995.5
21516,R,89.0,['Thriller'],220.8
21517,TV-MA,100.0,"['Drama', 'Horror', 'Mystery']",453.6
21518,TV-MA,99.0,"['Biography', 'Crime', 'Documentary', 'Music']",2856.7


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df2['MPA'] = le.fit_transform(df2['MPA'].astype(str))  # NaN 처리 포함

# 예: genres_imdb는 리스트 형태라면 MultiLabelBinarizer 사용
from sklearn.preprocessing import MultiLabelBinarizer

# 문자열 → 리스트로 먼저 변환 (예: '["Action", "Drama"]' → ['Action', 'Drama'])
import ast
df2['genres_imdb'] = df2['genres_imdb'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(df2['genres_imdb']), columns=mlb.classes_)

df2 = pd.concat([df2.drop('genres_imdb', axis=1), genre_encoded], axis=1)


In [11]:
X = df2.drop('popularity_score', axis=1)
y = df2['popularity_score']

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, random_state=0)

'''xgb_reg = XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=0
)'''

xgb_reg = XGBRegressor(
    n_estimators=600,
    max_depth=4,
    learning_rate=0.04,
    subsample=0.7,
    colsample_bytree=0.9,
    reg_alpha=0.5,
    reg_lambda=1.5,
    gamma=0.1,
    min_child_weight=5,
    random_state=0,
    verbosity=0
)
xgb_reg.fit(X_train, y_train)

y_pred_train = xgb_reg.predict(X_train)
y_pred_test = xgb_reg.predict(X_test)

print("Train RMSE:", mean_squared_error(y_train, y_pred_train))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test))
print("Train MAE:", mean_absolute_error(y_train, y_pred_train))
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Train R2:", r2_score(y_train, y_pred_train))
print("Test R2:", r2_score(y_test, y_pred_test))

Train RMSE: 682780575077.6217
Test RMSE: 853033548305.3857
Train MAE: 319795.67862841644
Test MAE: 355909.41242509236
Train R2: 0.3881188065646144
Test R2: 0.19685622664281655


# 최고 성능

In [12]:
X = df2.drop('popularity_score', axis=1)
y = df2['popularity_score']

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, random_state=0)

xgb_reg = XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=0
)


xgb_reg.fit(X_train, y_train)

y_pred_train = xgb_reg.predict(X_train)
y_pred_test = xgb_reg.predict(X_test)

print("Train RMSE:", mean_squared_error(y_train, y_pred_train))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test))
print("Train MAE:", mean_absolute_error(y_train, y_pred_train))
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Train R2:", r2_score(y_train, y_pred_train))
print("Test R2:", r2_score(y_test, y_pred_test))

Train RMSE: 802445402672.8171
Test RMSE: 825559697694.8632
Train MAE: 336500.4334591944
Test MAE: 354922.1002214533
Train R2: 0.28087987769956324
Test R2: 0.2227232656260093


# LightGBM Regressor

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lightgbm import LGBMRegressor

In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

X = df2.drop('popularity_score', axis=1)
y = df2['popularity_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lgb_reg = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,           
    num_leaves=32,       
    min_child_samples=20,  
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,         
    reg_lambda=1.0,       
    random_state=0
)

lgb_reg.fit(X_train, y_train)

y_pred_train = lgb_reg.predict(X_train)
y_pred_test = lgb_reg.predict(X_test)

print("Train RMSE:", mean_squared_error(y_train, y_pred_train))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test))
print("Train MAE:", mean_absolute_error(y_train, y_pred_train))
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Train R2:", r2_score(y_train, y_pred_train))
print("Test R2:", r2_score(y_test, y_pred_test))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 212
[LightGBM] [Info] Number of data points in the train set: 16140, number of used features: 24
[LightGBM] [Info] Start training from score 313458.235849
Train RMSE: 766933713487.32
Test RMSE: 820621020146.5897
Train MAE: 328865.94567182584
Test MAE: 352627.2554182345
Train R2: 0.31270406185602995
Test R2: 0.22737310399332133
