<a href="https://colab.research.google.com/github/mjxxkxx/Movie-Dataset_Analysis_Project/blob/main/Movie_Rating_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the movie metadata (the path points at a Google Drive mount in Colab).
movies_metadata_path = '/content/drive/MyDrive/dataset_miniproject/movies_metadata.csv'
movies_metadata_df = pd.read_csv(movies_metadata_path, low_memory=False)

# Preprocessing: coerce raw columns to usable dtypes. With errors='coerce',
# values that cannot be parsed become NaT/NaN instead of raising.
movies_metadata_df['release_date'] = pd.to_datetime(movies_metadata_df['release_date'], errors='coerce')
movies_metadata_df['release_year'] = movies_metadata_df['release_date'].dt.year

# Missing-value handling: budget and revenue default to 0, popularity to its mean.
# The filled Series is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment, which
# emits a FutureWarning on pandas 2.x and silently does nothing once
# Copy-on-Write is active, so the rows would be dropped by dropna() below.
movies_metadata_df['budget'] = pd.to_numeric(movies_metadata_df['budget'], errors='coerce').fillna(0)
movies_metadata_df['revenue'] = pd.to_numeric(movies_metadata_df['revenue'], errors='coerce').fillna(0)
movies_metadata_df['popularity'] = pd.to_numeric(movies_metadata_df['popularity'], errors='coerce')
# NOTE(review): the mean is taken over the full dataset, i.e. before the
# train/test split, so test rows influence the imputed value.
movies_metadata_df['popularity'] = movies_metadata_df['popularity'].fillna(
    movies_metadata_df['popularity'].mean()
)

# Keep only the model inputs plus the target, and drop rows that still
# contain a missing value in any of those columns.
features = ['budget', 'revenue', 'popularity', 'runtime', 'release_year']
movies_metadata_df = movies_metadata_df[features + ['vote_average']].dropna()

# Separate the predictors (X) from the target (y).
y = movies_metadata_df['vote_average']
X = movies_metadata_df[features]

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features for the Linear Regression model. The scaler is
# fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. Linear Regression (trained and evaluated on the scaled features)
lr_model = LinearRegression().fit(X_train_scaled, y_train)
lr_predictions = lr_model.predict(X_test_scaled)

# 2. Random Forest (uses the unscaled features)
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# 3. XGBoost (uses the unscaled features)
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# Evaluation helper
def evaluate_model(predictions, y_test, model_name):
    """Print RMSE and R^2 for one model's predictions and return both metrics.

    Args:
        predictions: Predicted target values, aligned with ``y_test``.
        y_test: Ground-truth target values.
        model_name: Label printed in the report header.

    Returns:
        A ``(rmse, r2)`` tuple, so callers can reuse the metrics (for example
        in a comparison table) instead of recomputing them. Callers that
        only want the printed report can ignore the return value.
    """
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print(f"{model_name} Model Performance:")
    print(f"RMSE: {rmse:.4f}")
    print(f"R^2 Score: {r2:.4f}\n")
    return rmse, r2

# Evaluate every model on the held-out test set, in a fixed order.
predictions_by_model = {
    'Linear Regression': lr_predictions,
    'Random Forest': rf_predictions,
    'XGBoost': xgb_predictions,
}
for model_label, model_predictions in predictions_by_model.items():
    evaluate_model(model_predictions, y_test, model_label)

# Gather the per-model metrics for the performance comparison.
results = {
    'Model': list(predictions_by_model),
    'RMSE': [
        np.sqrt(mean_squared_error(y_test, preds))
        for preds in predictions_by_model.values()
    ],
    'R2 Score': [
        r2_score(y_test, preds)
        for preds in predictions_by_model.values()
    ],
}

# results_df = pd.DataFrame(results)

# plt.figure(figsize=(12, 6))
# sns.barplot(x='Model', y='RMSE', data=results_df)
# plt.title('Model Performance Comparison (RMSE)')
# plt.ylabel('RMSE')
# plt.show()

# plt.figure(figsize=(12, 6))
# sns.barplot(x='Model', y='R2 Score', data=results_df)
# plt.title('Model Performance Comparison (R^2 Score)')
# plt.ylabel('R^2 Score')
# plt.show()

Linear Regression Model Performance:
RMSE: 1.8469
R^2 Score: 0.0407

Random Forest Model Performance:
RMSE: 1.5244
R^2 Score: 0.3464

XGBoost Model Performance:
RMSE: 1.4577
R^2 Score: 0.4024

