<a href="https://colab.research.google.com/github/EfeImrek/movie-sentiment-analysis/blob/main/analysis/ml_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Machine Learning: IMDb Score Prediction (Regression)

## Problem Definition
In this stage of the project, we formulate a supervised machine learning regression problem.
The goal is to predict a movie's IMDb score using basic movie metadata such as budget, gross revenue,
release year, and genre.

The target variable is:
- imdb_score

The input features include:
- budget
- gross
- title_year
- main_genre (derived: the first genre listed in the `genres` column)

This analysis aims to evaluate whether simple regression models can capture meaningful
relationships between movie metadata and IMDb ratings.


In [10]:
import io

from google.colab import files
import pandas as pd

# Open the Colab upload dialog; select movie_metadata.csv there.
uploaded = files.upload()

# Parse the bytes that were just uploaded rather than a fixed path on disk.
# When a file with the same name already exists, Colab stores the new upload
# under a different name (e.g. "movie_metadata (1).csv"), so reading
# "movie_metadata.csv" would silently load the older copy.
if uploaded:
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a copy on disk.
    df = pd.read_csv("movie_metadata.csv")

print(df.shape)
df.head()


Saving movie_metadata.csv to movie_metadata (1).csv
(5043, 28)


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [11]:
# Columns carried into the modelling frame (target + raw features)
use_cols = ["imdb_score", "budget", "gross", "title_year", "genres"]

# "genres" holds a "|"-separated list; its first entry becomes main_genre
genre_parts = df["genres"].astype(str).str.split("|")

# Build the modelling frame in one chain: pick the columns, attach main_genre,
# then discard the raw genres column
df_ml = (
    df[use_cols]
    .assign(main_genre=genre_parts.str[0])
    .drop(columns=["genres"])
)

# Quick look at the result and the per-column missing-value counts
print(df_ml.head())
print(df_ml.isna().sum())


   imdb_score       budget        gross  title_year   main_genre
0         7.9  237000000.0  760505847.0      2009.0       Action
1         7.1  300000000.0  309404152.0      2007.0       Action
2         6.8  245000000.0  200074175.0      2015.0       Action
3         8.5  250000000.0  448130642.0      2012.0       Action
4         7.1          NaN          NaN         NaN  Documentary
imdb_score      0
budget        492
gross         884
title_year    108
main_genre      0
dtype: int64


In [12]:
# Rows without a release year are removed outright
df_ml = df_ml.dropna(subset=["title_year"])

# Remaining gaps in the monetary columns are filled with that column's median.
# NOTE(review): these medians are computed over all remaining rows, i.e. before
# the train/test split performed later, so held-out rows influence the imputed
# values. Consider moving imputation into the model pipeline.
median_fill = {col: df_ml[col].median() for col in ("budget", "gross")}
df_ml = df_ml.fillna(value=median_fill)

# Confirm the new shape and that no missing values remain
print(df_ml.shape)
print(df_ml.isna().sum())


(4935, 5)
imdb_score    0
budget        0
gross         0
title_year    0
main_genre    0
dtype: int64


In [13]:
from sklearn.model_selection import train_test_split

# Target vector (y) is the IMDb score; the feature matrix (X) is every other column
target_col = "imdb_score"
y = df_ml[target_col]
X = df_ml.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(X_train.shape, X_test.shape)


(3948, 4) (987, 4)


In [15]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Feature columns grouped by how they are preprocessed
categorical_features = ["main_genre"]
numeric_features = ["budget", "gross", "title_year"]

# Preprocessing: one-hot encode the genre (handle_unknown="ignore"),
# pass the numeric columns through unchanged
genre_encoder = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", genre_encoder, categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

# Pipeline: preprocessing followed by ordinary least-squares regression
linreg_steps = [
    ("preprocess", preprocessor),
    ("model", LinearRegression()),
]
linreg_model = Pipeline(steps=linreg_steps)

# Fit on the training split, then predict the held-out rows
y_pred = linreg_model.fit(X_train, y_train).predict(X_test)

# Evaluation metrics on the test split (RMSE is the square root of the MSE)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Linear Regression Results")
print("MAE :", round(mae, 4))
print("RMSE:", round(rmse, 4))
print("R2  :", round(r2, 4))


Linear Regression Results
MAE : 0.8396
RMSE: 1.0787
R2  : 0.024


In [16]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.pipeline import Pipeline

# Random-forest pipeline using the same preprocessing recipe as the linear
# model (one-hot genre + numeric passthrough).
# clone() gives this pipeline its own unfitted copy of the transformer.
# Pipeline.fit fits its steps in place, so passing `preprocessor` directly
# would make rf_model.fit() also refit the transformer inside linreg_model.
rf_model = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("model", RandomForestRegressor(
        n_estimators=300,
        random_state=42,  # fixed seed so the forest is reproducible
        n_jobs=-1
    ))
])

# Fit on the training split and predict the held-out rows
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Same metrics as for the linear model, computed on the test split
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

print("Random Forest Results")
print("MAE :", round(rf_mae, 4))
print("RMSE:", round(rf_rmse, 4))
print("R2  :", round(rf_r2, 4))


Random Forest Results
MAE : 0.7279
RMSE: 0.9666
R2  : 0.2163
