In [21]:
import numpy as np
import pandas as pd
df = pd.read_csv('/content/movies_with_roi_features.csv')
df["ROI"] = np.log1p(df["ROI"])
display(df.head())


df = df.drop(columns=["Budget","Gross Worldwide"])

Unnamed: 0,Title,Rating,Popularity,Budget,Gross Worldwide,URL,Runtime_minutes,ROI,Director_main,Writer_main,Star_main,Genre_main
0,Închisoarea îngerilor,9.3,55.0,25000000.0,29334030.0,https://www.imdb.com/title/tt0111161/,142,0.776275,Frank Darabont,Stephen King,Tim Robbins,Epic
1,Cavalerul negru,9.1,92.0,185000000.0,1009243000.0,https://www.imdb.com/title/tt0468569/,152,1.864912,Christopher Nolan,Jonathan Nolan,Christian Bale,Action Epic
2,Începutul,8.8,84.0,160000000.0,839786500.0,https://www.imdb.com/title/tt1375666/,148,1.832368,Christopher Nolan,Christopher Nolan,Leonardo DiCaprio,Action Epic
3,Fight Club - Sala de lupte,8.8,148.0,63000000.0,101321000.0,https://www.imdb.com/title/tt0137523/,139,0.958687,David Fincher,Chuck Palahniuk,Edward Norton,Dark Comedy
4,Pulp Fiction,8.8,141.0,8000000.0,213928800.0,https://www.imdb.com/title/tt0110912/,154,3.322915,Quentin Tarantino,Quentin Tarantino,Tim Roth,Dark Comedy


# Task
Build and evaluate a CatBoost Regressor model to predict 'ROI' using the data from `/content/movies_with_roi_features.csv`.

## Prepare Data for CatBoost

### Subtask:
Separate the features (X) from the target variable (y), which is 'ROI'. Identify categorical features and split the data into training and testing sets.


**Reasoning**:
To prepare the data for CatBoost, I will separate features (X) and target (y), identify categorical columns, and then split the data into training and testing sets as per the instructions.



In [22]:
from sklearn.model_selection import train_test_split

# 1. Define the feature matrix X and the target vector y
X = df.drop('ROI', axis=1)
y = df['ROI']

# 2. Identify categorical features
categorical_features_indices = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Identified {len(categorical_features_indices)} categorical features: {categorical_features_indices}")

# 3. Split the X and y data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

Identified 6 categorical features: ['Title', 'URL', 'Director_main', 'Writer_main', 'Star_main', 'Genre_main']
X_train shape: (7798, 9)
X_test shape: (1950, 9)
y_train shape: (7798,)
y_test shape: (1950,)


## Train CatBoost Regressor

### Subtask:
Initialize and train a CatBoost Regressor model using the training data.


**Reasoning**:
The subtask requires initializing and training a CatBoost Regressor model. This involves importing the necessary class, instantiating the model with specified parameters for reproducibility and categorical features, and then training it using the prepared training data.



In [23]:
from catboost import CatBoostRegressor

# 1. Instantiate a CatBoostRegressor model
# 2. Pass the random_seed=42 parameter for reproducibility and set the cat_features parameter to the categorical_features_indices list.
cat_model = CatBoostRegressor(random_seed=42,
                              cat_features=categorical_features_indices,
                              verbose=0) # verbose=0 to suppress training output

print("CatBoostRegressor model initialized.")

# 3. Train the CatBoostRegressor model using the X_train and y_train datasets
cat_model.fit(X_train, y_train, cat_features=categorical_features_indices)

print("CatBoostRegressor model trained successfully.")

CatBoostRegressor model initialized.
CatBoostRegressor model trained successfully.


## Evaluate Model Performance

### Subtask:
Evaluate the trained CatBoost model's performance on the test set using relevant regression metrics (e.g., R-squared, Mean Absolute Error, Mean Squared Error).


**Reasoning**:
To evaluate the trained CatBoost model, I will first make predictions on the test set, then calculate and print the R-squared score, Mean Absolute Error (MAE), and Mean Squared Error (MSE) using `sklearn.metrics`.



In [24]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# 1. Use the trained cat_model to make predictions on the X_test dataset.
y_pred = cat_model.predict(X_test)

# 2. Calculate the R-squared score, Mean Absolute Error (MAE), and Mean Squared Error (MSE)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# 3. Print the calculated R-squared score, MAE, and MSE.
print(f"R-squared (R2) score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")

R-squared (R2) score: 0.2941
Mean Absolute Error (MAE): 0.4955
Mean Squared Error (MSE): 0.5331
