Imports & Data Preprocessing


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

path = './data/movies_dataset.csv'
df = pd.read_csv(path)

# Separate the predictors from the regression target.
target_col = 'Global_BoxOfficeUSD'
y = df[target_col]
X = df.drop(columns=[target_col])

# Split BEFORE any transformation, so that everything fitted afterwards
# (e.g. the scaler) only ever sees training rows.
# 20% of the rows go to the test set; the fixed random_state keeps the
# shuffled split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# --- Training-set preprocessing -------------------------------------------

# Drop the columns that were identified during data cleaning as highly
# correlated with the target 'Global_BoxOfficeUSD'.
# errors='ignore' makes this a no-op for any column missing from the CSV, and
# drop() always returns a new frame, so X_train is never the raw split output.
correlated_cols = [
    'US_BoxOfficeUSD',
    'Opening_Day_SalesUSD',
    'One_Week_SalesUSD'
]
X_train = X_train.drop(columns=correlated_cols, errors='ignore')

# Extract year, month and day from the release date (useful to detect seasonal
# patterns). Dates are parsed day-first; errors='coerce' turns unparseable
# values into NaT, which yields NaN in the three derived columns.
train_release_date = pd.to_datetime(X_train['ReleaseDate'], dayfirst=True, errors='coerce')
X_train = X_train.assign(
    ReleaseDate=train_release_date,
    ReleaseDate_Year=train_release_date.dt.year,
    ReleaseDate_Month=train_release_date.dt.month,
    ReleaseDate_Day=train_release_date.dt.day,
)

# Standardise the numerical columns. The scaler is fitted on the training rows
# only and is reused (transform only) for the test set.
cols_to_scale = [
    'BudgetUSD',
    'NumVotesIMDb', 'NumVotesRT',
    'IMDbRating', 'RottenTomatoesScore',
    'ReleaseDate_Year', 'ReleaseDate_Month', 'ReleaseDate_Day'
]
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

# Drop the non-numerical columns that are not relevant for specific models.
# `to_drop` keeps only the names actually present and is reused when the test
# set is processed.
to_drop = [
    'MovieID',
    'Title',
    'Director',
    'LeadActor',
    'ReleaseYear',  # numerical but duplicated by ReleaseDate_Year
    'ReleaseDate'   # already decomposed into year / month / day above
]
to_drop = [c for c in to_drop if c in X_train_scaled.columns]
# Assigned unconditionally: the previous if/else left X_train_numerical
# undefined (NameError on the next statement) whenever the list was empty.
X_train_numerical = X_train_scaled.drop(columns=to_drop)

# There are not too many different genres/countries, so one-hot encode them.
# The default prefixes are the column names, giving 'Genre_*' and 'Country_*'.
X_train_numerical = pd.get_dummies(X_train_numerical, columns=['Genre', 'Country'])

# --- Test-set preprocessing: mirror the training-set transformations -------

# Drop the target-correlated columns. They are derived as "every raw column the
# processed training frame no longer has" instead of reusing `to_drop`: at this
# point `to_drop` holds the non-numerical column list, so using it here removed
# Title/Director/... from X_test while leaving the correlated columns in place.
dropped_in_train = X_test.columns.difference(X_train.columns)
X_test = X_test.drop(columns=dropped_in_train)

# ReleaseDate -> year / month / day (same parsing options as for the train set).
# assign() builds a new frame rather than writing into the split output.
test_release_date = pd.to_datetime(X_test['ReleaseDate'], dayfirst=True, errors='coerce')
X_test = X_test.assign(
    ReleaseDate=test_release_date,
    ReleaseDate_Year=test_release_date.dt.year,
    ReleaseDate_Month=test_release_date.dt.month,
    ReleaseDate_Day=test_release_date.dt.day,
)

# Scaling: transform only, reusing the statistics fitted on the training set.
X_test_scaled = X_test.copy()
X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# Drop the same non-numerical columns that were dropped from the training set.
X_test_numerical = X_test_scaled.drop(columns=to_drop, errors='ignore')

# One-hot encode Genre and Country ('Genre_*' / 'Country_*' columns).
X_test_numerical = pd.get_dummies(X_test_numerical, columns=['Genre', 'Country'])

# Align with the training columns: categories unseen in the test split are
# added as all-zero columns, categories unseen in training are discarded.
X_test_numerical = X_test_numerical.reindex(columns=X_train_numerical.columns, fill_value=0)

print("Final dataset shape:", X_train_numerical.shape)
X_train_numerical.head()

Final dataset shape: (799999, 26)


Unnamed: 0,BudgetUSD,IMDbRating,RottenTomatoesScore,NumVotesIMDb,NumVotesRT,ReleaseDate_Year,ReleaseDate_Month,ReleaseDate_Day,Genre_Action,Genre_Comedy,...,Country_Australia,Country_Canada,Country_China,Country_France,Country_Germany,Country_India,Country_Japan,Country_South Korea,Country_UK,Country_USA
566853,-0.241545,0.675644,1.148362,-0.190588,0.638026,-0.312189,1.299563,-0.651791,False,True,...,False,False,False,False,True,False,False,False,False,False
382311,-0.047463,-0.939364,-1.068053,-0.353971,-0.200939,-0.571845,-1.312673,-0.651791,False,False,...,True,False,False,False,False,False,False,False,False,False
241519,-0.306311,-1.141241,-0.840729,-0.27684,-0.234123,0.674506,-0.151679,-0.197554,False,False,...,False,False,False,False,False,False,False,False,False,True
930120,-0.347633,0.810228,0.239064,-0.321399,-0.233175,0.622574,-0.151679,0.483803,False,False,...,False,False,False,False,False,False,False,False,False,True
911069,-0.421552,0.271892,-0.272417,-0.348078,-0.197384,0.310987,-1.602922,0.710922,True,False,...,False,False,False,False,False,False,False,False,False,True


RANSAC

In [23]:
# RANSAC wrapped around an ordinary least-squares model.
base_model = LinearRegression()
ransac = RANSACRegressor(
    estimator=base_model,
    min_samples=0.95,         # a float is read as a fraction of the training rows per random subset
    max_trials=100,           # default value
    residual_threshold=None,  # default value
    random_state=123,         # fixed seed so the random subsets are reproducible
)

# Fit on the preprocessed training features; as the last expression of the
# cell, the fitted estimator is also displayed.
ransac.fit(X_train_numerical, y_train)

0,1,2
,estimator,LinearRegression()
,min_samples,0.95
,residual_threshold,
,is_data_valid,
,is_model_valid,
,max_trials,100
,max_skips,inf
,stop_n_inliers,inf
,stop_score,inf
,stop_probability,0.99

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [24]:
# Predict the target for the held-out test features with the fitted RANSAC model.
y_pred = ransac.predict(X_test_numerical)