Imports & Data Preprocessing


In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression

path = './data/movies_dataset.csv'

df = pd.read_csv(path)

# Drop the columns that were found to be highly correlated with
# 'Global_BoxOfficeUSD' during data cleaning.
to_drop = [
    'US_BoxOfficeUSD',
    'Opening_Day_SalesUSD',
    'One_Week_SalesUSD'
]
to_drop = [c for c in to_drop if c in df.columns]
if len(to_drop) > 0:
    df = df.drop(columns=to_drop)
else:
    print("Columns have already been dropped")

# Split the release date into year, month and day; useful to detect seasonal patterns.
# errors='coerce' turns unparseable dates into NaT, so the derived columns are NaN for those rows.
# NOTE(review): dayfirst=True assumes day-first date strings in the CSV; confirm against the data.
df['ReleaseDate'] = pd.to_datetime(df['ReleaseDate'], dayfirst=True, errors='coerce')
df['ReleaseDate_Year'] = df['ReleaseDate'].dt.year
df['ReleaseDate_Month'] = df['ReleaseDate'].dt.month
df['ReleaseDate_Day'] = df['ReleaseDate'].dt.day

# Standardize the numerical columns on a copy, so `df` keeps the unscaled values.
# NOTE(review): the scaler is fitted on every row and also on 'Global_BoxOfficeUSD';
# if that column becomes the prediction target and the data is later split into
# train/test sets, confirm that fitting the scaler here is intended.
cols_to_scale = [
    'BudgetUSD', 'Global_BoxOfficeUSD',
    'NumVotesIMDb', 'NumVotesRT',
    'IMDbRating', 'RottenTomatoesScore',
    'ReleaseDate_Year', 'ReleaseDate_Month', 'ReleaseDate_Day'
]
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# Drop identifier / free-text columns that are not relevant for specific models.
to_drop = [
    'MovieID',
    'Title',
    'Director',
    'LeadActor',
    'ReleaseYear',  # numerical but duplicated by ReleaseDate_Year
    'ReleaseDate'   # already consumed to build the year, month and day columns
]
to_drop = [c for c in to_drop if c in df_scaled.columns]
if len(to_drop) == 0:
    print("Columns have already been dropped")
# Always build df_numerical, even when there is nothing left to drop; previously it
# was only assigned when at least one column was present, so the one-hot encoding
# below failed with a NameError otherwise. Dropping an empty list returns a copy.
df_numerical = df_scaled.drop(columns=to_drop)

# There are not too many different genres/countries, so one-hot encoding them is practical.
# A single call encodes both columns; the default prefix is the source column name,
# which yields 'Genre_*' followed by 'Country_*' columns.
df_numerical = pd.get_dummies(df_numerical, columns=['Genre', 'Country'])

print("Final dataset shape:", df_numerical.shape)
df_numerical.head()

Final dataset shape: (999999, 27)


Unnamed: 0,BudgetUSD,Global_BoxOfficeUSD,IMDbRating,RottenTomatoesScore,NumVotesIMDb,NumVotesRT,ReleaseDate_Year,ReleaseDate_Month,ReleaseDate_Day,Genre_Action,...,Country_Australia,Country_Canada,Country_China,Country_France,Country_Germany,Country_India,Country_Japan,Country_South Korea,Country_UK,Country_USA
0,-0.143388,-0.168734,-0.198429,-0.385189,-0.051046,1.015503,0.259531,0.719504,1.393737,False,...,False,False,True,False,False,False,False,False,False,False
1,-0.352047,-0.338906,-0.871556,-1.180984,-0.298093,-0.214846,-0.519251,-1.312188,-0.196415,False,...,False,False,False,False,False,False,False,False,False,True
2,-0.326073,-0.288184,-0.669618,-0.555717,-0.178923,0.684556,-1.40187,1.299987,-1.559403,False,...,False,False,False,False,False,False,False,False,False,True
3,-0.371452,-0.328322,0.542009,1.263244,-0.267639,-0.033306,-6.3e-05,0.429262,-1.105074,False,...,False,False,False,False,False,False,False,False,False,True
4,-0.395742,-0.346451,-0.265742,0.126394,-0.143734,-0.158285,1.194069,1.590228,0.144332,False,...,False,False,False,False,False,True,False,False,False,False


RANSAC

In [None]:
# RANSAC regressor wrapping an ordinary linear regression as its base estimator.
ransac = RANSACRegressor(
    LinearRegression(),
    min_samples=0.95,
    max_trials=100,           # default value, kept explicit
    residual_threshold=None,  # default value, kept explicit
    random_state=123,
)

# TODO: the target column still has to be chosen before fitting.
# ransac.fit(X, y)