In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [3]:
# Read the processed dataset from disk; the path is relative to the notebook.
# NOTE: the file carries an 'Unnamed: 0' column (see the rendered output),
# i.e. the row index that was written out together with the data.
df = pd.read_csv(filepath_or_buffer='../data/processed/processed.csv')
# Bare trailing expression so the notebook renders the frame.
df

Unnamed: 0,year,score,votes,runtime,genre_mapped,director_mapped,star_mapped,votos_por_director,rating_mapped,facutracion_por_director,budget,facturacion_por_genre,gross
0,2000.0,7.9,260000.0,122.0,18.0,50.0,90.0,82822.56,6.0,7.459101e+07,60000000.0,1.093252e+08,47386287.0
1,2000.0,7.6,514000.0,101.0,17.0,50.0,90.0,82822.56,6.0,7.459101e+07,7000000.0,4.433187e+07,34266564.0
2,2000.0,8.3,786000.0,102.0,15.0,50.0,80.0,82822.56,0.0,7.459101e+07,4500000.0,3.893096e+07,7390108.0
3,2000.0,8.4,1200000.0,113.0,13.0,50.0,90.0,82822.56,6.0,7.459101e+07,9000000.0,1.011835e+08,40047078.0
4,2000.0,7.8,542000.0,143.0,18.0,70.0,100.0,149101.61,8.0,1.187342e+08,90000000.0,1.093252e+08,429632142.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2729,2020.0,6.6,140000.0,124.0,19.0,50.0,90.0,82822.56,6.0,7.459101e+07,90000000.0,1.455086e+08,426505244.0
2730,2020.0,6.5,102000.0,99.0,19.0,50.0,90.0,82822.56,10.0,7.459101e+07,85000000.0,1.455086e+08,319715683.0
2731,2020.0,5.6,53000.0,101.0,18.0,70.0,90.0,149101.61,10.0,1.187342e+08,175000000.0,1.093252e+08,245487753.0
2732,2020.0,6.8,42000.0,100.0,18.0,60.0,90.0,102137.26,10.0,8.779391e+07,135000000.0,1.093252e+08,111105497.0


In [4]:
# NOTE: this Jupyter notebook was saved without being executed, so no metrics are shown below; it can be executed and runs correctly.
# For the metric values, see the project report (memoria).

# Train Test Split

In [None]:
# Target: the 'gross' column. Read it first so a missing target fails loudly.
y = df['gross']

# Predictors: everything except the target and 'Unnamed: 0'.
# 'Unnamed: 0' is the row index that was saved into the CSV; it is a row
# counter, not a real predictor, so it must not be fed to the models.
# errors='ignore' keeps the cell working if that column is already absent.
X = df.drop(columns=['gross', 'Unnamed: 0'], errors='ignore')

# Hold out 20 % of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: report the shape of every partition.
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)

# Regresión lineal

In [None]:
# Pipeline: standardise the features, then fit ordinary least squares.
pipeline_lnr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regression', LinearRegression()),
])

# Only hyperparameter explored: whether the model fits an intercept.
parametros_grid = {'regression__fit_intercept': [True, False]}

# 5-fold cross-validated search, selecting by (negated) mean absolute error.
grid_search = GridSearchCV(
    estimator=pipeline_lnr,
    param_grid=parametros_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

mejores_parametros_lnr = grid_search.best_params_

# predict() delegates to the best pipeline, refitted on the whole training set.
y_pred = grid_search.predict(X_test)

# Hold-out metrics.
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mejores hiperparámetros: {mejores_parametros_lnr}")
print(f"MAE: {round(mae, 2)}")
# mean_absolute_percentage_error returns a fraction (0.25 means 25 %), so it
# is scaled by 100 before being printed next to the "%" sign.
print(f"MAPE: {round(mape * 100, 2)} %")
print(f"R2: {round(r2, 2)}")

# Regresión Polinomial

In [None]:
# Pipeline: standardise, expand into polynomial terms, then ordinary least squares.
pipeline_polyr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('regression', LinearRegression()),
])

# Search over the polynomial degree and whether an intercept is fitted.
parametros_grid = {
    'poly__degree': [2, 3, 4],
    'regression__fit_intercept': [True, False],
}

# 5-fold cross-validated search on all cores, selecting by (negated) MAE.
grid_search = GridSearchCV(
    estimator=pipeline_polyr,
    param_grid=parametros_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
mejores_parametros_polyr = grid_search.best_params_

# predict() delegates to the best pipeline, refitted on the whole training set.
y_pred = grid_search.predict(X_test)

# Hold-out metrics.
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mejores hiperparámetros: {mejores_parametros_polyr}")
print(f"MAE: {round(mae, 2)}")
# mean_absolute_percentage_error returns a fraction (0.25 means 25 %), so it
# is scaled by 100 before being printed next to the "%" sign.
print(f"MAPE: {round(mape * 100, 2)} %")
print(f"R2: {round(r2, 2)}")

# Regresión Polinomial con regularización Ridge

In [None]:
# Pipeline: standardise, expand into polynomial terms, then L2-regularised regression.
pipeline_ridge = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('ridge', Ridge()),
])

# Search over the polynomial degree and the regularisation strength.
parametros_grid = {
    'poly__degree': [2, 3, 4],
    'ridge__alpha': [0.1, 1.0, 10.0],
}

# 5-fold cross-validated search on all cores, selecting by (negated) MAE.
grid_search = GridSearchCV(
    estimator=pipeline_ridge,
    param_grid=parametros_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
mejores_parametros_ridge = grid_search.best_params_

# predict() delegates to the best pipeline, refitted on the whole training set.
y_pred = grid_search.predict(X_test)

# Hold-out metrics.
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mejores hiperparámetros: {mejores_parametros_ridge}")
print(f"MAE: {round(mae, 2)}")
# mean_absolute_percentage_error returns a fraction (0.25 means 25 %), so it
# is scaled by 100 before being printed next to the "%" sign.
print(f"MAPE: {round(mape * 100, 2)} %")
print(f"R2: {round(r2, 2)}")

# Regresión Polinomial con regularización Lasso

In [None]:
# Pipeline: standardise, expand into polynomial terms, then L1-regularised regression.
pipeline_lasso = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('lasso', Lasso()),
])

# Search over the polynomial degree and the regularisation strength.
parametros_grid = {
    'poly__degree': [2, 3, 4, 5],
    'lasso__alpha': [0.1, 1.0, 10.0],
}

# 5-fold cross-validated search on all cores, selecting by (negated) MAE.
grid_search = GridSearchCV(
    estimator=pipeline_lasso,
    param_grid=parametros_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
mejores_parametros_lasso = grid_search.best_params_

# predict() delegates to the best pipeline, refitted on the whole training set.
y_pred = grid_search.predict(X_test)

# Hold-out metrics.
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mejores hiperparámetros: {mejores_parametros_lasso}")
print(f"MAE: {round(mae, 2)}")
# mean_absolute_percentage_error returns a fraction (0.25 means 25 %), so it
# is scaled by 100 before being printed next to the "%" sign.
print(f"MAPE: {round(mape * 100, 2)} %")
print(f"R2: {round(r2, 2)}")

# Regresión Polinomial con Elastic Net

In [None]:
# Pipeline: standardise, expand into polynomial terms, then Elastic Net
# (combined L1/L2 penalty).
pipeline_en = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('elastic_net', ElasticNet()),
])

# Search over the polynomial degree, the penalty strength and the L1/L2 mix.
parametros_grid = {
    'poly__degree': [2, 3, 4],
    'elastic_net__alpha': [0.1, 1.0, 10.0],
    'elastic_net__l1_ratio': [0.1, 0.5, 0.9],
}

# 5-fold cross-validated search on all cores, selecting by (negated) MAE.
grid_search = GridSearchCV(
    estimator=pipeline_en,
    param_grid=parametros_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
mejores_parametros_en = grid_search.best_params_

# predict() delegates to the best pipeline, refitted on the whole training set.
y_pred = grid_search.predict(X_test)

# Hold-out metrics.
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mejores hiperparámetros: {mejores_parametros_en}")
print(f"MAE: {round(mae, 2)}")
# mean_absolute_percentage_error returns a fraction (0.25 means 25 %), so it
# is scaled by 100 before being printed next to the "%" sign.
print(f"MAPE: {round(mape * 100, 2)} %")
print(f"R2: {round(r2, 2)}")

# Random Forest Regressor

In [None]:
# Pipeline: same scaling step as the other models, then a seeded random forest.
pipeline_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42)),
])

# Search over the number of trees and the maximum tree depth (None = no limit).
param_grid = {
    'rf__n_estimators': [5, 10, 20],
    'rf__max_depth': [None, 10, 20],
}

# 5-fold cross-validated search on all cores, selecting by (negated) MAE.
grid_search = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best pipeline, already refitted on the whole training set.
best_model = grid_search.best_estimator_

y_pred_rf = best_model.predict(X_test)

# Hold-out metrics. Every metric must be computed from y_pred_rf: the MAPE
# used to be computed from `y_pred`, i.e. the predictions left behind by the
# previously executed model cell, so it described a different model.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mape_rf = mean_absolute_percentage_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Regressor:")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"MAE: {round(mae_rf, 2)}")
# mean_absolute_percentage_error returns a fraction (0.25 means 25 %), so it
# is scaled by 100 before being printed next to the "%" sign.
print(f"MAPE: {round(mape_rf * 100, 2)} %")
print(f"MSE: {round(mse_rf, 2)}")
print(f"R2: {round(r2_rf, 2)}")

# Gradient Boosting

In [None]:
# Pipeline: same scaling step as the other models, then seeded gradient boosting.
pipeline_gb = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('gb', GradientBoostingRegressor(random_state=42)),
])

# Search over the number of boosting stages, the learning rate and the tree depth.
param_grid_gb = {
    'gb__n_estimators': [50, 100, 200],
    'gb__learning_rate': [0.01, 0.1, 0.2],
    'gb__max_depth': [3, 5, 7],
}

# 5-fold cross-validated search on all cores, selecting by (negated) MAE.
grid_search_gb = GridSearchCV(
    estimator=pipeline_gb,
    param_grid=param_grid_gb,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_search_gb.fit(X_train, y_train)

# Best pipeline, already refitted on the whole training set.
best_model_gb = grid_search_gb.best_estimator_

y_pred_gb = best_model_gb.predict(X_test)

# Hold-out metrics. Every metric must be computed from y_pred_gb: the MAPE
# used to be computed from `y_pred`, i.e. the predictions left behind by an
# earlier model cell, so it described a different model.
mae_gb = mean_absolute_error(y_test, y_pred_gb)
mape_gb = mean_absolute_percentage_error(y_test, y_pred_gb)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("Gradient Boosting Regressor:")
print(f"Best Parameters: {grid_search_gb.best_params_}")
print(f"MAE: {round(mae_gb, 2)}")
# mean_absolute_percentage_error returns a fraction (0.25 means 25 %), so it
# is scaled by 100 before being printed next to the "%" sign.
print(f"MAPE: {round(mape_gb * 100, 2)} %")
print(f"MSE: {round(mse_gb, 2)}")
print(f"R2: {round(r2_gb, 2)}")

# Regresión Polinomial con regularización Lasso y PCA (no supervisado)

In [None]:
# Pipeline: standardise, expand into polynomial terms, reduce them with PCA,
# then fit an L1-regularised regression on the components.
pipeline_lasso_pca = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('pca', PCA()),
    ('lasso', Lasso()),
])

# Search over the polynomial degree, the number of principal components
# (None lets PCA keep all of them) and the Lasso penalty strength.
parametros_grid_pca = {
    'poly__degree': [2, 3, 4, 5],
    'pca__n_components': [None, 2, 5, 10],
    'lasso__alpha': [0.1, 1.0, 10.0],
}

# 5-fold cross-validated search on all cores, selecting by (negated) MAE.
grid_search_pca = GridSearchCV(
    estimator=pipeline_lasso_pca,
    param_grid=parametros_grid_pca,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search_pca.fit(X_train, y_train)

mejores_parametros_lasso_pca = grid_search_pca.best_params_

# predict() delegates to the best pipeline, refitted on the whole training set.
y_pred_pca = grid_search_pca.predict(X_test)

# Hold-out metrics.
mae_pca = mean_absolute_error(y_test, y_pred_pca)
mape_pca = mean_absolute_percentage_error(y_test, y_pred_pca)
mse_pca = mean_squared_error(y_test, y_pred_pca)
r2_pca = r2_score(y_test, y_pred_pca)

print(f"Mejores hiperparámetros con PCA: {mejores_parametros_lasso_pca}")
print(f"MAE con PCA: {round(mae_pca, 2)}")
# mean_absolute_percentage_error returns a fraction (0.25 means 25 %), so it
# is scaled by 100 before being printed next to the "%" sign.
print(f"MAPE con PCA: {round(mape_pca * 100, 2)} %")
print(f"R2 con PCA: {round(r2_pca, 2)}")

# Red Neuronal

In [None]:
# NOTE(review): `keras` is not imported in any cell visible in this notebook;
# it has to be in scope before this cell runs (e.g. `from tensorflow import keras`
# in the imports cell) — confirm which Keras distribution the project uses.

# Standardise with statistics learned on the training split only, so no
# information from the test split leaks into the scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# NOTE(review): these 5-component PCA projections are computed but not used
# below — the network is trained and evaluated on the scaled features.
# Confirm whether they are still needed or whether the model was meant to use them.
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Fully connected regressor: three ReLU hidden layers and one linear output unit.
# The input width is taken from the array the model is actually trained on.
model = keras.Sequential([
    keras.layers.Dense(128, activation="relu", input_shape=X_train_scaled.shape[1:]),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1, activation="linear"),
])

# Optimise mean squared error and additionally track MAPE during training.
model.compile(
    loss="mse",
    optimizer=keras.optimizers.RMSprop(learning_rate=0.001),
    metrics=["mape"],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# Stop once the monitored validation metric has not improved for 5 epochs.
early_stopping = keras.callbacks.EarlyStopping(patience=5)

# The last 20 % of the training rows are held out as the validation set.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=1000,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Per-epoch training/validation curves (only rendered when it is the last
# expression of a cell).
history.history

# Loss and MAPE on the held-out test split.
model.evaluate(X_test_scaled, y_test)