In [1]:
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import preprocessing
from sklearn import linear_model

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Metricas de clasificación
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedKFold



In [2]:
dataset = pd.read_csv("database_final2.csv")
dataset.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Director_Name,Runtime,Genres,Movie_Title,Plot_Keywords,Content_Rating,Budget,Aspect_Ratio,...,Director_Ratio,Keywords_Avg_Revenue,Keywords_Ratio,Content_Rating_Score,Studios_Avg_Movie_Revenue,Studios_Ratio,Lead_Actor_Avg_Movie_Revenue,Lead_Actor_Movie_Count,Lead_Actor_Ratio,Class
0,0,0,James Cameron,154.0,Action|Adventure|Fantasy|Sci-Fi,Avatar,avatar|future|marine|native|paraplegic,PG-13,105500000.0,1.78,...,7.07,218697000.0,5.18,2.0,310095700.0,4.79,338393100.0,4.0,6.08,1
1,1,1,Gore Verbinski,154.0,Action|Adventure|Fantasy,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...,PG-13,105500000.0,2.35,...,2.97,218697000.0,3.07,2.0,310095700.0,3.23,266429200.0,21.0,2.82,1
2,2,2,Christopher Nolan,154.0,Action|Thriller,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...,PG-13,105500000.0,2.35,...,4.21,218697000.0,4.12,2.0,310095700.0,3.22,272686500.0,13.0,3.3,1
3,3,3,Andrew Stanton,132.0,Action|Adventure|Sci-Fi,John Carter,alien|american civil war|male nipple|mars|prin...,PG-13,105500000.0,2.35,...,3.21,184061200.0,1.66,2.0,256171900.0,3.04,293582300.0,2.0,1.24,1
4,4,4,Sam Raimi,154.0,Action|Adventure|Romance,Spider-Man 3,sandman|spider man|symbiote|venom|villain,PG-13,105500000.0,2.35,...,3.78,218697000.0,4.09,2.0,310095700.0,2.88,338393100.0,5.0,3.86,1


In [3]:
# seleccionamos las variables que correlacionan con Class
features = ['Director_Avg_Movie_Revenue', 'Studios_Avg_Movie_Revenue','Keywords_Avg_Revenue', 'Lead_Actor_Avg_Movie_Revenue', 'Budget']


# target para la y
target = pd.DataFrame(dataset.pop('Class'), columns=['Class'])

In [4]:
dataset = dataset[features]

X = dataset.values
y = np.ravel(target.values) #unidimensión del target


Ejecución de GridSearchCV para conocer qué modelo de machine learning es el mejor para mis datos

Elección de hiperparámetros

In [None]:
# REGRESION LOGISTICA
grid_logreg = {                   
                     "penalty": ["l1","l2"], # Regularizaciones L1 y L2.
                     "C": [0.1, 0.5, 1.0, 5.0], # Cuanta regularizacion queremos
                     
                     "max_iter": [50,100,500],  # Iteraciones del Gradient Descent
                     
                     "solver": ["liblinear"]  
                    }


# KNN
grid_neighbors = {"n_neighbors": [3,5,7,9],       
                  "weights": ["uniform","distance"]  # Ponderar o no las clasificaciones en 
                                                     # función de la inversa de la distancia a cada vecino
                  }

# ARBOL DE DECISION
grid_arbol = {"max_depth":list(range(1,5)) # Profundidades del árbol. Cuanto más profundo, mas posibilidades de overfitting,
                                            # pero  mas preciso en entrenamiento.
              }

# RANDOM FOREST
grid_random_forest = {"n_estimators": [120], # El Random Forest no suele empeorar por exceso de
                                             # estimadores. A partir de cierto numero no merece la pena
                                             # perder el tiempo ya que no mejora mucho más la precisión.
                                             # Entre 100 y 200 es una buena cifra

                     
                     "max_depth": [3,4,5,6,10], # No le afecta tanto el overfitting como al decissiontree.
                                                      # Podemos probar mayores profundidades
                      
                     "max_features": ["sqrt", 3, 4] # Numero de features que utiliza en cada split.
                                                    # cuanto más bajo, mejor generalizará y menos overfitting.
                                                    
                     }


# SVM
grid_svm = {"C": [0.01, 0.1, 0.3, 0.5, 1.0, 3, 5.0, 15, 30], # Parametro de regularizacion
            "kernel": ["linear","rbf"], # Tipo de kernel, probar varios
            "gamma": [0.001, 0.1, "auto", 1.0, 10.0, 30.0] # Coeficiente de regulaizacion para los kernels
           }
           
# GRADIENT BOOSTING
grid_gradient_boosting = {"loss": ["deviance"], # Deviance suele ir mejor.
                          "learning_rate": [0.05, 0.1, 0.2, 0.4, 0.5],  # Cuanto más alto, mas aporta cada nuevo arbol
                          
                          "n_estimators": [20,50,100,200], # Cuidado con poner muchos estiamdores ya que vamos a
                                                           # sobreajustar el modelo
                          
                          "max_depth": [1,2,3,4,5], # No es necesario poner una profundiad muy alta. Cada nuevo
                                                    # arbol va corrigiendo el error de los anteriores.
                          
                          
                          "max_features": ["sqrt", 3, 4], # Igual que en el random forest
                          }



Pipelines

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Si solo es el modelo, no hará falta meterlo en un pipeline

rand_forest = RandomForestClassifier()

svm = Pipeline([("scaler",MinMaxScaler()),
                ("selectkbest",SelectKBest()),
                ("svm",SVC())
               ])


reg_log = Pipeline([("imputer",SimpleImputer()),
                    ("scaler",MinMaxScaler()),
                    ("reglog",LogisticRegression())
                   ])


grid_random_forest = {"n_estimators": [120],
                     "max_depth": [3,4,5,6,10],
                     "max_features": ["sqrt", 3, 4]                          
                     }


svm_param = {                    
            'selectkbest__k': [1,2,3],
            'svm__C': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 
            'svm__kernel': ["linear","rbf"],
            'svm__coef0': [-10.,-1., 0., 0.1, 0.5, 1, 10, 100],
            'svm__gamma': ('scale', 'auto')
            }


reg_log_param = {    
                 "imputer__strategy": ['mean', 'median', 'most_frequent'],
                 "reglog__penalty": ["l1","l2"], 
                 "reglog__C": np.logspace(0, 4, 10)
                }

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

In [None]:
# Almaceno en una lista de tuplas los modelos (nombre que le pongo, el modelo, hiperparametros)
models = [('rand_forest', rand_forest, grid_random_forest),
         ('svm', svm, svm_param),
         ('reg_log', reg_log, reg_log_param)]

# Declaro en un diccionario los pipelines e hiperparametros
models_gridsearch = {}

for i in models:
    models_gridsearch[i[0]] = GridSearchCV(i[1],
                                          i[2],
                                          cv=10,
                                          scoring="accuracy",
                                          verbose=1,
                                          n_jobs=-1)
    
    models_gridsearch[i[0]].fit(X_train, y_train)

Fitting 10 folds for each of 15 candidates, totalling 150 fits
Fitting 10 folds for each of 768 candidates, totalling 7680 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits


300 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\b2bch\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\b2bch\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\b2bch\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  Fil

No he logrado solucionar los errores aparecidos. Pese a esto, se ha obtenido un resultado: Random Forest es el modelo que mejor se ajusta a mis datos

In [None]:
#Resultado de los mejores modelos
best_grids = [(i, j.best_score_) for i, j in models_gridsearch.items()]

best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
0,rand_forest,0.884135
1,svm,0.795689
2,reg_log,0.794949


In [None]:
models_gridsearch['rand_forest'].best_estimator_ #mejor estimador para random forest

RandomForestClassifier(max_depth=10, max_features=3, n_estimators=120)

In [None]:
# La probamos en test
models_gridsearch['rand_forest'].best_estimator_.score(X_test, y_test)

0.891970802919708

In [None]:

# Guardar el modelo
import pickle

with open('finished_model.model', "wb") as archivo_salida:
    pickle.dump(models_gridsearch['rand_forest'].best_estimator_, archivo_salida)