## Modelo de regresión para calcular el precio

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
import wandb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
videogames_data = pd.read_csv('data/videogames_eda.csv')
videogames_data.head()

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,languages,release_date,owners_min,owners_max,owners_min_text,owners_max_text,release_year,price_category,total_reviews
0,570,dota 2,Valve,Valve,1777245,389764,"200,000,000 .. 500,000,000",39211,822,0.0,0.0,573082,2013-07-09,200000000,500000000,200M,500M,2013,Gratis o Muy Barato,2167009
1,730,counter-strike: global offensive,Valve,Valve,6892618,982541,"100,000,000 .. 200,000,000",29752,5360,0.0,0.0,1086164,2012-08-21,100000000,200000000,100M,200M,2012,Gratis o Muy Barato,7875159
2,578080,pubg: battlegrounds,"KRAFTON, Inc.","KRAFTON, Inc.",1333940,965634,"50,000,000 .. 100,000,000",24257,6344,0.0,0.0,490082,2017-12-21,50000000,100000000,50M,100M,2017,Gratis o Muy Barato,2299574
3,1063730,new world,Amazon Games,Amazon Games,191896,80619,"50,000,000 .. 100,000,000",10660,3893,37.19,37.19,12707,2021-09-28,50000000,100000000,50M,100M,2021,Caro,272515
4,440,team fortress 2,Valve,Valve,964115,62958,"50,000,000 .. 100,000,000",7732,319,0.0,0.0,85168,2007-10-10,50000000,100000000,50M,100M,2007,Gratis o Muy Barato,1027073


In [3]:
videogames_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1012 entries, 0 to 1011
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   appid            1012 non-null   int64  
 1   name             1012 non-null   object 
 2   developer        1012 non-null   object 
 3   publisher        1012 non-null   object 
 4   positive         1012 non-null   int64  
 5   negative         1012 non-null   int64  
 6   owners           1012 non-null   object 
 7   average_forever  1012 non-null   int64  
 8   median_forever   1012 non-null   int64  
 9   price            1012 non-null   float64
 10  initialprice     1012 non-null   float64
 11  languages        1012 non-null   int64  
 12  release_date     1012 non-null   object 
 13  owners_min       1012 non-null   int64  
 14  owners_max       1012 non-null   int64  
 15  owners_min_text  1012 non-null   object 
 16  owners_max_text  1012 non-null   object 
 17  release_year  

In [4]:
videogames_data = videogames_data.loc[videogames_data['release_year'] > 2000]

In [5]:
videogames_data.head()

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,languages,release_date,owners_min,owners_max,owners_min_text,owners_max_text,release_year,price_category,total_reviews
0,570,dota 2,Valve,Valve,1777245,389764,"200,000,000 .. 500,000,000",39211,822,0.0,0.0,573082,2013-07-09,200000000,500000000,200M,500M,2013,Gratis o Muy Barato,2167009
1,730,counter-strike: global offensive,Valve,Valve,6892618,982541,"100,000,000 .. 200,000,000",29752,5360,0.0,0.0,1086164,2012-08-21,100000000,200000000,100M,200M,2012,Gratis o Muy Barato,7875159
2,578080,pubg: battlegrounds,"KRAFTON, Inc.","KRAFTON, Inc.",1333940,965634,"50,000,000 .. 100,000,000",24257,6344,0.0,0.0,490082,2017-12-21,50000000,100000000,50M,100M,2017,Gratis o Muy Barato,2299574
3,1063730,new world,Amazon Games,Amazon Games,191896,80619,"50,000,000 .. 100,000,000",10660,3893,37.19,37.19,12707,2021-09-28,50000000,100000000,50M,100M,2021,Caro,272515
4,440,team fortress 2,Valve,Valve,964115,62958,"50,000,000 .. 100,000,000",7732,319,0.0,0.0,85168,2007-10-10,50000000,100000000,50M,100M,2007,Gratis o Muy Barato,1027073


### Seleccionamos nuestras variables de interés:
positive, negative, average_forever, owners_max, release_year y developer.

Probamos con Ridge y LinearRegression, pero finalmente nos quedamos con GadientBoostingRegressor

# GradientBoostingRegressor

In [6]:
# Definir las columnas numéricas que queremos escalar
variables_to_scale = ['positive', 'negative', 'average_forever', 'owners_max', 'release_year']

# Separar las variables independientes (X) y la variable objetivo (y)
X = videogames_data[['positive', 'negative', 'average_forever', 'owners_max', 'release_year', 'developer']]
y = videogames_data['price']

In [7]:
# Aplicar codificación one-hot a la columna 'developer'
X = pd.get_dummies(X, columns=['developer'], drop_first=True)


In [8]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Crear y ajustar el StandardScaler solo a las columnas numéricas
scaler = StandardScaler()
X_train[variables_to_scale] = scaler.fit_transform(X_train[variables_to_scale])
X_test[variables_to_scale] = scaler.transform(X_test[variables_to_scale])

### Modelo

In [10]:
gb_model_optimized = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=1,
    min_samples_split=4,
    random_state=42
)


### Entrenar modelo

In [11]:
gb_model_optimized.fit(X_train, y_train)


### Evaluar Modelo

In [12]:
y_pred_gb = gb_model_optimized.predict(X_test)


mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting MSE: {mse_gb}")
print(f"Gradient Boosting R2: {r2_gb}")


Gradient Boosting MSE: 129.17886259548408
Gradient Boosting R2: 0.250422932062333


### Exportamos modelo

In [79]:
# with open('regression_price.pkl', 'wb') as file:
#     pickle.dump(gb_model_optimized, file)

### Prueba

In [25]:
# Ejemplo de datos de entrada para un nuevo juego
data = {
    'positive': [30500],  # Número de reseñas positivas
    'negative': [400],   # Número de reseñas negativas
    'average_forever': [6000],  # Tiempo promedio jugado en minutos
    'owners_max': [575000],  # Estimación máxima de propietarios
    'release_year': [2027],  # Año de lanzamiento
    'developer': ['Valve']
    
}

# Convertir el diccionario en un DataFrame de pandas
input_data = pd.DataFrame(data)


In [27]:
# Escalar solo las columnas numéricas
input_data_scaled = scaler.transform(input_data[variables_to_scale])


In [28]:
# Añadir las columnas faltantes de one-hot que están en X_train pero no en input_data, establecidas a 0
missing_cols = set(X_train.columns) - set(input_data.columns)
for c in missing_cols:
    input_data[c] = 0

  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0
  input_data[c] = 0


In [29]:
# Asegurar el mismo orden de columnas
input_data = input_data[X_train.columns]

In [30]:
# Ahora, aplicar el scaler solo a las variables numéricas (aquí asumimos que 'variables_to_scale' ya está definido)
input_data[variables_to_scale] = scaler.transform(input_data[variables_to_scale])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data[variables_to_scale] = scaler.transform(input_data[variables_to_scale])


In [31]:
# Finalmente, realizar la predicción
y_pred_input = gb_model_optimized.predict(input_data)

print(y_pred_input)

[22.89597735]


### Buscamos hiperparámetros óptimos

In [240]:
# Instanciar el modelo
gb_model = GradientBoostingRegressor(random_state=42)

# Definir el espacio de búsqueda de hiperparámetros
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2]
}


In [241]:
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

In [242]:
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [243]:
print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor score (MSE negativo):", grid_search.best_score_)

# Mejor modelo
best_gb_model = grid_search.best_estimator_


Mejores hiperparámetros: {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 300}
Mejor score (MSE negativo): -158.3712700988975


In [244]:
y_pred_best_gb = best_gb_model.predict(X_test)

mse_best_gb = mean_squared_error(y_test, y_pred_best_gb)
r2_best_gb = r2_score(y_test, y_pred_best_gb)

print(f"Mejor Gradient Boosting MSE: {mse_best_gb}")
print(f"Mejor Gradient Boosting R2: {r2_best_gb}")


Mejor Gradient Boosting MSE: 129.17886259548408
Mejor Gradient Boosting R2: 0.250422932062333


# PRUEBAS INICIALES PERO NOS QUEDAMOS CON GRADIENTBOOSTER

# Ridge

### Dividimos

In [169]:
X = videogames_data[['positive', 'negative', 'average_forever', 'owners_max', 'release_year']]
y = videogames_data['price']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Estandarizamos

In [170]:
scaler = StandardScaler()

# Ajustar en el conjunto de entrenamiento
X_train_scaled = scaler.fit_transform(X_train)
# Aplicar la transformación al conjunto de prueba
X_test_scaled = scaler.transform(X_test)


### Entrenamos

In [171]:
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)


### Evaluamos

In [172]:
y_pred = ridge_model.predict(X_test_scaled)

from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")


Mean Squared Error (MSE): 168.045892167177
R-squared (R2): 0.024891962982452998


### Ajuste hiperparámetros

In [173]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Modelo
ridge_model = Ridge()

# Espacio de hiperparámetros
param_grid = {
    'alpha': [0.1, 1, 10, 100, 1000],
}

grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1)

grid_search.fit(X_train_scaled, y_train)

print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor score (MSE negativo):", grid_search.best_score_)

# Mejor modelo
best_model = grid_search.best_estimator_




Fitting 5 folds for each of 5 candidates, totalling 25 fits
Mejores hiperparámetros: {'alpha': 1000}
Mejor score (MSE negativo): -217.37190818101794


In [174]:
y_pred = best_model.predict(X_test_scaled)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")


Mean Squared Error (MSE): 168.43870155521626
R-squared (R2): 0.022612635672791703


### Validación cruzada

In [175]:
# Utilizar el mejor modelo de Ridge
scores = cross_val_score(best_model, X_train_scaled, y_train, scoring='neg_mean_squared_error', cv=5)


In [164]:
# Convertir scores a positivo
mse_scores = -scores

# Calcular la media y la desviación estándar de los scores
print("Mean MSE:", mse_scores.mean())
print("Standard deviation:", mse_scores.std())


Mean MSE: 217.37190818101794
Standard deviation: 31.038044768750797


# LinearRegression

### Estandarizamos columnas

In [275]:
scaler = StandardScaler()
variables_to_scale = ['positive', 'negative', 'average_forever', 'owners_max']
videogames_data[variables_to_scale] = scaler.fit_transform(videogames_data[variables_to_scale])


### Asignamos columnas a 'X' e 'y' y hacemos división 80-20

In [276]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ajustar a los datos de entrenamiento y transformarlos
X_train_scaled = scaler.fit_transform(X_train)

# Transformar los datos de prueba usando las mismas estadísticas
X_test_scaled = scaler.transform(X_test)

### Creamos y entrenamos modelo

In [277]:
# Crear el modelo de regresión lineal
model = LinearRegression()

# Entrenar el modelo con el conjunto de entrenamiento
model.fit(X_train, y_train)

# Predecir los valores para el conjunto de prueba
y_pred = model.predict(X_test)

# Evaluar el modelo
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Error Absoluto Medio (MAE): {mae:.2f}')
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Error Absoluto Medio (MAE): 10.90
Mean Squared Error (MSE): 188.89679584457244
R-squared (R2): -0.09609810403257568


In [112]:
videogames_data.head()

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,languages,release_date,owners_min,owners_max,owners_min_text,owners_max_text,release_year,price_category,total_reviews
0,570,dota 2,Valve,Valve,1777245,389764,"200,000,000 .. 500,000,000",39211,822,0.0,0.0,573082,2013-07-09,200000000,500000000,200M,500M,2013,Gratis o Muy Barato,2167009
1,730,counter-strike: global offensive,Valve,Valve,6892618,982541,"100,000,000 .. 200,000,000",29752,5360,0.0,0.0,1086164,2012-08-21,100000000,200000000,100M,200M,2012,Gratis o Muy Barato,7875159
2,578080,pubg: battlegrounds,"KRAFTON, Inc.","KRAFTON, Inc.",1333940,965634,"50,000,000 .. 100,000,000",24257,6344,0.0,0.0,490082,2017-12-21,50000000,100000000,50M,100M,2017,Gratis o Muy Barato,2299574
3,1063730,new world,Amazon Games,Amazon Games,191896,80619,"50,000,000 .. 100,000,000",10660,3893,37.19,37.19,12707,2021-09-28,50000000,100000000,50M,100M,2021,Caro,272515
4,440,team fortress 2,Valve,Valve,964115,62958,"50,000,000 .. 100,000,000",7732,319,0.0,0.0,85168,2007-10-10,50000000,100000000,50M,100M,2007,Gratis o Muy Barato,1027073


# Modelo de clasificación para predecir si un juego será exitoso o no.

In [7]:
# Crear la columna 'is_successful' basada en el umbral definido
videogames_data['is_successful'] = (videogames_data['owners_max'] > 20000000).astype(int)

# Verificar el resultado
print(videogames_data[['owners_max', 'is_successful']])

      owners_max  is_successful
0      500000000              1
1      200000000              1
2      100000000              1
3      100000000              1
4      100000000              1
...          ...            ...
1007     2000000              0
1008     2000000              0
1009     2000000              0
1010     2000000              0
1011     2000000              0

[1012 rows x 2 columns]


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Definir las características y la variable objetivo
X = videogames_data[['positive', 'negative', 'average_forever', 'price', 'developer']]  # Excluimos 'owners_max' de las características
y = videogames_data['is_successful']


column_transformer = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['developer']),
    ('scaler', StandardScaler(), ['positive', 'negative', 'average_forever', 'price'])
])

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

model = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

model.fit(X_train, y_train)


In [11]:
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9852216748768473
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       198
           1       1.00      0.40      0.57         5

    accuracy                           0.99       203
   macro avg       0.99      0.70      0.78       203
weighted avg       0.99      0.99      0.98       203



Probamos el modelo

In [13]:
new_game_data = {
    'positive': [50000],  # Número de reseñas positivas
    'negative': [5000],   # Número de reseñas negativas
    'average_forever': [2000],  # Tiempo promedio jugado en minutos
    'price': [20],  # Precio del juego
    'developer': ['Amazon Games']  # Desarrollador del juego
}

# Convertir el diccionario a DataFrame
new_game_df = pd.DataFrame(new_game_data)

# Usar el pipeline completo para hacer la predicción
predicted_owners_category = model.predict(new_game_df)

print("La categoría de propietarios predicha es:", predicted_owners_category)

La categoría de propietarios predicha es: [0]


Exportamos modelo

In [15]:
# with open('classsification_success.pkl', 'wb') as file:
#     pickle.dump(model, file)