# **EDA y ML**

---


Realizamos un Exploratory Data Analysis (EDA) para conocer en profundidad los datos y, a partir de él, desarrollar el modelo de Machine Learning (ML).

Importamos las librerías necesarias para el desarrollo del EDA y del modelo de ML.

In [53]:
import numpy as np
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

In [54]:
# Load the CSV produced by the ETL step and preview its first five rows.
df_filtrado = pd.read_csv(filepath_or_buffer=r'steam_games_ETL.csv')
df_filtrado.head()

Unnamed: 0,genres,release_date,tags,specs,price,sentiment,metascore
0,['Action'],1998-11-08,"['FPS', 'Classic', 'Action', 'Sci-fi', 'Single...","['Single-player', 'Multi-player', 'Valve Anti-...",9.99,Overwhelmingly Positive,96
1,['Strategy'],2006-07-06,"['Turn-Based Strategy', 'Strategy', 'Classic',...","['Single-player', 'Multi-player', 'Co-op']",6.99,Mostly Positive,84
2,['Strategy'],2006-07-06,"['Strategy', 'Turn-Based Strategy', 'Fantasy',...","['Single-player', 'Multi-player', 'Co-op']",6.99,Very Positive,80
3,"['Action', 'Indie', 'RPG']",2006-07-11,"['Multiplayer', 'Indie', 'Action', 'First-Pers...","['Single-player', 'Multi-player', 'Steam Tradi...",9.99,Very Positive,76
4,['Action'],2005-08-09,"['Action', 'Sci-fi', 'Story Rich', 'Singleplay...","['Single-player', 'Steam Trading Cards']",9.99,Mostly Positive,70


Contamos la cantidad de valores únicos de las columnas que contienen múltiples datos en listas, para verificar cuál nos podría servir para convertirla en columnas binarias.

La que usaremos en este caso será genres (géneros), ya que contiene solo 12 valores únicos, una cantidad razonable para utilizar; eliminaremos specs y tags.


In [55]:
# The list-valued columns are stored in the CSV as text such as "['Action', 'Indie']".
# ast.literal_eval only accepts Python literals, so a malformed or malicious cell
# cannot execute arbitrary code the way the builtin eval() would.
df_filtrado['genres'] = df_filtrado['genres'].apply(ast.literal_eval)
df_filtrado['specs'] = df_filtrado['specs'].apply(ast.literal_eval)
df_filtrado['tags'] = df_filtrado['tags'].apply(ast.literal_eval)

# explode() yields one row per list element, so unique() gives the distinct
# values that appear anywhere in the column.
count_genres = len(df_filtrado['genres'].explode().unique())
count_tags = len(df_filtrado['tags'].explode().unique())
count_specs = len(df_filtrado['specs'].explode().unique())

print(f"Cantidad de valores únicos en 'genres': {count_genres}")
print(f"Cantidad de valores únicos en 'tags': {count_tags}")
print(f"Cantidad de valores únicos en 'specs': {count_specs}")

Cantidad de valores únicos en 'genres': 12
Cantidad de valores únicos en 'tags': 320
Cantidad de valores únicos en 'specs': 27


In [56]:
# Discard the two list columns that will not be encoded as features.
df_filtrado.drop(['tags', 'specs'], axis=1, inplace=True)

La columna genres la convertimos en 12 columnas binarias (una por género).

In [57]:
# Rows without a genres value cannot be encoded, so discard them first.
df_filtrado.dropna(subset=['genres'], inplace=True)

# Collect every distinct genre that appears in any row.
all_genres = set()
for genres_list in df_filtrado['genres']:
    if isinstance(genres_list, list):  # skip anything that is not a parsed list
        all_genres.update(genres_list)

# Create one binary (0/1) column per genre. Iterating in sorted order keeps the
# column layout identical on every run; iterating the raw set would depend on
# string hashing and could reorder the columns between sessions.
for genre in sorted(all_genres):
    df_filtrado[genre] = df_filtrado['genres'].apply(
        # genre is bound as a default so each lambda tests its own genre.
        lambda genres_list, genre=genre: int(
            isinstance(genres_list, list) and genre in genres_list
        )
    )

# The original list column is no longer needed once it has been encoded.
df_filtrado.drop(columns=['genres'], inplace=True)

# df_filtrado now holds one binary column per genre, ready to be used in the regression.

Particionamos la fecha en día, mes y año para ver cuál tiene realmente más relación con el precio, y borramos release_date.

In [58]:
# Parse "release_date" into datetime values so its components can be read.
df_filtrado['release_date'] = pd.to_datetime(df_filtrado['release_date'])

# Store year, month and day of the release as three separate numeric columns.
df_filtrado['year'] = df_filtrado['release_date'].dt.year
df_filtrado['month'] = df_filtrado['release_date'].dt.month
df_filtrado['day'] = df_filtrado['release_date'].dt.day

# The original date column is redundant once it has been split.
df_filtrado.drop(columns=['release_date'], inplace=True)


Asignamos un número a cada sentimiento de manera ascendente para que sea más fácil visualizar la relación con el precio.

In [59]:
# Ordinal encoding of the review sentiment: the labels are listed from most
# negative to most positive and each label's position is its score (0 to 8).
rating_mapping = {
    label: score
    for score, label in enumerate((
        'Overwhelmingly Negative',
        'Very Negative',
        'Negative',
        'Mostly Negative',
        'Mixed',
        'Mostly Positive',
        'Positive',
        'Very Positive',
        'Overwhelmingly Positive',
    ))
}

# Replace the labels with their scores and make sure the column is numeric.
df_filtrado['sentiment'] = pd.to_numeric(
    df_filtrado['sentiment'].replace(rating_mapping)
)

# Cast metascore to integers.
df_filtrado['metascore'] = df_filtrado['metascore'].astype(int)

In [60]:
# Display the DataFrame to inspect the result of the transformations above.
df_filtrado

Unnamed: 0,price,sentiment,metascore,Racing,RPG,Strategy,Free to Play,Adventure,Action,Casual,Simulation,Massively Multiplayer,Indie,Early Access,Sports,year,month,day
0,9.99,8,96,0,0,0,0,0,1,0,0,0,0,0,0,1998,11,8
1,6.99,5,84,0,0,1,0,0,0,0,0,0,0,0,0,2006,7,6
2,6.99,7,80,0,0,1,0,0,0,0,0,0,0,0,0,2006,7,6
3,9.99,7,76,0,1,0,0,0,1,0,0,0,1,0,0,2006,7,11
4,9.99,5,70,0,0,0,0,0,1,0,0,0,0,0,0,2005,8,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2411,4.99,7,71,0,0,0,0,0,1,0,0,0,0,0,0,2001,6,1
2412,14.99,8,88,0,0,0,0,1,1,0,0,0,0,0,0,2002,8,28
2413,4.99,7,79,0,0,0,0,0,1,0,0,0,0,0,0,2003,5,1
2414,14.99,8,93,0,0,0,0,0,1,0,0,0,0,0,0,2004,3,16


Vemos la correlación de las variables con el precio.

In [61]:
# Compute the pairwise correlation between all columns; the 'price' row/column
# is the one used to decide which features to keep.
df_filtrado.corr()

Unnamed: 0,price,sentiment,metascore,Racing,RPG,Strategy,Free to Play,Adventure,Action,Casual,Simulation,Massively Multiplayer,Indie,Early Access,Sports,year,month,day
price,1.0,0.010211,0.152837,0.06849,0.040074,0.036647,-0.242948,-0.012192,0.014828,-0.140386,0.090925,-0.123851,-0.164301,-0.013236,0.16611,0.336382,0.029678,0.005164
sentiment,0.010211,1.0,0.518964,-0.01555,0.029081,-0.127298,-0.06835,0.056666,0.063881,0.044977,-0.11255,-0.068496,0.148055,-0.007985,-0.042706,0.020809,0.034342,-0.017505
metascore,0.152837,0.518964,1.0,-0.011243,0.04683,0.014435,0.012737,-0.045651,-0.022678,-0.022454,-0.056972,0.024433,-0.082079,0.019179,0.014236,-0.113248,0.036021,0.01399
Racing,0.06849,-0.01555,-0.011243,1.0,-0.093918,-0.095345,-0.032695,-0.107168,-0.060375,-0.015686,0.064287,-0.010419,-0.084543,-0.00553,0.353052,0.008089,0.014104,-0.020536
RPG,0.040074,0.029081,0.04683,-0.093918,1.0,0.012279,0.10075,-0.028815,-0.041775,-0.077783,-0.056372,0.162174,0.015502,-0.014071,-0.084442,0.088985,0.01204,0.035752
Strategy,0.036647,-0.127298,0.014435,-0.095345,0.012279,1.0,0.022269,-0.321369,-0.322234,-0.048445,0.238025,-0.001532,-0.09442,0.015266,-0.025703,-0.074021,-0.028959,0.021952
Free to Play,-0.242948,-0.06835,0.012737,-0.032695,0.10075,0.022269,1.0,-0.060149,0.040978,-0.03748,-0.006836,0.617218,-0.066847,0.08212,-0.014476,0.034283,0.001853,-0.029558
Adventure,-0.012192,0.056666,-0.045651,-0.107168,-0.028815,-0.321369,-0.060149,1.0,-0.032495,0.063426,-0.187395,-0.041637,0.230027,0.008335,-0.129813,0.203466,0.007467,0.007294
Action,0.014828,0.063881,-0.022678,-0.060375,-0.041775,-0.322234,0.040978,-0.032495,1.0,-0.11683,-0.198694,0.016969,0.002445,0.029802,-0.068065,0.012313,-0.003415,0.014778
Casual,-0.140386,0.044977,-0.022454,-0.015686,-0.077783,-0.048445,-0.03748,0.063426,-0.11683,1.0,0.0183,-0.044755,0.16656,-0.009247,-0.004644,0.038163,-7.5e-05,-0.032583


Eliminaré sentiment, month y day, ya que no tienen una correlación significativa con el precio y mantenerlos implicaría pedir más datos en el input.

In [62]:
# Remove the features that were judged too weakly correlated with the price.
df_filtrado.drop(columns=['sentiment', 'month', 'day'], inplace=True)

En este caso utilizaremos el modelo de Machine Learning de regresión lineal polinómica, ya que queremos predecir una variable cuantitativa continua.

In [63]:
# Predictors are every column except the price; the price is the target.
X = df_filtrado.drop('price', axis=1)
y = df_filtrado['price']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Expand the predictors into degree-2 polynomial features. The transformer is
# fitted on the training rows only and then reused for the test rows.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit an ordinary linear regression on the expanded features.
poly_regression_model = LinearRegression()
poly_regression_model.fit(X_train_poly, y_train)

# Predict the held-out rows and measure the error (MSE and its square root, RMSE).
y_pred_poly = poly_regression_model.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
rmse_poly = (mse_poly ** 0.5)

print(
    f"Mean Squared Error (Polynomial Regression): {mse_poly}",
    f"Root Mean Squared Error (Polynomial Regression): {rmse_poly}",
    sep="\n",
)



Mean Squared Error (Polynomial Regression): 52.058911778111344
Root Mean Squared Error (Polynomial Regression): 7.215186191506866


Creamos el archivo para leerlo en el Main

In [64]:
# Persist the model-ready table (without the index) for the main application.
df_filtrado.to_csv(path_or_buf='steam_games_model.csv', index=False)

Creamos la función para predecir el precio.

In [68]:
def prediccion(year: int, generos: str, metascore: int) -> dict:
    """Predict the price of a game from its release year, genres and metascore.

    Uses objects created earlier in this notebook: ``X_train`` (for the column
    order), ``poly`` (the fitted polynomial transformer),
    ``poly_regression_model`` (the fitted regression) and ``rmse_poly``.

    Args:
        year: Release year of the game.
        generos: Comma-separated genre names, e.g. ``"Action, Indie"``.
            Whitespace around each name is ignored; names that are not in
            the list of known genres are ignored.
        metascore: Metascore of the game.

    Returns:
        A dict with the predicted price under ``"Precio"`` and the model's
        RMSE (rounded to two decimals) under ``"RMSE del Modelo"``.
    """
    # Strip each name so "Action, Indie" selects the same genres as "Action,Indie".
    input_genres = {name.strip() for name in generos.split(',')}

    # A "Free to Play" game costs nothing, so the model is not consulted.
    if 'Free to Play' in input_genres:
        return {
            "Precio": 0,
            "RMSE del Modelo": round(rmse_poly, 2)
        }

    available_genres = ['Indie', 'Early Access', 'Massively Multiplayer', 'Strategy', 'RPG', 'Action', 'Casual', 'Free to Play', 'Racing', 'Adventure', 'Simulation', 'Sports']

    # Build a one-row frame with the numeric inputs plus one 0/1 flag per genre.
    new_data = {
        'metascore': [metascore],
        'year': [year]
    }
    for genre in available_genres:
        new_data[genre] = [1 if genre in input_genres else 0]

    # Reorder the columns to match the layout the model was trained on.
    X_new = pd.DataFrame(new_data)[X_train.columns]

    # Reuse the transformer fitted during training to build the polynomial features.
    X_new_poly = poly.transform(X_new)

    # predict() returns an array; take the single prediction as a plain float.
    y_pred_new = float(poly_regression_model.predict(X_new_poly)[0])

    return {
        "Precio": round(y_pred_new, 2),
        "RMSE del Modelo": round(rmse_poly, 2)
    }