In [264]:
import pandas as pd
data = pd.read_csv("steam_games.csv")

# Convierto los datos numéricos de la columna "price" de str a float manteniendo los no numéricos igual.
data["price"] = data["price"].str.upper()
data["price"] = pd.to_numeric(data["price"], errors='coerce').fillna(data["price"])

# Convierto los datos de valor "FREE" a "$0.00".
def transform_price(value):
    if isinstance(value, str) and "FREE" in value.upper():
        return 0
    elif isinstance(value, str):
        return pd.NA 
    else:
        return value  
    
data["price"] = data["price"].apply(transform_price)

data = data.dropna(subset=['price'])
data = data.reset_index(drop=True)

# Elimino los outliers de "price"
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1

filtro = (data['price'] >= Q1 - 1.5 * IQR) & (data['price'] <= Q3 + 1.5 *IQR)
data = data[filtro]

# Columnas que se van a utilizar
data = data[["price", "genres", "early_access", "metascore", "sentiment", "release_date"]]

# Convierto early access en 1 (Verdadero) y 0 (falso)
data['early_access'] = data['early_access'].astype(int)

# Le asigno la media a los valores faltantes de metascore
media_metascore = data["metascore"].mean()
data["metascore"].fillna(media_metascore, inplace=True)

# Asigno valores a los sentiments. Valores nulos o distintos de los que aparecen aca debajo los completo con "0"
sentiment_map = {
    "Overwhelmingly Negative": -3,
    "Very Negative": -2,
    "Negative": -1,
    "Mixed": 0,
    "Positive": 1,
    "Mostly Positive": 2,
    "Very Positive": 3,
    "Overwhelmingly Positive": 4
}

data['sentiment'] = data['sentiment'].map(sentiment_map, na_action='ignore').fillna(0).astype(int)

# Convierto las fechas de str a datetime, si no es posible se eliminan. Luego extraigo unicamente el año.
def is_convertible(date):
    try:
        pd.to_datetime(date)
        return True
    except:
        return False

data = data[data['release_date'].apply(is_convertible)]


data['release_date'] = pd.to_datetime(data['release_date']).dt.year
data.rename(columns={'release_date': 'year'}, inplace=True)

# Elimino todos los datos vacios.
data = data.dropna()

# Generacion de columnas para cada genero
import re  

data["genres"] = data["genres"].apply(lambda x: re.sub(r'[^\w\s,]','',x.strip()))
data["genres"] = data["genres"].str.title()
genres_data = data["genres"].str.get_dummies(',')
genres_data.columns = genres_data.columns.str.strip()
data = pd.concat([data, genres_data], axis=1)
data = data.drop(["genres"],axis=1)

# Modelo Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
import numpy as np

X = data.drop('price', axis=1)
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)


model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R^2:", r2)

with open("rmse.txt", "w") as file:
    file.write(str(rmse))


from joblib import dump
dump(model, "linear_model.pkl")
print("Modelo guardado en linear_model.pkl")

X_columns = data.drop('price', axis=1).columns.tolist()

# Guarda las columnas a un archivo json
import json

with open("columns.json", "w") as f:
    json.dump(X_columns, f)

MSE: 23.86110415621824
RMSE: 4.884782918023916
MAE: 3.8445851811236356
R^2: 0.16440173633474497
Modelo guardado en linear_model.pkl
