# Modelo Final

In [2]:
import pandas as pd

# 1. Cargar el dataset original limpio
df = pd.read_csv('../Modelo/gamesWcategoricas.csv')  # <-- Cambiá esta ruta según corresponda

# 2. Eliminar columnas dummy de idiomas (comienzan con "language_" o "Spoken_")
cols_to_drop = [col for col in df.columns if col.startswith('language_') or col.startswith('Spoken_')]
df = df.drop(columns=cols_to_drop)

# 3. Confirmar
print(f"Columnas eliminadas: {len(cols_to_drop)}")
print(f"Shape resultante del DataFrame: {df.shape}")


  df = pd.read_csv('../Modelo/gamesWcategoricas.csv')  # <-- Cambiá esta ruta según corresponda


Columnas eliminadas: 205
Shape resultante del DataFrame: (111452, 560)


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, precision_score, recall_score, f1_score,
    confusion_matrix, accuracy_score
)
from scipy.stats import ks_2samp

# 1. Cargar dataset
df = pd.read_csv('../Modelo/gamesWcategoricas.csv')

# 2. Crear feature: release_year (desde Release date directamente si ya está limpia)
df['release_year'] = df['Release date']

# 3. Crear feature categórica: price_range
def clasificar_precio(precio):
    if precio == 0:
        return 'free'
    elif precio < 10:
        return 'low'
    elif precio < 30:
        return 'medium'
    else:
        return 'high'
df['price_range'] = df['Price'].apply(clasificar_precio)

# 4. One-hot encoding para price_range
df = pd.get_dummies(df, columns=['price_range'], prefix='price')

# 5. Eliminar columnas dummy de idiomas
cols_to_drop = [col for col in df.columns if col.startswith('language_') or col.startswith('Spoken_')]
df = df.drop(columns=cols_to_drop)

# 6. Separar features y target
X = df.drop(columns=['Estimated owners'])
y = df['Estimated owners']

# 7. Eliminar columnas tipo object que no fueron codificadas
X = X.drop(columns=X.select_dtypes(include='object').columns)

# 8. División en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Escalar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 10. Entrenamiento del modelo
model = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
model.fit(X_train_scaled, y_train)

# 11. Predicciones
y_pred = model.predict(X_test_scaled)

# 12. Evaluación
print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))

print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))

print("Precision (macro):", round(precision_score(y_test, y_pred, average='macro'), 3))
print("Recall (macro):", round(recall_score(y_test, y_pred, average='macro'), 3))
print("F1 Score (macro):", round(f1_score(y_test, y_pred, average='macro'), 3))

print("Precision (weighted):", round(precision_score(y_test, y_pred, average='weighted'), 3))
print("Recall (weighted):", round(recall_score(y_test, y_pred, average='weighted'), 3))
print("F1 Score (weighted):", round(f1_score(y_test, y_pred, average='weighted'), 3))

print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))

# 13. KS Score
threshold = 100000
y_test_bin = (y_test >= threshold).astype(int)
clase_positiva = model.classes_[np.argmax(model.classes_ >= threshold)]
indice_clase = list(model.classes_).index(clase_positiva)
y_prob = model.predict_proba(X_test_scaled)[:, indice_clase]
ks_score = ks_2samp(y_prob[y_test_bin == 1], y_prob[y_test_bin == 0]).statistic
print("KS Score:", round(ks_score, 3))


  df = pd.read_csv('../Modelo/gamesWcategoricas.csv')


Matriz de Confusión:
[[ 3320  1113     1     0     0     0     0     0     0     0     0     0
      0]
 [  290 13492   174    38     8     5     2     3     1     2     1     0
      0]
 [    1  1360   224    65    20    19     2     1     0     1     0     0
      0]
 [    0   498   158    71    40    21     4     0     0     1     0     0
      0]
 [    0   218   103    74    72    70    11     2     0     1     0     1
      0]
 [    0    80    48    36    73   146    25     9     2     3     0     0
      0]
 [    0    11     7     4    10    75    43    20     7     4     0     0
      0]
 [    0     7     1     1     4    24    26    37    11     4     2     0
      0]
 [    0     0     0     0     0     3     4    21    23     8     0     0
      1]
 [    0     0     0     0     0     1     2     0     7     3     1     0
      0]
 [    0     0     0     0     1     0     0     0     3     0     0     1
      1]
 [    0     0     0     0     0     0     0     0     0     1     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
