In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re

# Preprocesamiento - Steam Reviews 2021
## Configuración general

In [2]:
# Notebook-wide display settings: show every column of wide frames and use
# seaborn's white grid style for all figures.
pd.options.display.max_columns = None
sns.set(style='whitegrid')

## 1. Obtención y carga inicial del dataset

In [3]:
# Load the complete Steam reviews dump into memory.
# low_memory=False asks pandas to infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-dtype columns.
df = pd.read_csv('/kaggle/input/steam-reviews-2021/steam_reviews.csv', low_memory=False)

# Quick overview of what was loaded. The section labels use real newlines
# ('\n'); the previous double-escaped form ('\\n') printed a literal "\n".
print('Shape:', df.shape)
print('\nDtypes:\n', df.dtypes)
print('\nHead:\n', df.head())

Shape: (21747371, 23)
\nDtypes:\n Unnamed: 0                          int64
app_id                              int64
app_name                           object
review_id                           int64
language                           object
review                             object
timestamp_created                   int64
timestamp_updated                   int64
recommended                          bool
votes_helpful                       int64
votes_funny                         int64
weighted_vote_score               float64
comment_count                       int64
steam_purchase                       bool
received_for_free                    bool
written_during_early_access          bool
author.steamid                      int64
author.num_games_owned              int64
author.num_reviews                  int64
author.playtime_forever           float64
author.playtime_last_two_weeks    float64
author.playtime_at_review         float64
author.last_played                float64


## 2. Inspección preliminar

In [5]:
# Preliminary health check of the raw frame, one printed section per question.

# Missing values per column.
raw_null_counts = df.isna().sum()
print('\nNulos:\n', raw_null_counts)

# Number of rows that are exact copies of an earlier row.
raw_duplicate_rows = df.duplicated().sum()
print('\nDuplicados:', raw_duplicate_rows)

# Share of positive vs. negative recommendations.
raw_recommended_share = df.loc[:, 'recommended'].value_counts(normalize=True)
print('\nRecommended:\n', raw_recommended_share)

# The ten most frequent review languages (value_counts is already sorted).
raw_top_languages = df.loc[:, 'language'].value_counts().iloc[:10]
print('\nIdiomas top 10:\n', raw_top_languages)


Nulos:
 Unnamed: 0                            0
app_id                                0
app_name                              0
review_id                             0
language                              0
review                            33748
timestamp_created                     0
timestamp_updated                     0
recommended                           0
votes_helpful                         0
votes_funny                           0
weighted_vote_score                   0
comment_count                         0
steam_purchase                        0
received_for_free                     0
written_during_early_access           0
author.steamid                        0
author.num_games_owned                0
author.num_reviews                    0
author.playtime_forever               2
author.playtime_last_two_weeks        2
author.playtime_at_review         25682
author.last_played                    2
dtype: int64

Duplicados: 0

Recommended:
 recommended
True     0.87470

## 3. Limpieza de datos

In [8]:
# Build the slim frame used for the KNN recommender from the raw reviews in `df`.

# Keep only the columns that are needed downstream.
cols = [
    'app_id',
    'app_name',
    'author.steamid',
    'recommended',
    'language',
    'author.playtime_forever',
    'author.playtime_at_review',
    'votes_helpful',
    'weighted_vote_score'
]

# Critical row filters, combined into a single boolean mask so the large raw
# frame is sliced once instead of three times. Rows with a missing playtime
# compare as False and are dropped, exactly as with sequential filtering.
keep_rows = (
    (df['language'] == 'english')                 # English reviews only
    & (df['author.playtime_forever'] >= 10)       # playtime_forever of at least 10 (minutes, per the original note)
    & (df['votes_helpful'] > 0)                   # reviews with at least one helpful vote
)

# .copy() gives df_f its own data, so the column assignments below never write
# into a slice of `df` (avoids SettingWithCopyWarning and ambiguous writes).
df_f = df.loc[keep_rows, cols].copy()

print("Shape tras filtros críticos:", df_f.shape)

# Fill remaining nulls instead of dropping rows.
df_f['author.playtime_at_review'] = df_f['author.playtime_at_review'].fillna(0)
df_f['weighted_vote_score'] = df_f['weighted_vote_score'].fillna(0)

# Simple label for KNN:
# rating = 0 when the review does not recommend the game,
#          weighted_vote_score + 1 when it does.
df_f['rating'] = df_f['recommended'].astype(int) * (df_f['weighted_vote_score'] + 1)

# Log-scale playtime to reduce the weight of players with thousands of hours.
# np.log1p(x) already computes log(1 + x), so no extra "+ 1" is applied, and
# the vectorized call replaces the row-by-row apply.
df_f['playtime_norm'] = np.log1p(df_f['author.playtime_forever'])

# Ultra-light final frame for export.
# NOTE(review): the export keeps raw 'author.playtime_forever' rather than
# 'playtime_norm' — confirm that is intended.
# NOTE(review): the follow-up inspection of df_f reports duplicated rows;
# consider de-duplicating before feeding this to KNN.
final = df_f[['app_id', 'author.steamid', 'author.playtime_forever', 'rating']]
print(f"Dataset final listo con {final.shape[0]} filas y {final.shape[1]} columnas")

Shape tras filtros críticos: (2806669, 9)
Dataset final listo con 2806669 filas y 4 columnas


In [9]:
# Persist the slim KNN-ready frame to the Kaggle working directory,
# leaving the DataFrame index out of the file.
final.to_csv(
    path_or_buf='/kaggle/working/steam_knn_ready.csv',
    index=False,
)

In [10]:
# Re-run the health check on the filtered frame to confirm the cleaning step.

# Missing values per column.
filtered_null_counts = df_f.isna().sum()
print('\nNulos:\n', filtered_null_counts)

# Rows that are exact copies of an earlier row in the reduced column set.
filtered_duplicate_rows = df_f.duplicated().sum()
print('\nDuplicados:', filtered_duplicate_rows)

# Share of positive vs. negative recommendations after filtering.
filtered_recommended_share = df_f.loc[:, 'recommended'].value_counts(normalize=True)
print('\nRecommended:\n', filtered_recommended_share)

# Most frequent languages (value_counts is already sorted, so take the first ten).
filtered_top_languages = df_f.loc[:, 'language'].value_counts().iloc[:10]
print('\nIdiomas top 10:\n', filtered_top_languages)


Nulos:
 app_id                       0
app_name                     0
author.steamid               0
recommended                  0
language                     0
author.playtime_forever      0
author.playtime_at_review    0
votes_helpful                0
weighted_vote_score          0
rating                       0
playtime_norm                0
dtype: int64

Duplicados: 13808

Recommended:
 recommended
True     0.771595
False    0.228405
Name: proportion, dtype: float64

Idiomas top 10:
 language
english    2806669
Name: count, dtype: int64
