In [2]:
from pathlib import Path

import pandas as pd
from textblob import TextBlob

In [3]:
# Load the 'user_reviews' dataset that the sentiment analysis runs on.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a saved index — confirm whether it is needed.
path = '/content/drive/MyDrive/data/user_reviews.csv'
df_reviews = pd.read_csv(
    path,
    encoding='utf-8',
    lineterminator='\n',
)
df_reviews.head()

Unnamed: 0,item_id,helpful,recommend,review,user_id,posted_year
0,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479,2011
1,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479,2011
2,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479,2011
3,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,js41637,2014
4,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,js41637,2013


In [4]:
# TextBlob is used for the NLP sentiment analysis because it is considered easy to use and very intuitive.

def sentiment_analysis(review):
    """Classify a review's sentiment as 0 (negative), 1 (neutral) or 2 (positive).

    Parameters
    ----------
    review : str or missing value
        Review text. Anything `pd.isnull` reports as missing is treated as neutral.

    Returns
    -------
    int
        0 when TextBlob's polarity is below zero, 2 when it is above zero,
        and 1 when it is exactly zero or the review is missing.
    """
    # A missing review carries no signal, so it is labelled neutral.
    if pd.isnull(review):
        return 1

    # Polarity of the review text as computed by TextBlob.
    score = TextBlob(review).sentiment.polarity

    if score > 0:
        return 2
    if score < 0:
        return 0
    return 1

In [5]:
# Label every review by applying the sentiment function to the 'review' column.
df_reviews['sentiment_analysis'] = df_reviews['review'].apply(sentiment_analysis)

# Build a frame without the raw 'review' text column.
df_reviews_sa = df_reviews.drop(columns=['review'])
df_reviews_sa.head()

Unnamed: 0,item_id,helpful,recommend,user_id,posted_year,sentiment_analysis
0,1250,No ratings yet,True,76561197970982479,2011,2
1,22200,No ratings yet,True,76561197970982479,2011,2
2,43110,No ratings yet,True,76561197970982479,2011,2
3,251610,15 of 20 people (75%) found this review helpful,True,js41637,2014,2
4,227300,0 of 1 people (0%) found this review helpful,True,js41637,2013,0


In [6]:
# Separate DataFrames are built for each function that the API will consume later.
#
# The 'PlayTimeGenre' and 'UserForGenre' functions need these columns:
#   - 'release_year', 'genres', 'item_id' from the 'steam_games' dataset
#   - 'item_id', 'item_name', 'playtime_forever', 'user_id' from the 'user_items' dataset

# Load the 'steam_games' dataset.
path = '/content/drive/MyDrive/data/steam_games.csv'
df_games = pd.read_csv(
    path,
    encoding='utf-8',
    lineterminator='\n',
)
df_games.head()

Unnamed: 0,publisher,genres,item_name,tags,specs,price,early_access,item_id,developer,release_year
0,Kotoshiro,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",['Single-player'],4.99,False,761140,Kotoshiro,2018
1,"Making Fun, Inc.","['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,"['Free to Play', 'Strategy', 'Indie', 'RPG', '...","['Single-player', 'Multi-player', 'Online Mult...",0.0,False,643980,Secret Level SRL,2018
2,Poolians.com,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians,"['Free to Play', 'Simulation', 'Sports', 'Casu...","['Single-player', 'Multi-player', 'Online Mult...",0.0,False,670290,Poolians.com,2017
3,彼岸领域,"['Action', 'Adventure', 'Casual']",弹炸人2222,"['Action', 'Adventure', 'Casual']",['Single-player'],0.99,False,767400,彼岸领域,2017
4,unknown,"['Action', 'Indie', 'Casual', 'Sports']",Log Challenge,"['Action', 'Indie', 'Casual', 'Sports']","['Single-player', 'Full controller support', '...",2.99,False,773570,unknown,-1


In [7]:
# Load the 'user_items' dataset.
path = '/content/drive/MyDrive/data/user_items.csv'
df_items = pd.read_csv(
    path,
    encoding='utf-8',
    lineterminator='\n',
)
df_items.head()

Unnamed: 0,item_id,item_name,playtime_forever,user_id,items_count
0,10,Counter-Strike,6,76561197970982479,277
1,20,Team Fortress Classic,0,76561197970982479,277
2,30,Day of Defeat,7,76561197970982479,277
3,40,Deathmatch Classic,0,76561197970982479,277
4,50,Half-Life: Opposing Force,0,76561197970982479,277


In [8]:
# Keep only the columns that 'PlayTimeGenre()' and 'UserForGenre()' need.
df_games_subset = df_games.loc[:, ['item_id', 'genres', 'release_year']]
df_items_subset = df_items.loc[:, ['item_id', 'playtime_forever', 'user_id']]

# Join games and user items on their shared 'item_id' key.
df_user_genre = df_games_subset.merge(df_items_subset, on='item_id')
df_user_genre.head()

Unnamed: 0,item_id,genres,release_year,playtime_forever,user_id
0,282010,"['Action', 'Indie', 'Racing']",1997,5,UTNerd24
1,282010,"['Action', 'Indie', 'Racing']",1997,0,I_DID_911_JUST_SAYING
2,282010,"['Action', 'Indie', 'Racing']",1997,0,76561197962104795
3,282010,"['Action', 'Indie', 'Racing']",1997,0,r3ap3r78
4,282010,"['Action', 'Indie', 'Racing']",1997,13,saint556


In [9]:
# Print the column dtypes and memory usage of the merged games/items frame.
df_user_genre.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4244831 entries, 0 to 4244830
Data columns (total 5 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   item_id           int64 
 1   genres            object
 2   release_year      int64 
 3   playtime_forever  int64 
 4   user_id           object
dtypes: int64(3), object(2)
memory usage: 194.3+ MB


In [10]:
# Keep only the columns that 'UsersRecommend()' and 'UsersNotRecommend()' need.
df_games_subset = df_games.loc[:, ['item_id', 'item_name']]
df_reviews_subset = df_reviews.loc[
    :, ['item_id', 'recommend', 'sentiment_analysis', 'posted_year']
]

# Join games and reviews on their shared 'item_id' key.
df_recommend = df_games_subset.merge(df_reviews_subset, on='item_id')
df_recommend.head()

Unnamed: 0,item_id,item_name,recommend,sentiment_analysis,posted_year
0,282010,Carmageddon Max Pack,True,1,-1
1,70,Half-Life,True,0,2015
2,70,Half-Life,True,0,2011
3,70,Half-Life,True,0,2014
4,70,Half-Life,True,2,2013


In [11]:
# Print the columns, non-null counts and dtypes of the recommendation frame.
df_recommend.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53960 entries, 0 to 53959
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   item_id             53960 non-null  int64 
 1   item_name           53960 non-null  object
 2   recommend           53960 non-null  bool  
 3   sentiment_analysis  53960 non-null  int64 
 4   posted_year         53960 non-null  int64 
dtypes: bool(1), int64(3), object(1)
memory usage: 2.1+ MB


In [12]:
# Finally, keep only the columns that the 'sentiment_analysis()' function needs.
df_games_subset = df_games.loc[:, ['item_id', 'release_year']]
df_reviews_subset = df_reviews.loc[:, ['item_id', 'sentiment_analysis']]

# Inner-join games and reviews on their shared 'item_id' key.
df_sentiment = df_games_subset.merge(df_reviews_subset, on='item_id', how='inner')
df_sentiment.head()

Unnamed: 0,item_id,release_year,sentiment_analysis
0,282010,1997,1
1,70,1998,0
2,70,1998,0
3,70,1998,0
4,70,1998,2


In [13]:
# Print the columns, non-null counts and dtypes of the sentiment frame.
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53960 entries, 0 to 53959
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   item_id             53960 non-null  int64
 1   release_year        53960 non-null  int64
 2   sentiment_analysis  53960 non-null  int64
dtypes: int64(3)
memory usage: 1.6 MB


In [15]:
# Export to CSV.
dfs = [df_user_genre, df_recommend, df_sentiment, df_reviews_sa]
# File name (without extension) for each DataFrame, in the same order as `dfs`.
names = ['df_user_genre', 'df_recommend', 'df_sentiment', 'df_reviews_sa']

# A fresh runtime has no 'data' folder and to_csv does not create it, so make sure it exists.
Path('data').mkdir(parents=True, exist_ok=True)

# The loop variable has its own name so it does not overwrite the `dfs` list.
for df_out, n in zip(dfs, names):
    path = f'data/{n}.csv'
    df_out.to_csv(path, index=False, encoding='utf-8')
    print(f"'{n}' fue guardado correctamente en '{path}'")

'df_user_genre' fue guardado correctamente en 'data/df_user_genre.csv'
'df_recommend' fue guardado correctamente en 'data/df_recommend.csv'
'df_sentiment' fue guardado correctamente en 'data/df_sentiment.csv'
'df_reviews_sa' fue guardado correctamente en 'data/df_reviews_sa.csv'
