In [1]:

import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from multiprocessing import Pool
from transformers import pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq
from memory_profiler import memory_usage
import math
from ast import literal_eval

### Before starting our ETL, we did a prior processing of the original databases with the tool "codebeautyfy.org/json-fixer"
### To obtain valid json files.

In [2]:
# Constant file Paths [0] = Games [1] = reviews [2] = items
FILE_NAMES = ['.\\Datasets\\steam_games.json.gz','.\\Datasets\\user_reviews_fixed.json.gz','.\\Datasets\\users_items_fixed.json.gz']

In [3]:
def file_to_dataframe(file):
    """
    Create a pandas dataframe from a JSON file.

    Parameters
    ----------
    file : str
        The path or URL of the JSON file.

    Returns
    -------
    file_data_frame : pd.DataFrame
        The dataframe created from the JSON file.

    Raises
    ------
    ValueError
        If the JSON file is not valid or cannot be read.
    """
    try:
        file_data_frame = pd.read_json(file, compression='gzip',lines=True)
        return file_data_frame
    except ValueError:
        file_data_frame = pd.read_json(file, compression='gzip')
        return file_data_frame
    except:
        raise ValueError('Something went wrong, Dataframe not created')

In [4]:
# Load Dataframe with Json Files
steam_games_df = file_to_dataframe(FILE_NAMES[0])
user_reviews_df = file_to_dataframe(FILE_NAMES[1])
user_items_df = file_to_dataframe(FILE_NAMES[2])


#### Null Treatment for dataframes

In [5]:
steam_games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


In [6]:
user_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [7]:
user_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  int64 
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.4+ MB


In [8]:
def drop_empty_rows(dataframe):
    """
    Drop rows from a dataframe that contain only missing values.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The dataframe to drop rows from.

    Returns
    -------
    pd.DataFrame
        The dataframe with empty rows dropped. The original dataframe is modified in place.
    """
    dataframe.dropna(how='all', inplace=True)
    return dataframe

# Function call to drop all the empty rows
steam_games_df = drop_empty_rows(steam_games_df)
user_reviews_df = drop_empty_rows(user_reviews_df)
user_items_df = drop_empty_rows(user_items_df)

#### Removing not used Columns

In [9]:
# It was found that app_name and title are essentially the same. Because title has empty values and app_name does not, it was decided to drop title.

def compare_name_title(steam_games_df):
    """
    Compare the app_name and title columns of a dataframe of Steam games.

    This function prints the number of null values in each column and displays
    a subset of the dataframe where the app_name and title are different and
    not null.

    Parameters
    ----------
    steam_games_df : pd.DataFrame
        The dataframe of Steam games to compare.

    Returns
    -------
    None
        The function does not return anything. It only prints and displays data.
    """ 
    
    print(f"Nulls in title : {steam_games_df['title'].isnull().sum()}")
    print(f"Nulls in app_name: {steam_games_df['app_name'].isnull().sum()}")
    
    compare_appname_title = steam_games_df[(steam_games_df['app_name'] != steam_games_df['title']) & (~steam_games_df['app_name'].isnull()) & (~steam_games_df['title'].isnull())]
    display(compare_appname_title[['app_name','title']])

compare_name_title(steam_games_df)

Nulls in title : 2050
Nulls in app_name: 2


Unnamed: 0,app_name,title
88390,Sam & Max 101: Culture Shock,Sam &amp; Max 101: Culture Shock
88393,Sam & Max 102: Situation: Comedy,Sam &amp; Max 102: Situation: Comedy
88419,Command & Conquer: Red Alert 3,Command &amp; Conquer: Red Alert 3
88492,Heroes of Might & Magic V: Hammers of Fate,Heroes of Might &amp; Magic V: Hammers of Fate
88494,Heroes of Might & Magic V: Tribes of the East,Heroes of Might &amp; Magic V: Tribes of the East
...,...,...
120181,Sam & Max 105: Reality 2.0,Sam &amp; Max 105: Reality 2.0
120182,Sam & Max 104: Abe Lincoln Must Die!,Sam &amp; Max 104: Abe Lincoln Must Die!
120183,Sam & Max 106: Bright Side of the Moon,Sam &amp; Max 106: Bright Side of the Moon
120208,Making History: The Calm & the Storm,Making History: The Calm &amp; the Storm


In [10]:
#Dropping columns that we won't use
# steam_Games | reviews_url: we won't use these urls for any of our analyses, data handling, or data visualization
#             | title: we found that the title is the same as the app_name, but with a lot of blank values. We decided to drop them.
#user_reviews | user_url: we won't use these urls for any of our analyses, data handling, or data visualization
#user_items   | user_url: we won't use these urls for any of our analyses, data handling, or data visualization

#Limitations: this is not currently planned for the project, but we could improve the database by scraping the data from the urls

steam_games_df = steam_games_df.drop('reviews_url', axis=1)
steam_games_df = steam_games_df.drop('title', axis=1)

# Drop 2 nulls in app_name
steam_games_df = steam_games_df.dropna(subset=['app_name'])

user_items_df = user_items_df.drop('user_url',axis=1)
user_reviews_df = user_reviews_df.drop('user_url', axis=1)

#### Validation of numerical columns

In [11]:
# Steam | Price
not_numeric = []

def not_numeric_values(value):
    """ 
    Check if a value is numeric or not.

    Parameters
    ----------
    value : any
        The value to be checked.

    Returns
    -------
    None
        If the value is numeric, return None.

    Raises
    ------
    ValueError
        If the value is not numeric, append it to the not_numeric list.
    TypeError
        If the value is of an invalid type, append -1 to the not_numeric list.
    """
    try : 
        num = float(value)
        return None
    except ValueError:
        not_numeric.append(value)
    except TypeError:
        not_numeric.append(-1)


def convert_price(price):
    """ 
    Convert a price string to a numerical value.

    Parameters
    ----------
    price : str
        The price string to be converted.

    Returns
    -------
    float or str
    If the price string is one of the predefined values that indicate free or no cost, return 0.0.
    If the price string is one of the predefined values that indicate a starting price, return the numerical value after the '$' sign.
    Otherwise, return the price string as it is. 
    """
    if price in ['Free Demo','Free to Play','Free To Play','Play for Free!','Free', 'Install Now', 'Play WARMACHINE: Tactics Demo', 
                 'Free Mod','Install Theme','Third-party','Play Now','Free HITMAN™ Holiday Pack','Install Now','Play the Demo','Free to Try',
                 'Free Movie','Free to Use']:
        return 0
    elif price == 'Starting at $499.00':
        return 499.00
    elif price == 'Starting at $449.00':
        return 449.00
    else:
        return price

# We apply the price conversion to all the data of type String
steam_games_df.loc[:,'price'] = steam_games_df['price'].apply(convert_price)

# Function used to find the non-numeric data
steam_games_df['price'].apply(not_numeric_values)        
print(not_numeric) 

# Fill the empty data with the mean of the column
# Decided to use the mean because it is less affected by outliers
# Decided not to drop these data to avoid affecting future functions based on the price of games

steam_games_df['price'] = steam_games_df['price'].fillna(steam_games_df['price'].median())
steam_games_df.info()
steam_games_df

[]
<class 'pandas.core.frame.DataFrame'>
Index: 32133 entries, 88310 to 120444
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28851 non-null  object 
 2   app_name      32133 non-null  object 
 3   url           32133 non-null  object 
 4   release_date  30067 non-null  object 
 5   tags          31971 non-null  object 
 6   specs         31464 non-null  object 
 7   price         32133 non-null  float64
 8   early_access  32133 non-null  float64
 9   id            32132 non-null  float64
 10  developer     28836 non-null  object 
dtypes: float64(3), object(8)
memory usage: 2.9+ MB


Unnamed: 0,publisher,genres,app_name,url,release_date,tags,specs,price,early_access,id,developer
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",[Single-player],4.99,0.0,761140.0,Kotoshiro
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...","[Single-player, Multi-player, Online Multi-Pla...",0.00,0.0,643980.0,Secret Level SRL
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...","[Single-player, Multi-player, Online Multi-Pla...",0.00,0.0,670290.0,Poolians.com
88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",[Single-player],0.99,0.0,767400.0,彼岸领域
88314,,,Log Challenge,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]","[Single-player, Full controller support, HTC V...",2.99,0.0,773570.0,
...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]","[Single-player, Steam Achievements]",1.99,0.0,773640.0,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]","[Single-player, Steam Achievements, Steam Clou...",4.99,0.0,733530.0,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]","[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...","[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns"


In [12]:
steam_games_df['release_date'] = pd.to_datetime(steam_games_df['release_date'], format='%Y-%m-%d', errors='coerce')
steam_games_df['year_of_release'] = steam_games_df['release_date'].dt.year
steam_games_df.head()

# I'm not going to touch the null values. If I need to, I'll fill them in with 2016, which is the average.
# Right now, I don't want to fill in the null values with the average because it will change the number of hours played per year.

Unnamed: 0,publisher,genres,app_name,url,release_date,tags,specs,price,early_access,id,developer,year_of_release
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",[Single-player],4.99,0.0,761140.0,Kotoshiro,2018.0
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...","[Single-player, Multi-player, Online Multi-Pla...",0.0,0.0,643980.0,Secret Level SRL,2018.0
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...","[Single-player, Multi-player, Online Multi-Pla...",0.0,0.0,670290.0,Poolians.com,2017.0
88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",[Single-player],0.99,0.0,767400.0,彼岸领域,2017.0
88314,,,Log Challenge,http://store.steampowered.com/app/773570/Log_C...,NaT,"[Action, Indie, Casual, Sports]","[Single-player, Full controller support, HTC V...",2.99,0.0,773570.0,,


#### FEATURING ENGINEERING

#### Reviews Sentiment Analysis

In [13]:
#  Downloads the VADER lexicon

# nltk.download("vader_lexicon")

In [14]:
def get_sentiment(review):
    """
    Calculate the sentiment score of a review.

    Parameters
    ----------
    review : str
        The review text to be analyzed.

    Returns
    -------
    float
        The sentiment score of the review, ranging from -1.0 (very negative) to 1.0 (very positive).

    Notes
    -----
    This function uses the SentimentIntensityAnalyzer class from the nltk.sentiment.vader module, which implements the VADER 
    (Valence Aware Dictionary and sEntiment Reasoner) algorithm for sentiment analysis .
    """
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(review)
    return sentiment['compound']

def get_user_sentiment(reviews):
    """"
    Calculate the sentiment score of each review in a list.

    Parameters
    ----------
    reviews : list of dict
        The list of reviews to be analyzed. 
        Each review is a dictionary with the following keys: 'review', 'posted', 'last_edited', 'item_id', and 'recommend'.

    Returns
    -------
    list of dict
        The list of dictionaries that contain the sentiment score of each review. 
        Each dictionary has the following keys: 'Posted: ', 'Last_Edited', 'item_id', 'recommend', and 'sentiment'. 
        The sentiment score is a float ranging from -1.0 (very negative) to 1.0 (very positive).
    """
    sentiments = []
    for review in reviews:
        sentiment = get_sentiment(review['review'])
        dict_review = [{'Posted: ':review['posted'], 'Last_Edited':review['last_edited'],'item_id':review['item_id'], 'recommend':review['recommend'] , 'sentiment':sentiment}]
        sentiments.append(dict_review)
    return sentiments

def add_sentiment_column(df):
    # Set the sentiment for the new column "sentiment"
    df['sentiment'] = user_reviews_df['reviews'].apply(get_user_sentiment)

def user_review_explode(review_df):
    """
    Transform a review_df into a output_df with normalized sentiment scores.

    Parameters
    ----------
    review_df : DataFrame
        The review_df to be transformed. It has two columns: 'user_id' and 'sentiment'. The 'sentiment' column contains lists of dictionaries with the keys: 'Posted: ', 'Last_Edited', 'item_id', 'recommend', and 'sentiment'.

    Returns
    -------
    output_df : DataFrame
    The output_df with normalized sentiment scores. It has six columns: 'user_id', 'Posted: ', 'Last_Edited', 'item_id', 'recommend', and 'sentiment'. The 'sentiment' column contains floats ranging from -1.0 (very negative) to 1.0 (very positive).
    """
    review_df = review_df[['user_id','sentiment']]
    review_df = review_df.explode('sentiment')
    review_df = review_df.explode('sentiment')
    hold_user = review_df[['user_id']]
    hold_user = hold_user.reset_index(drop=True)
    sentiment_exploded = pd.json_normalize(review_df['sentiment'])
    output_df = pd.concat([hold_user,sentiment_exploded], axis=1, join='inner')
    return output_df

In [15]:
# user_reviews_df

# Call the function to analyze and create the new "sentiment"
add_sentiment_column(user_reviews_df) 
user_reviews_df = user_review_explode(user_reviews_df)
user_reviews_df.head()


Unnamed: 0,user_id,Posted:,Last_Edited,item_id,recommend,sentiment
0,76561197970982479,"Posted November 5, 2011.",,1250,True,0.8481
1,76561197970982479,"Posted July 15, 2011.",,22200,True,0.2263
2,76561197970982479,"Posted April 21, 2011.",,43110,True,0.9117
3,js41637,"Posted June 24, 2014.",,251610,True,0.9566
4,js41637,"Posted September 8, 2013.",,227300,True,0.9708


In [16]:
# We work on a copy 
user_reviews_df_copy = user_reviews_df.copy()
# We had to create a new column because the .rename wasn't working
user_reviews_df_copy['Posted'] = user_reviews_df_copy['Posted: ']
user_reviews_df_copy = user_reviews_df_copy.drop(user_reviews_df_copy.columns[1], axis=1)

user_reviews_df_copy['Posted'] = user_reviews_df_copy['Posted'].astype(str)
user_reviews_df_copy['Last_Edited'] = user_reviews_df_copy['Last_Edited'].astype(str)
user_reviews_df_copy.head()

Unnamed: 0,user_id,Last_Edited,item_id,recommend,sentiment,Posted
0,76561197970982479,,1250,True,0.8481,"Posted November 5, 2011."
1,76561197970982479,,22200,True,0.2263,"Posted July 15, 2011."
2,76561197970982479,,43110,True,0.9117,"Posted April 21, 2011."
3,js41637,,251610,True,0.9566,"Posted June 24, 2014."
4,js41637,,227300,True,0.9708,"Posted September 8, 2013."


In [17]:
#####

def remove_string(string, word_to_remove):
  """"
  Remove a word from a string.

  Parameters
  ----------
  string : str
  The string to be modified.
  word_to_remove : str
  The word to be removed from the string.

  Returns
  -------
  str
  The modified string with the word_to_remove removed. If the string is None or empty, return the string as it is.
  """
  string = str(string)
  if string is not None and not '':
    return string.replace(word_to_remove, '')

def deal_with_string(string):
  """
  Modify a string based on its length.

  Parameters
  ----------
  string : str
    The string to be modified.

  Returns
  -------
  str
  """
  if string is not None and not '':
    split_string = string.split()
    if (len(split_string) == 1):
      #2021 was chosen because it is the latest release date for games. It is assumed that the review was written in the same year as the release.
      return '2021'
    if (len(split_string) == 2):
      return split_string[1]
    
def deal_with_las_edited(string):
  """
  Modify a string based on its length and content.

  Parameters
  ----------
  string : str
      The string to be modified.

  Returns
  -------
  str or None
  """
  if (string == None) or (string == ''):
    return None
  else:
    string = string.strip()
    split_string = string.split()
    if (len(split_string) == 2):
      return '2021'
    if (len(split_string) == 3):
      return split_string[2]
    return string

# List of words we want to remove to format the Date columns.
words_to_remove_list = ['January', ' February','March','April','May','June','July','August','September','October','November','December','.',',','Posted']
last_edited_remove_list = ['Last','Edited','edited',',','.','']

# Remove the days
numbers_to_remove = list(range(1,32))

# We remove everything that's not the date from Posted and Last_Edited
for word in words_to_remove_list:
  user_reviews_df_copy['Posted'] = user_reviews_df_copy['Posted'].apply(remove_string, word_to_remove=word)

for word_r in last_edited_remove_list:
  user_reviews_df_copy['Last_Edited'] = user_reviews_df_copy['Last_Edited'].apply(remove_string, word_to_remove=word_r)

# Apply functions to format the Date
user_reviews_df_copy['Posted'] = user_reviews_df_copy['Posted'].apply(deal_with_string)
user_reviews_df_copy['Last_Edited'] = user_reviews_df_copy['Last_Edited'].apply(deal_with_las_edited)

user_reviews_df_copy.head()

Unnamed: 0,user_id,Last_Edited,item_id,recommend,sentiment,Posted
0,76561197970982479,,1250,True,0.8481,2011
1,76561197970982479,,22200,True,0.2263,2011
2,76561197970982479,,43110,True,0.9117,2011
3,js41637,,251610,True,0.9566,2014
4,js41637,,227300,True,0.9708,2013


In [18]:
user_items_df.head()

Unnamed: 0,user_id,items_count,steam_id,items
0,76561197970982479,277,76561197970982480,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864384,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712560,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445856,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099488,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [19]:
# Decided to drop two more colums from user_items_df that we won't use.

user_items_df.drop('steam_id', axis=1)
user_items_df.drop('items_count', axis=1)

Unnamed: 0,user_id,steam_id,items
0,76561197970982479,76561197970982480,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,76561198035864384,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,76561198007712560,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,76561197963445856,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,76561198002099488,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...
88305,76561198323066619,76561198323066624,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,76561198326700688,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,76561198328759264,[]
88308,76561198329548331,76561198329548336,"[{'item_id': '304930', 'item_name': 'Unturned'..."


# PARQUETS

### Query 1

In [20]:
def get_frame_query1(steam_games_df):
    """"
    Extract a subset of columns from a steam_games_df.

    Parameters
    ----------
    steam_games_df : DataFrame
        The steam_games_df to be processed. It has multiple columns, including 'developer', 'price', and 'release_date'.

    Returns
    -------
    query1_parquet : DataFrame
        The query1_parquet with three columns: 'developer', 'price', and 'release_date'. It has the same number of rows as the steam_games_df.
    """
    query1_parquet = steam_games_df[['developer','price','release_date']]
    return query1_parquet

def api_query1_func(user, query1_parquet):
    """
    Test a query for the API that returns the count and free percent of items by year for a given developer.

    Parameters
    ----------
    user : str
    The name of the developer to be searched.
    query1_parquet : DataFrame
    The query1_parquet with three columns: 'developer', 'price', and 'release_date'. It has the same number of rows as the steam_games_df.

    Returns
    -------
    df_grouped : DataFrame
    """
    developers_year = query1_parquet.query("developer == @user")

    df_grouped = developers_year.groupby(query1_parquet['release_date'].dt.year).agg(
        count = ("developer", "count"), # count of items for that year
        free_percent = ("price", lambda x: round((x == 0).sum() / x.count() * 100, 2)) # percent of free items in price for that year
    )
    return df_grouped


In [21]:
# Call the function to retrieve neccesary data for the API
query1_parquet = get_frame_query1(steam_games_df)
query1_parquet.to_parquet('.\\Data\\API\\api_query1.parquet')

# Test API function
test_query1 = api_query1_func('Valve',query1_parquet)
print(test_query1.head())

              count  free_percent
release_date                     
1998.0            1           0.0
1999.0            1           0.0
2000.0            2           0.0
2001.0            1           0.0
2003.0            1           0.0


In [22]:
query1_parquet.head()

Unnamed: 0,developer,price,release_date
88310,Kotoshiro,4.99,2018-01-04
88311,Secret Level SRL,0.0,2018-01-04
88312,Poolians.com,0.0,2017-07-24
88313,彼岸领域,0.99,2017-12-07
88314,,2.99,NaT


### Query 2

In [23]:
# This first export of our data as a Parquet file is a test to see how it will behave in the API.

query_2_steam_parquet = steam_games_df[['price','id']]
query_2_steam_parquet.to_parquet('.\\Data\\query_2_steam.parquet')

query_2_items_parquet = user_items_df[['user_id','items']]
query_2_items_parquet.to_parquet('.\\Data\\query_2_items.parquet')

query_2_reviews_parquet = user_reviews_df[['user_id','recommend']]
query_2_reviews_parquet.to_parquet('.\\Data\\query_2_reviews.parquet')

In [24]:
query_2_items_parquet.head()

Unnamed: 0,user_id,items
0,76561197970982479,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


#### Limitations: The following functions can be easily optimized by not iterating over the dataframes
####              The functions were created as a test of queries from the API and need to be modified.

In [25]:
# Test users:

# userid = '76561197970982479'
# userid = 'erickoiv'
userid = 'jjas01'
# userid = '76561198079680944'

def read_parquets ():
    # Reads parquets from folder
    query_2_steam = pd.read_parquet('.\\Data\\query_2_steam.parquet')
    query_2_reviews = pd.read_parquet('.\\Data\\query_2_reviews.parquet')
    return query_2_steam, query_2_reviews

def load_parquet_in_batches(file_path,user_id):
    """Loads a Parquet file in batches.

    Args:
    file_path: The path to the Parquet file.
    chunk_size: The size of each batch.

    Returns:
    A list of Pandas DataFrames.
    """
    parquet_file = pq.ParquetFile(file_path)
    dataframes = []
    for batch in parquet_file.iter_batches(batch_size=5000):
        dataframes = batch.to_pandas()
        result = get_items_names(dataframes,user_id)
        if result is not None :
            return result
    return result

 
def get_items_names(items_dataframe,user_id):
    """"
    Get the items names for a given user_id from an items_dataframe.

    Parameters
    ----------
    items_dataframe : DataFrame
    The items_dataframe to be searched. It has multiple columns, including 'user_id' and 'item_name'.
    user_id : int
    The user_id to be matched.

    Returns
    -------
    DataFrame or None
    """
    items_dataframe = items_dataframe.set_index("user_id")    
    if user_id in items_dataframe.index:
        found_data_frame = items_dataframe.loc[user_id]

        if not found_data_frame.empty:
            return found_data_frame
    else:
        return None
    return None

def get_items_id(row_df):
    """
    Get the items ids from a row_df.

    Parameters
    ----------
    row_df : DataFrame
    The row_df to be processed. It has one row and one column. 
    The column contains a list of dictionaries, each with an 'item_id' key and a value.

    Returns
    -------
    list of int
    """
    items= []
    user_items = row_df.iloc[0]

    for item in user_items:
        items.append(item['item_id'])
    return items

def get_waste(items_list):
    """
    Calculate the total price of the items in a list.

    Parameters
    ----------
    items_list : list of str
    The items_list to be processed. It contains the ids of the items as strings.

    Returns
    -------
    float
    """
    waste = 0
    prices = []
    for item in items_list:
        intintem=int(item)
        price = query_2_steam.query("id == @intintem")

        if len(price['price'].values) > 0:
            waste = waste+price['price'].values[0]
            prices.append((price['price'].values))
    return waste


def percent_reviews(percent_user_id):
    """
    Calculate the percentage of positive reviews and the number of items for a given user_id.

    Parameters
    ----------
    percent_user_id : int
    The user_id to be matched.

    Returns
    -------
    percent : float
    """
    try:
        recommends = query_2_reviews.query("user_id == @percent_user_id")
        recommends_true = query_2_reviews.query("(user_id == @percent_user_id) & (recommend == True)")
        items_amount = len(recommends)
        percent = (len(recommends_true) * 100) / items_amount
        return percent, items_amount
    except ZeroDivisionError:
        return 0,0
    

def api_parquet(userid, total_waste, reviews_items_percent,total_reviews,api_df):
    """
    Add a new row to the api_df with the given values.

    Parameters
    ----------
    userid : int
    """
    api_df.loc[len(api_df.index)] = [userid,total_waste,reviews_items_percent,total_reviews]
     
    return (api_df)

def get_items_api(api_user_items):
    """
    Get the items ids from a list of dictionaries.

    Parameters
    ----------
    api_user_items : list of dict
    The api_user_items to be processed. It contains dictionaries with an 'item_id' key and a value.

    Returns
    -------
    list of int
    """
    items = []
    for item in api_user_items:
            items.append(item['item_id'])
    return items

# We make a new dataframe with empty columns
api_df = pd.DataFrame(columns=['user_id','total_waste','reviews_percent','total_reviews'])

# Load small Parquets
query_2_steam, query_2_reviews = read_parquets()

x=0

# Iterate over the user
for user in query_2_items_parquet.iterrows():
    # Get the user data from the row as a Series
    this_user_data = user[1]

    # Get the user id and items from the Series
    this_user = this_user_data['user_id']
    this_user_items = this_user_data['items']

    # Extract the items ids from the items list 
    this_user_items =get_items_api(this_user_data['items'])

    # Calculate the total waste and percentage of reviews
    total_waste = get_waste(this_user_items)
    reviews_items_percent, total_reviews = percent_reviews(this_user)

    api_df = api_parquet(this_user,total_waste,reviews_items_percent,total_reviews,api_df)

api_df.to_parquet('.\\Data\\API\\\api_query2.parquet')


KeyboardInterrupt: 

#### Query 3

In [26]:
# Prepare the DataFrames that we will use for this query.

query3_steam_df = steam_games_df[['genres','id','year_of_release']]
query3_user_items_df = user_items_df[['user_id','items']]

# 

In [27]:
# We Explode genres list and group by this genre creating a list of games by genre
query3_exploded_genre = query3_steam_df.explode('genres')

genre_grouped = query3_exploded_genre.groupby(['genres'])

genre_grouped = pd.DataFrame({'genres': genre_grouped['genres'].first(),
                            'id_list': genre_grouped['id'].apply(list)})

genre_grouped = genre_grouped.reset_index(drop=True)
genre_grouped['id_list'] = genre_grouped['id_list'].apply(set)
genre_grouped.head()

genre_grouped.to_parquet('.\\Data\\genre_grouped.parquet')


In [28]:
# Tests Values
genre = 'Action'
genres_all = ['Accounting', 'Action', 'Adventure', 'Animation &amp; Modeling', 'Audio Production', 'Casual', 'Design &amp; Illustration', 'Early Access', 'Education', 'Free to Play', 'Indie', 'Massively Multiplayer', 'Photo Editing', 'RPG', 'Racing', 'Simulation', 'Software Training', 'Sports', 'Strategy', 'Utilities', 'Video Production', 'Web Publishing']

Global_Max_User_id = ''
Global_Max_Time_Played = 0
api_most_played_genre = pd.DataFrame(columns=['genre','user_id','total_played_time'])

def read_parquets ():
    """
    Read a parquet file and return a pandas dataframe.

    Returns
    -------
    pandas.DataFrame
        A dataframe containing the data from the parquet file.
    """
    genre_grouped_steam = pd.read_parquet('.\\Data\\genre_grouped.parquet')
    return genre_grouped_steam

def load_parquet_in_batches(file_path,Global_Max_Time_Played, Global_Max_User_id,specific_genre_steam):
    """
    Loads a Parquet file in batches.

    Args:
    file_path: The path to the Parquet file.
    chunk_size: The size of each batch.

    Returns:
    A list of Pandas DataFrames.
    """
    parquet_file = pq.ParquetFile(file_path)
    dataframes = []
    for batch in parquet_file.iter_batches(batch_size=5000):
        dataframes = batch.to_pandas()
        Global_Max_Time_Played, Global_Max_User_id= check_global_user(dataframes,Global_Max_Time_Played, Global_Max_User_id,specific_genre_steam)
    return Global_Max_Time_Played, Global_Max_User_id

def get_frame_genre(genre_grouped_steam,genre):  
    """
    Get a dataframe of games for a given genre.

    Parameters
    ----------
    genre_grouped_steam : pandas.DataFrame
        A dataframe containing the data from the parquet file.
    genre : str
        The name of the genre to filter by.

    Returns
    -------
    pandas.DataFrame or None
    """
    genre_grouped_steam = genre_grouped_steam.set_index("genres")  
    if genre in genre_grouped_steam.index:
        found_data_frame = genre_grouped_steam.loc[genre] 
        return found_data_frame
    else:
        return None
    

def genre_check(genre_grouped_steam,item_id) :   
    """
    Check if a given item id belongs to a genre.

    Parameters
    ----------
    genre_grouped_steam : pandas.DataFrame
        A dataframe containing the data from the parquet file.
    item_id : str
        The id of the item to check.

    Returns
    -------
    bool
        True if the item id is in the genre, False otherwise.
    """       
    return float(item_id) in genre_grouped_steam['id_list']


def check_global_user (batch_user_items,Genre_Max_Player_Time, Genre_Max_Player,specific_genre_steam):
    """
    Check the global user who played the most time in a specific genre.

    Parameters
    ----------
    batch_user_items : pandas.DataFrame
        A dataframe containing the user ids and the items they played.
    Genre_Max_Player_Time : float
        The current maximum time played by a user in the genre.
    Genre_Max_Player : int
        The current user id of the user who played the most time in the genre.
    specific_genre_steam : pandas.DataFrame
        A dataframe containing the data of the specific genre.

    Returns
    -------
    float, int
        The updated maximum time played by a user in the genre, and the updated user id of the user who played the most time in the genre.
    """
    for row in batch_user_items.iterrows() :
        data_row = row[1]
        user_time_played = 0
        user_id_row = data_row['user_id']
        items_row = tuple(data_row['items'])

        for item in items_row:
            if float(item['item_id']) in specific_genre_steam['id_list']:
                user_time_played += float(item['playtime_forever'])
                
        if user_time_played > Genre_Max_Player_Time:
            Genre_Max_Player_Time = user_time_played
            Genre_Max_Player = user_id_row
    return Genre_Max_Player_Time, Genre_Max_Player
        
genre_grouped_steam = read_parquets()

# Test Snippet
# We iterate over our list genres_all to find the most Played game in that list

for genre in genres_all:
    specific_genre_steam = get_frame_genre(genre_grouped_steam,genre)
    Genre_Max_Player = ''
    Genre_Max_Player_Time = 0

    if specific_genre_steam is not None: Genre_Max_Player_Time, Genre_Max_Player = load_parquet_in_batches('.\\Data\\query_2_items.parquet',Genre_Max_Player_Time, Genre_Max_Player,specific_genre_steam)
    api_most_played_genre.loc[len(api_most_played_genre.index)] = [genre,Genre_Max_Player,Genre_Max_Player_Time]

# Save our DataSet
api_most_played_genre.to_parquet('.\\Data\\API\\api_query3_most_played_genre.parquet')
api_most_played_genre.head()



Unnamed: 0,genre,user_id,total_played_time
0,Accounting,,0.0
1,Action,Sp3ctre,1699307.0
2,Adventure,REBAS_AS_F-T,2191551.0
3,Animation &amp; Modeling,ScottyG555,168314.0
4,Audio Production,Lickidactyl,109916.0


In [29]:
# Tests Values
genres_all2 = ['Accounting', 'Action', 'Adventure', 'Animation &amp; Modeling', 'Audio Production', 'Casual', 'Design &amp; Illustration', 'Early Access', 'Education', 'Free to Play', 'Indie', 'Massively Multiplayer', 'Photo Editing', 'RPG', 'Racing', 'Simulation', 'Software Training', 'Sports', 'Strategy', 'Utilities', 'Video Production', 'Web Publishing']

# Playes and Games Dataframes
players_df = query3_user_items_df
games_df = query3_steam_df

# New Column to store the played time
games_df.loc[:,'played_time'] = 0

# We set the id as the index
games_df = games_df.set_index("id") 
games_df[['played_time']] =  games_df[['played_time']].fillna(0)

def sum_played_time_games_df(games_df, item_id, played_time):
    """
    Add the played time to the existing played time of a game in a dataframe.

    Parameters
    ----------
    games_df : pandas.DataFrame
        A dataframe containing the games and their played time.
    item_id : str
        The id of the game to update.
    played_time : float
        The amount of time to add to the game's played time.

    Returns
    -------
    None
    """
    if float(item_id) in games_df.index:
        games_df.at[float(item_id), 'played_time'] += played_time


def iterate_players(players_df):
    """
    Iterate over the players and their items and update the games dataframe.

    Parameters
    ----------
    players_df : pandas.DataFrame
        A dataframe containing the players and their items.

    Returns
    -------
    None
    """

    for items in players_df['items']:
        for item in items:
            sum_played_time_games_df(games_df,item['item_id'],item['playtime_forever'])

# We Iterate over the players to get their played time 
iterate_players(players_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df.loc[:,'played_time'] = 0


In [30]:
# We store our Games_Df ordered
see = games_df.sort_values('played_time', ascending = True)
see

Unnamed: 0_level_0,genres,year_of_release,played_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
761140.0,"[Action, Casual, Indie, Simulation, Strategy]",2018.0,0
564700.0,"[Adventure, Casual, Indie]",2017.0,0
627370.0,"[Action, Strategy]",2017.0,0
382560.0,"[Action, Adventure, Indie]",,0
543940.0,"[Casual, Indie]",2017.0,0
...,...,...,...
240.0,[Action],2004.0,112612047
230410.0,"[Action, Free to Play]",2013.0,124027703
105600.0,"[Action, Adventure, Indie, RPG]",2011.0,154974541
4000.0,"[Indie, Simulation]",2006.0,448366616


In [36]:
# Dataframe to store played time by year
genre_grouped_year = genre_grouped.copy()
# genre_grouped_year['genres'] = genre_grouped_year['genres'].str.strip().str.lower().str.replace('&', '').replace(';', '')
genre_grouped_year.set_index('genres', drop=True,inplace=True)
# List of the years of genre
list_years = list(range(1970,2022))

def make_years_columns():
    """
    Create columns for each year in the genre grouped dataframe.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    for year in list_years:
        genre_grouped_year[year] = 0

make_years_columns()
for row in genre_grouped_year.iterrows():
    data = row[1]
    id_list = data['id_list']

    # Iteration for game
    for id in id_list:
        game_data = games_df.loc[id]
        release_year = game_data['year_of_release']
        
        if not isinstance(release_year, np.float64):
            continue
        if np.isnan(release_year):
            continue
        
        year = int(release_year)
        for genres_l in game_data['genres']:
    
            genre_grouped_year.loc[genres_l,year] += game_data['played_time']


In [37]:
genre_grouped_year.head()

Unnamed: 0_level_0,id_list,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accounting,"{555810.0, 732710.0, 623590.0, 620040.0, 41134...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Action,"{327680.0, 655360.0, 32770.0, 524290.0, 393220...",0,0,0,0,0,0,0,0,0,...,1625485806,1360264096,444099664,754543371,327885208,645061150,8917,0,0,0
Adventure,"{327680.0, 393220.0, 720900.0, 622620.0, 45879...",0,0,0,0,0,0,0,0,0,...,216798787,673715940,239031791,688135053,255883384,466155224,105,0,0,0
Animation &amp; Modeling,"{747520.0, 697860.0, 321540.0, 620040.0, 36302...",0,0,0,0,0,0,0,0,0,...,17914,1940813,267201,3973627,309052,0,0,0,0,0
Audio Production,"{620040.0, 497160.0, 281100.0, 281102.0, 28110...",0,0,0,0,0,0,0,0,0,...,0,280221,1733247,10102,459,0,0,0,0,0


### LIMITATION: We searched for the time by year that the Genre was played
###             It's possible that the solution was refering at the time per year for the player
###            

In [38]:
# We retrieve the dataframe were it's not 0 playtime in the year
genre_grouped_year = genre_grouped_year.loc[:, (genre_grouped_year != 0).any(axis=0)]
genre_grouped_year.head()


Unnamed: 0_level_0,id_list,1983,1984,1987,1988,1989,1990,1991,1992,1993,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accounting,"{555810.0, 732710.0, 623590.0, 620040.0, 41134...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Action,"{327680.0, 655360.0, 32770.0, 524290.0, 393220...",10746,1152,0,32486,1214,18856,5413,3730,44621,...,188500788,371246122,1111629716,1625485806,1360264096,444099664,754543371,327885208,645061150,8917
Adventure,"{327680.0, 393220.0, 720900.0, 622620.0, 45879...",10746,1152,7951,88478,15015,6645,4633,30391,88743,...,13938180,52780010,863731981,216798787,673715940,239031791,688135053,255883384,466155224,105
Animation &amp; Modeling,"{747520.0, 697860.0, 321540.0, 620040.0, 36302...",0,0,0,0,0,0,0,0,0,...,0,0,0,17914,1940813,267201,3973627,309052,0,0
Audio Production,"{620040.0, 497160.0, 281100.0, 281102.0, 28110...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,280221,1733247,10102,459,0,0


In [39]:
# Save our DF for Api use
genre_grouped_year.to_parquet('.\\Data\\API\\api_query3_years_genre.parquet')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


# QUERY 4

In [40]:
# We start working on copies of the DF's

query4_steam_df = steam_games_df.copy()
query4_user_reviews_df = user_reviews_df_copy

# We drop the columns we're not going to use | We could select instead of dropping
query4_steam_df = query4_steam_df.drop('genres', axis=1)
query4_steam_df = query4_steam_df.drop('publisher', axis=1)
query4_steam_df = query4_steam_df.drop('url', axis=1)
query4_steam_df = query4_steam_df.drop('tags', axis=1)
query4_steam_df = query4_steam_df.drop('specs', axis=1)
query4_steam_df = query4_steam_df.drop('price', axis=1)
query4_steam_df = query4_steam_df.drop('early_access', axis=1)
query4_steam_df = query4_steam_df.drop('release_date', axis=1)
query4_steam_df = query4_steam_df.drop('app_name', axis=1)

In [41]:
# Get te positive and recommended reviews
query4_reviews_filtered = query4_user_reviews_df[(query4_user_reviews_df["recommend"] == True) & (query4_user_reviews_df["sentiment"] > 0)]

years_list = list(range(2010,2022))
steam_with_years = query4_steam_df

# Set new columns for each year
for year in years_list:
    steam_with_years[year] = 0

# Set 'id' as the index
steam_with_years.set_index('id',inplace=True) 

def sum_user_point(query4_reviews_filtered,steam_with_years):
    """
    Count the number of positive reviews for each game in each year.

    Parameters
    ----------
    query4_reviews_filtered : pandas.DataFrame
    A dataframe of reviews filtered by positive reviews.
    steam_with_years : pandas.DataFrame
    A dataframe with all the years that will be used.

    Returns
    -------
    None
    """
    for row in query4_reviews_filtered.iterrows():

        user_data = row[1]
        item_id = int(user_data['item_id'])
        posted = int(user_data['Posted'])
        edited = user_data['Last_Edited']
        
        if edited == None:
            if item_id in steam_with_years.index:
                steam_with_years.loc[item_id,posted] += 1

# Call the function to sum points to each Year
sum_user_point (query4_reviews_filtered, steam_with_years)


In [42]:
query4_reviews_filtered.head()

Unnamed: 0,user_id,Last_Edited,item_id,recommend,sentiment,Posted
0,76561197970982479,,1250,True,0.8481,2011
1,76561197970982479,,22200,True,0.2263,2011
2,76561197970982479,,43110,True,0.9117,2011
3,js41637,,251610,True,0.9566,2014
4,js41637,,227300,True,0.9708,2013


In [43]:
# We group the frame by year and add the points
steam_with_years_pivot_table = steam_with_years.groupby(['developer']).agg('sum')
steam_with_years_pivot_table.to_parquet('.\\Data\\API\\api_query4_top_developes.parquet')

# Test to the function call 
steam_with_years_pivot_table = steam_with_years_pivot_table.sort_values(by=2015, ascending=False)
steam_with_years_pivot_table


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


Unnamed: 0_level_0,year_of_release,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
developer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Valve,68279.0,13,100,226,845,2112,1517,0,0,0,0,0,599
Facepunch Studios,4019.0,2,14,26,240,737,298,0,0,0,0,0,138
Smartly Dressed Games,4031.0,0,0,0,0,349,169,0,0,0,0,0,77
Re-Logic,4026.0,0,15,24,93,166,142,0,0,0,0,0,31
Bohemia Interactive,62402.0,0,2,7,73,302,113,0,0,0,0,0,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"German Airports Team,Aerosoft GmbH",2014.0,0,0,0,0,0,0,0,0,0,0,0,0
"German Airports Team,Caipirinha Games,Omar Masroor",2017.0,0,0,0,0,0,0,0,0,0,0,0,0
"German Airports Team,Omar Masroor,Caipirinha Games",2017.0,0,0,0,0,0,0,0,0,0,0,0,0
Gestalt Development Studio,2017.0,0,0,0,0,0,0,0,0,0,0,0,0


## Query 5

In [73]:
# We make a copy of the DF
query5_steam_df = steam_games_df.copy()
query5_user_reviews_df = user_reviews_df_copy

In [74]:
# This time we select the rows we'll be working on instead of Dropping them
query5_steam_df = query5_steam_df[['id','developer']]

# Selec the necessary rows
query5_user_reviews_df = query5_user_reviews_df[['item_id','sentiment']]
# query5_user_reviews_df.set_index('item_id', drop=True, inplace=True)

# Create two new columns to store the reviews points
dev_reviews_amount_df = query5_steam_df
dev_reviews_amount_df['Positive'] = 0
dev_reviews_amount_df['Negative'] = 0

# We make the 'id' as our index
dev_reviews_amount_df.set_index('id', inplace = True)
dev_reviews_amount_df

Unnamed: 0_level_0,developer,Positive,Negative
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
761140.0,Kotoshiro,0,0
643980.0,Secret Level SRL,0,0
670290.0,Poolians.com,0,0
767400.0,彼岸领域,0,0
773570.0,,0,0
...,...,...,...
773640.0,"Nikita ""Ghost_RUS""",0,0
733530.0,Sacada,0,0
610660.0,Laush Dmitriy Sergeevich,0,0
658870.0,"xropi,stev3ns",0,0


In [90]:
def sum_reviews (dev_reviews_amount_df, query5_user_reviews_df):
    """
    Sum the number of positive and negative reviews for each game in the dev reviews dataframe.

    Parameters
    ----------
    dev_reviews_amount_df : pandas.DataFrame
        A dataframe containing the games and their developers, and the number of positive and negative reviews.
    query5_user_reviews_df : pandas.DataFrame
        A dataframe containing the user reviews and their sentiment scores.

    Returns
    -------
    pandas.DataFrame
        The updated dev reviews dataframe with the summed positive and negative reviews for each developer.
    """
    xnan = 0
    xpos = 0
    xneg = 0
    for row in query5_user_reviews_df.iterrows():
        print(xnan,xpos,xneg)
        item_data = row[1]
        sentiment_data = item_data['sentiment']
        item_id = item_data['item_id']

        if np.isnan(sentiment_data):
            xnan += 1
            continue
        
        if (int(item_id) not in dev_reviews_amount_df.index):
            xnan += 1
            continue

        if (sentiment_data >= 0):
            xpos += 1
            dev_reviews_amount_df.loc[int(item_id),'Positive'] += 1
        else:
            xneg +=1
            dev_reviews_amount_df.loc[int(item_id),'Negative'] += 1
    return dev_reviews_amount_df

# Calls the function to sum reviews
dev_reviews_amount_df = sum_reviews(dev_reviews_amount_df,query5_user_reviews_df)

0 0 0
0 1 0
0 2 0
1 2 0
2 2 0
2 3 0
2 4 0
2 5 0
2 6 0
2 7 0
2 8 0
2 9 0
2 10 0
2 11 0
2 12 0
2 13 0
2 14 0
2 14 1
2 15 1
2 16 1
2 17 1
2 18 1
2 19 1
2 20 1
2 21 1
2 22 1
2 23 1
2 24 1
2 25 1
2 26 1
3 26 1
3 27 1
3 28 1
3 29 1
3 30 1
3 31 1
3 32 1
3 33 1
3 34 1
3 35 1
3 36 1
3 37 1
3 38 1
3 39 1
3 39 2
3 40 2
3 41 2
3 42 2
3 43 2
4 43 2
4 44 2
4 45 2
4 46 2
4 47 2
4 48 2
4 49 2
4 49 3
4 49 4
4 50 4
4 51 4
4 51 5
4 52 5
4 53 5
4 53 6
4 54 6
4 55 6
4 56 6
4 57 6
4 58 6
4 59 6
4 60 6
4 61 6
4 62 6
4 63 6
4 64 6
4 65 6
4 66 6
5 66 6
6 66 6
6 67 6
6 68 6
6 69 6
6 70 6
6 71 6
6 72 6
7 72 6
7 73 6
7 74 6
7 74 7
7 75 7
8 75 7
8 76 7
9 76 7
9 77 7
9 78 7
9 79 7
9 80 7
9 81 7
9 82 7
9 83 7
9 84 7
9 85 7
9 86 7
9 87 7
9 88 7
9 89 7
9 90 7
9 91 7
9 92 7
10 92 7
10 93 7
10 94 7
11 94 7
11 95 7
11 96 7
11 97 7
11 98 7
11 99 7
11 100 7
11 101 7
11 102 7
11 103 7
11 103 8
11 104 8
11 105 8
11 106 8
11 107 8
11 108 8
11 109 8
11 109 9
11 110 9
11 111 9
11 112 9
11 112 10
11 113 10
11 114 10
11 115 10
11

In [91]:
# Small test
dev_reviews_amount_df.loc[1250]

developer    Tripwire Interactive
Positive                      549
Negative                      128
Name: 1250.0, dtype: object

In [92]:
dev_reviews_amount_df.head()

Unnamed: 0_level_0,developer,Positive,Negative
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
761140.0,Kotoshiro,0,0
643980.0,Secret Level SRL,0,0
670290.0,Poolians.com,0,0
767400.0,彼岸领域,0,0
773570.0,,0,0


In [93]:
# Test to sum rows
sum_pos = dev_reviews_amount_df['Positive'].sum()
sum_neg = dev_reviews_amount_df['Negative'].sum()

print (sum_pos, sum_neg)

90428 17568


In [94]:
# Group by developers and save to our folder
dev_reviews_amount_df_final = dev_reviews_amount_df.groupby(['developer']).agg('sum')
dev_reviews_amount_df_final.to_parquet('.\\Data\\API\\api_query5_dev_sent.parquet')
dev_reviews_amount_df_final

Unnamed: 0_level_0,Positive,Negative
developer,Unnamed: 1_level_1,Unnamed: 2_level_1
+7 Software,0,0
"+Mpact Games, LLC.",0,0
.M.Y.W.,0,0
.ez Games,0,0
07th Expansion,4,2
...,...,...
致意,0,0
萌石游戏,0,0
高考恋爱委员会,0,0
"高考恋爱委员会,Days",0,0


# ML1 MODEL

In [95]:
# Make copies of the frame we'll be using
ml1_steam_games_df = steam_games_df.copy()
ml1_user_items = user_items_df.copy()
ml1_user_reviews = user_reviews_df_copy.copy()

In [96]:
ml1_user_items.head()

Unnamed: 0,user_id,items_count,steam_id,items
0,76561197970982479,277,76561197970982480,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864384,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712560,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445856,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099488,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [97]:
# Make copies and select Rows we will use
ml1_games_featuers = ml1_steam_games_df.copy()
ml1_games_featuers = ml1_games_featuers[['id','publisher','specs','price','genres','year_of_release','developer','early_access']]
ml1_games_featuers.loc[:,'user_count'] = 0
ml1_games_featuers.loc[:,'user_sentiment'] = 0
ml1_games_featuers.set_index('id',inplace=True)
ml1_games_featuers.head()

Unnamed: 0_level_0,publisher,specs,price,genres,year_of_release,developer,early_access,user_count,user_sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
761140.0,Kotoshiro,[Single-player],4.99,"[Action, Casual, Indie, Simulation, Strategy]",2018.0,Kotoshiro,0.0,0,0
643980.0,"Making Fun, Inc.","[Single-player, Multi-player, Online Multi-Pla...",0.0,"[Free to Play, Indie, RPG, Strategy]",2018.0,Secret Level SRL,0.0,0,0
670290.0,Poolians.com,"[Single-player, Multi-player, Online Multi-Pla...",0.0,"[Casual, Free to Play, Indie, Simulation, Sports]",2017.0,Poolians.com,0.0,0,0
767400.0,彼岸领域,[Single-player],0.99,"[Action, Adventure, Casual]",2017.0,彼岸领域,0.0,0,0
773570.0,,"[Single-player, Full controller support, HTC V...",2.99,,,,0.0,0,0


In [98]:
# Select Rows and explode the items in this casae items = games
ml1_user_items.drop(['user_id','items_count','steam_id'],axis=1,inplace=True)
ml1_user_items_exploded = ml1_user_items.explode('items')
ml1_user_items_exploded.head()

Unnamed: 0,items
0,"{'item_id': '10', 'item_name': 'Counter-Strike..."
0,"{'item_id': '20', 'item_name': 'Team Fortress ..."
0,"{'item_id': '30', 'item_name': 'Day of Defeat'..."
0,"{'item_id': '40', 'item_name': 'Deathmatch Cla..."
0,"{'item_id': '50', 'item_name': 'Half-Life: Opp..."


In [99]:
x = 0
# Cicle to find features of each item
for row in ml1_user_items_exploded.itertuples():
    items_datax = row[1]
    
    if pd.isna(items_datax):
        continue
    item_id = int(items_datax['item_id'])
    if item_id in ml1_games_featuers.index:
        ml1_games_featuers.loc[item_id,'user_count'] += 1

In [100]:
ml1_games_featuers.head()

Unnamed: 0_level_0,publisher,specs,price,genres,year_of_release,developer,early_access,user_count,user_sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
761140.0,Kotoshiro,[Single-player],4.99,"[Action, Casual, Indie, Simulation, Strategy]",2018.0,Kotoshiro,0.0,0,0
643980.0,"Making Fun, Inc.","[Single-player, Multi-player, Online Multi-Pla...",0.0,"[Free to Play, Indie, RPG, Strategy]",2018.0,Secret Level SRL,0.0,0,0
670290.0,Poolians.com,"[Single-player, Multi-player, Online Multi-Pla...",0.0,"[Casual, Free to Play, Indie, Simulation, Sports]",2017.0,Poolians.com,0.0,0,0
767400.0,彼岸领域,[Single-player],0.99,"[Action, Adventure, Casual]",2017.0,彼岸领域,0.0,0,0
773570.0,,"[Single-player, Full controller support, HTC V...",2.99,,,,0.0,0,0


In [101]:
# Copy of features
ml1_games_featuers_sent = ml1_games_featuers.copy()

In [102]:
# Cicle reviews to sum the sentiment to each item
for row in ml1_user_reviews.itertuples():
    # print(f'ROW: {row}')
    if (pd.isna(row)):
        continue
    if (pd.isna(row[5])):
        continue
    if (pd.isna(row[3])):
        continue

    sentiment_data = float(row[5])
    game_id = int(row[3])

    if game_id in ml1_games_featuers_sent.index:
        ml1_games_featuers_sent.loc[game_id,'user_sentiment'] += sentiment_data

  ml1_games_featuers_sent.loc[game_id,'user_sentiment'] += sentiment_data


In [None]:
ml1_games_featuers_sent.head()

In [103]:
# Save to our ML Folder
ml1_games_featuers_sent.to_parquet('.\\Data\\\ML\\ml1_games.parquet')

# ML 2 MODEL

In [104]:
# Copy of the Frames we will be using
ml2_steam_games_df = steam_games_df.copy()
ml2_user_items = user_items_df.copy()
ml2_user_reviews = user_reviews_df_copy.copy()

In [105]:
ml2_steam_games_df.head()

Unnamed: 0,publisher,genres,app_name,url,release_date,tags,specs,price,early_access,id,developer,year_of_release
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",[Single-player],4.99,0.0,761140.0,Kotoshiro,2018.0
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...","[Single-player, Multi-player, Online Multi-Pla...",0.0,0.0,643980.0,Secret Level SRL,2018.0
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...","[Single-player, Multi-player, Online Multi-Pla...",0.0,0.0,670290.0,Poolians.com,2017.0
88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",[Single-player],0.99,0.0,767400.0,彼岸领域,2017.0
88314,,,Log Challenge,http://store.steampowered.com/app/773570/Log_C...,NaT,"[Action, Indie, Casual, Sports]","[Single-player, Full controller support, HTC V...",2.99,0.0,773570.0,,


In [None]:
ml2_user_items.head()

#### Data we could use 
user_id | most_played_genre | most_played_spec |  most_played_tag | price_game | year_of_release |  most_reviewd_genre | most_reviews_tag | most_reviews_spec

In [106]:
# Copy of the items
ml_user_base = ml2_user_items.copy()
ml_user_base = ml_user_base[['user_id']]
ml_user_base.head()

Unnamed: 0,user_id
0,76561197970982479
1,js41637
2,evcentric
3,Riot-Punch
4,doctr


### Most_Played_Genre

In [107]:
# DF with items
ml2_user_items_exploded = ml2_user_items.explode('items')
ml2_user_items_exploded.drop(['steam_id','items_count'],axis=1,inplace=True)
ml2_user_items_exploded.reset_index(drop=True, inplace=True)
ml2_user_items_exploded.head()

Unnamed: 0,user_id,items
0,76561197970982479,"{'item_id': '10', 'item_name': 'Counter-Strike..."
1,76561197970982479,"{'item_id': '20', 'item_name': 'Team Fortress ..."
2,76561197970982479,"{'item_id': '30', 'item_name': 'Day of Defeat'..."
3,76561197970982479,"{'item_id': '40', 'item_name': 'Deathmatch Cla..."
4,76561197970982479,"{'item_id': '50', 'item_name': 'Half-Life: Opp..."


In [108]:
def get_id_alone(item):
    """
    Returns the item_id of the given item, or 0 if the item is missing or null.

    Parameters
    ----------
    item : dict or None
    A dictionary representing an item, with a key 'item_id' that holds the item's ID.

    Returns
    -------
    int
    The item_id of the item, or 0 if the item is missing or null.
    """
    if pd.isna(item):
        return 0
    id_alone = item['item_id']
    return id_alone

def get_played_time(item):

    if pd.isna(item):
        return np.nan
    played_time = item['playtime_forever']
    return played_time

ml2_user_items_exploded['item_id'] = ml2_user_items_exploded['items'].apply(get_id_alone)
ml2_user_items_exploded['played_time'] = ml2_user_items_exploded['items'].apply(get_played_time)
ml2_user_items_exploded.head()

Unnamed: 0,user_id,items,item_id,played_time
0,76561197970982479,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,6.0
1,76561197970982479,"{'item_id': '20', 'item_name': 'Team Fortress ...",20,0.0
2,76561197970982479,"{'item_id': '30', 'item_name': 'Day of Defeat'...",30,7.0
3,76561197970982479,"{'item_id': '40', 'item_name': 'Deathmatch Cla...",40,0.0
4,76561197970982479,"{'item_id': '50', 'item_name': 'Half-Life: Opp...",50,0.0


In [109]:
# Selection and explostion of genres
ml2_steam_genre_id_list = ml2_steam_games_df[['id','genres']]
ml2_steam_genre_id_list= ml2_steam_genre_id_list.explode('genres')
ml2_steam_genre_id_list.drop_duplicates(inplace=True)
ml2_steam_genre_id_list.reset_index(drop=True,inplace=True)
ml2_steam_genre_id_list.head()
# print

Unnamed: 0,id,genres
0,761140.0,Action
1,761140.0,Casual
2,761140.0,Indie
3,761140.0,Simulation
4,761140.0,Strategy


In [110]:
# We make all the id's numbers
ml2_steam_genre_id_list['id'].dtype
ml2_user_items_exploded['item_id'].dtype
ml2_user_items_exploded['item_id'] = pd.to_numeric(ml2_user_items_exploded['item_id'], errors='coerce')
ml2_user_items_exploded['item_id'].astype(float)
ml2_user_items_exploded['item_id'].dtype


dtype('int64')

In [111]:
# Merge Genres And Items
merged_items_exploded_genre_id = pd.merge(ml2_user_items_exploded,ml2_steam_genre_id_list, left_on="item_id", right_on="id" )
merged_items_exploded_genre_id.head()

Unnamed: 0,user_id,items,item_id,played_time,id,genres
0,76561197970982479,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,6.0,10.0,Action
1,js41637,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,0.0,10.0,Action
2,Riot-Punch,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,0.0,10.0,Action
3,doctr,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,93.0,10.0,Action
4,corrupted_soul,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,108.0,10.0,Action


In [112]:
# Select Columns for most played genre
most_played_genre = merged_items_exploded_genre_id[['user_id','played_time','id','genres']]
most_played_genre.head()

Unnamed: 0,user_id,played_time,id,genres
0,76561197970982479,6.0,10.0,Action
1,js41637,0.0,10.0,Action
2,Riot-Punch,0.0,10.0,Action
3,doctr,93.0,10.0,Action
4,corrupted_soul,108.0,10.0,Action


In [113]:
# Group by user and genre of the frame, and cleaning to get the top genre
most_played_genre_grouped = most_played_genre.groupby(["user_id", "genres"])["played_time"].sum().reset_index()
max_played_time = most_played_genre_grouped.sort_values(by=['user_id', 'played_time'], ascending=[True, False])
max_played_time = max_played_time.drop_duplicates(subset=['user_id'], keep='first')
max_played_time = max_played_time.reset_index(drop=True)
max_played_time
top_genre = max_played_time[['user_id','genres']]
top_genre.head()

Unnamed: 0,user_id,genres
0,--000--,Action
1,--ace--,Action
2,--ionex--,Action
3,-2SV-vuLB-Kg,Action
4,-404PageNotFound-,Action


In [114]:
# Save the top genre in our base Frame
ml_user_base = pd.merge(ml_user_base,top_genre,on='user_id', how = 'left')
ml_user_base.head()

Unnamed: 0,user_id,genres
0,76561197970982479,Action
1,js41637,Action
2,evcentric,Indie
3,Riot-Punch,Action
4,doctr,Action


# most_played_spec

In [115]:
# Selection and explostion of specs

ml2_steam_specs_list = ml2_steam_games_df[['id','specs']]
ml2_steam_specs_list= ml2_steam_specs_list.explode('specs')
ml2_steam_specs_list.drop_duplicates(inplace=True)
ml2_steam_specs_list.reset_index(drop=True,inplace=True)
ml2_steam_specs_list.head()

Unnamed: 0,id,specs
0,761140.0,Single-player
1,643980.0,Single-player
2,643980.0,Multi-player
3,643980.0,Online Multi-Player
4,643980.0,Cross-Platform Multiplayer


In [116]:
# Merge items exploded with specs
merged_items_exploded_specs= pd.merge(ml2_user_items_exploded,ml2_steam_specs_list, left_on="item_id", right_on="id" )
merged_items_exploded_specs.head()

Unnamed: 0,user_id,items,item_id,played_time,id,specs
0,76561197970982479,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,6.0,10.0,Multi-player
1,76561197970982479,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,6.0,10.0,Valve Anti-Cheat enabled
2,js41637,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,0.0,10.0,Multi-player
3,js41637,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,0.0,10.0,Valve Anti-Cheat enabled
4,Riot-Punch,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,0.0,10.0,Multi-player


In [117]:
# Get columns we will be using
most_specs = merged_items_exploded_specs[['user_id','specs']]
most_specs.head()

Unnamed: 0,user_id,specs
0,76561197970982479,Multi-player
1,76561197970982479,Valve Anti-Cheat enabled
2,js41637,Multi-player
3,js41637,Valve Anti-Cheat enabled
4,Riot-Punch,Multi-player


In [118]:
# Group By user and specs
most_specs_grouped = most_specs.groupby(['user_id', 'specs']).size()
most_specs_grouped = most_specs_grouped.reset_index()
most_specs_grouped.columns = ['user_id', 'spec', 'count_spec']
most_specs_grouped.head()

Unnamed: 0,user_id,spec,count_spec
0,--000--,Captions available,3
1,--000--,Co-op,23
2,--000--,Commentary available,1
3,--000--,Cross-Platform Multiplayer,8
4,--000--,Full controller support,12


In [119]:
# Sort, Drop, And reset preparing our dataframe with the most important specs
most_important_spec = most_specs_grouped.sort_values(by=['user_id', 'count_spec'], ascending=[True, False])
most_important_spec = most_important_spec.drop_duplicates(subset=['user_id'], keep='first')
most_important_spec = most_important_spec.reset_index(drop=True)
top_spec = most_important_spec[['user_id','spec']]
top_spec.head()

Unnamed: 0,user_id,spec
0,--000--,Multi-player
1,--ace--,Single-player
2,--ionex--,Steam Achievements
3,-2SV-vuLB-Kg,Single-player
4,-404PageNotFound-,Multi-player


In [120]:
# Save our most important specs in our base frame
ml_user_base = pd.merge(ml_user_base,top_spec,on='user_id', how = 'left')
ml_user_base.head()

Unnamed: 0,user_id,genres,spec
0,76561197970982479,Action,Single-player
1,js41637,Action,Single-player
2,evcentric,Indie,Single-player
3,Riot-Punch,Action,Single-player
4,doctr,Action,Single-player


# Average_Price

In [121]:
# Select columns
ml2_steam_prices = ml2_steam_games_df[['id','price']]
ml2_steam_prices.head()

Unnamed: 0,id,price
88310,761140.0,4.99
88311,643980.0,0.0
88312,670290.0,0.0
88313,767400.0,0.99
88314,773570.0,2.99


In [122]:
# Merge prices with items
merged_items_exploded_prices= pd.merge(ml2_user_items_exploded,ml2_steam_prices, left_on="item_id", right_on="id" )
merged_items_exploded_prices.head()

Unnamed: 0,user_id,items,item_id,played_time,id,price
0,76561197970982479,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,6.0,10.0,9.99
1,js41637,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,0.0,10.0,9.99
2,Riot-Punch,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,0.0,10.0,9.99
3,doctr,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,93.0,10.0,9.99
4,corrupted_soul,"{'item_id': '10', 'item_name': 'Counter-Strike...",10,108.0,10.0,9.99


In [123]:
# Select colums from the merge
average_price = merged_items_exploded_prices[['user_id','price']]
average_price.head()

Unnamed: 0,user_id,price
0,76561197970982479,9.99
1,js41637,9.99
2,Riot-Punch,9.99
3,doctr,9.99
4,corrupted_soul,9.99


In [124]:
# We get the mean of prices
average_price_grouped = average_price.groupby('user_id')['price'].mean()
average_price_grouped = average_price_grouped.reset_index()
average_price_grouped.head()

Unnamed: 0,user_id,price
0,--000--,8.391042
1,--ace--,4.652632
2,--ionex--,6.047895
3,-2SV-vuLB-Kg,8.348679
4,-404PageNotFound-,13.654286


In [125]:
# Add the mean of prices to our base
ml_user_base = pd.merge(ml_user_base,average_price_grouped,on='user_id', how = 'left')
ml_user_base.head()

Unnamed: 0,user_id,genres,spec,price
0,76561197970982479,Action,Single-player,14.802974
1,js41637,Action,Single-player,10.968072
2,evcentric,Indie,Single-player,12.55811
3,Riot-Punch,Action,Single-player,12.462255
4,doctr,Action,Single-player,14.750587


 # most_reviewd_genre

In [126]:
# Selection of columns from reviews
ml2_user_review_item = ml2_user_reviews[['user_id','item_id','sentiment']]
ml2_user_review_item['item_id']= pd.to_numeric(ml2_user_items_exploded['item_id'], errors='coerce')
ml2_user_review_item['item_id'].astype(float)
ml2_user_review_item['item_id'].dtype
ml2_user_review_item.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ml2_user_review_item['item_id']= pd.to_numeric(ml2_user_items_exploded['item_id'], errors='coerce')


Unnamed: 0,user_id,item_id,sentiment
0,76561197970982479,10,0.8481
1,76561197970982479,20,0.2263
2,76561197970982479,30,0.9117
3,js41637,40,0.9566
4,js41637,50,0.9708


In [127]:
ml2_steam_genre_id_list.head()

Unnamed: 0,id,genres
0,761140.0,Action
1,761140.0,Casual
2,761140.0,Indie
3,761140.0,Simulation
4,761140.0,Strategy


In [128]:
# Merge user reviews with genres to get the top reviewd genre
merged_genre_sentiment= pd.merge(ml2_user_review_item,ml2_steam_genre_id_list, left_on="item_id", right_on="id" )
merged_genre_sentiment.head()

Unnamed: 0,user_id,item_id,sentiment,id,genres
0,76561197970982479,10,0.8481,10.0,Action
1,DemonicWolvz,10,0.204,10.0,Action
2,ThePyrite99,10,0.7351,10.0,Action
3,getyourmarsbar,10,0.0,10.0,Action
4,76561198031390758,10,0.2942,10.0,Action


In [129]:
# Select columns
user_genre_sentiment = merged_genre_sentiment[['user_id','sentiment','genres']]
user_genre_sentiment.head()

Unnamed: 0,user_id,sentiment,genres
0,76561197970982479,0.8481,Action
1,DemonicWolvz,0.204,Action
2,ThePyrite99,0.7351,Action
3,getyourmarsbar,0.0,Action
4,76561198031390758,0.2942,Action


In [130]:
# Prepare our group with the most liked genre
user_genre_sentiment_grouped = user_genre_sentiment.groupby(['user_id', 'genres']).sum()
user_genre_sentiment_grouped = user_genre_sentiment_grouped.reset_index()
user_genre_sentiment_grouped.columns = ['user_id', 'genres', 'sentiment_sum']
user_genre_sentiment_grouped.head()

Unnamed: 0,user_id,genres,sentiment_sum
0,--000--,Adventure,0.0
1,--000--,Free to Play,0.0
2,--000--,Indie,0.0
3,--000--,RPG,0.0
4,--ace--,Action,1.3633


In [131]:
# Order our column
most_sentiment_genre = user_genre_sentiment_grouped.sort_values(by=['user_id', 'sentiment_sum'], ascending=[True, False])
most_sentiment_genre = most_sentiment_genre.drop_duplicates(subset=['user_id'], keep='first')
most_sentiment_genre = most_sentiment_genre.reset_index(drop=True)
top_sentiment = most_sentiment_genre[['user_id','genres']]
top_sentiment.head()

Unnamed: 0,user_id,genres
0,--000--,Adventure
1,--ace--,Action
2,--ionex--,Action
3,-2SV-vuLB-Kg,Adventure
4,-Azsael-,Casual


In [132]:
" Save our column to our base"
ml_user_base = pd.merge(ml_user_base,top_sentiment,on='user_id', how = 'left')
ml_user_base.head()


Unnamed: 0,user_id,genres_x,spec,price,genres_y
0,76561197970982479,Action,Single-player,14.802974,Action
1,js41637,Action,Single-player,10.968072,Action
2,evcentric,Indie,Single-player,12.55811,Action
3,Riot-Punch,Action,Single-player,12.462255,
4,doctr,Action,Single-player,14.750587,RPG


In [133]:
# There are 2 genres we give them meaningful names
ml_user_base = ml_user_base.rename(columns = {'genres_y':'genre_liked'})
ml_user_base = ml_user_base.rename(columns = {'genres_x':'genre_played'})
ml_user_base.head()

Unnamed: 0,user_id,genre_played,spec,price,genre_liked
0,76561197970982479,Action,Single-player,14.802974,Action
1,js41637,Action,Single-player,10.968072,Action
2,evcentric,Indie,Single-player,12.55811,Action
3,Riot-Punch,Action,Single-player,12.462255,
4,doctr,Action,Single-player,14.750587,RPG


In [None]:
# Show our final Dataframe
ml_user_base.head()

In [134]:
ml_user_base.to_parquet('.\\Data\\\ML\\ml2_users_items.parquet')