In [4]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
# Read the Parquet file into a PyArrow Table
df_reviews= pd.read_parquet(r"C:\My Project\Integ\MLInt\Datasets\CleanDatasets\df_reviews_l.parquet")
df_genre_ranking = pd.read_parquet(r"C:\My Project\Integ\MLInt\Datasets\CleanDatasets\df_genre_ranking.parquet")
df_playtime = pd.read_parquet(r"C:\My Project\Integ\MLInt\Datasets\CleanDatasets\df_playtime.parquet")
df_funct_dev = pd.read_parquet(r"C:\My Project\Integ\MLInt\Datasets\CleanDatasets\df_funct_dev.parquet")
df_expenses_items = pd.read_parquet(r"C:\My Project\Integ\MLInt\Datasets\CleanDatasets\df_expenses_items.parquet")

# 01.2 Developer Function

The function takes the game developer as a parameter, and returns the number of items developed per year and the amount of free content per year.

In [6]:
def developer(developer_name: str):
    # Filter games by the specified developer
    filtered_developer = df_funct_dev[df_funct_dev['developer'] == developer_name]

    # Count the number of games released per year
    game_count_by_year = filtered_developer.groupby('release_year')['item_id'].count()

    # Calculate the percentage of free games released per year
    free_games_percentage = (filtered_developer[filtered_developer['price'] == 0.0]
                             .groupby('release_year')['item_id']
                             .count() / game_count_by_year * 100).fillna(0).astype(int)

    # Create a DataFrame from the results dictionary
    results_df = pd.DataFrame({
        'Year': game_count_by_year.index,
        'Number of games': game_count_by_year.values,
        '% Free games': free_games_percentage.values
    })

    return results_df


Examples

In [7]:
# Call the function 
result = developer("Kotoshiro")
result2 = developer ("Laush Dmitriy Sergeevich")
result3 = developer("Poolians.com")

# Print the result
print(result)
print(result2)
print(result3)

   Year  Number of games  % Free games
0  2018                1             0
   Year  Number of games  % Free games
0  2017                9             0
1  2018                1             0
   Year  Number of games  % Free games
0  2017                1           100


# 02.2 User_Data Function


This function provides a summary of a user’s activity on Steam, including how much money they’ve spent, what percentage of games they’ve recommended, and how many games they own. Please replace ‘user_id’, ‘df_reviews’, ‘item_id’, ‘df_games’, ‘price’, ‘recommend’, ‘money_spent’, ‘recommend_percentage’, and ‘num_items’ with your actual column and key names. 

In [6]:
def userdata(user_id):
    
    # Filter by the user of interest
    user = df_reviews[df_reviews['user_id'] == user_id]
    # Calculate the amount of money spent for the user of interest
    amount_money = df_expenses_items[df_expenses_items['user_id']== user_id]['price'].sum()
   
    # Search for the count_item for the user of interest    
    count_items = df_expenses_items[df_expenses_items['user_id']== user_id]['items_count'].iloc[0]
    
    # Calculate the total recommendations made by the user of interest
    total_recommendations = user['recommend'].sum()
    # Calculate the total reviews made by all users
    total_reviews = len(df_reviews['user_id'].unique())
    # Calculate the percentage of recommendations made by the user of interest
    percentage_recommendations = (total_recommendations / total_reviews) * 100
    
    return {
        'amount_money': amount_money,
        'percentage_recommendation': round(percentage_recommendations, 2),
        'total_items': count_items.astype(int)
    }

EJEMPLOS

In [7]:
user_id = 'EchoXSilence'
userdata(user_id)

{'amount_money': 189.84, 'percentage_recommendation': 0.0, 'total_items': 23}

In [8]:
user_id1 = "js41637"
userdata(user_id1)

{'amount_money': 8489.14,
 'percentage_recommendation': 0.01,
 'total_items': 888}

In [9]:
# Replace "your user id here" with the ID of the user you want to query
user_id2 = "76561197970982479"

# Call the userdata function with the specific user ID
userdata(user_id2)


{'amount_money': 3419.32,
 'percentage_recommendation': 0.01,
 'total_items': 277}

# 03.2 UserForGenre Function

This function takes the genre of a video game as a parameter and returns the top 5 users with the most gameplay hours in the specified genre. It provides the user ID and their profile URL.

In [10]:
def userForGenre(genre):
    
    # Filter the dataframe by the genre of interest
    data_by_genre = df_playtime[df_playtime['genres'] == genre]
    # Group the filtered dataframe by user and sum the number of hours
    top_users = data_by_genre.groupby(['user_url', 'user_id'])['playtime_hours'].sum().nlargest(5).reset_index()
    
    # An empty dictionary is made to store the needed data
    top_users_dict = {}
    for index, row in top_users.iterrows():
        # User info goes through each row of the top 5 and saves it in the dictionary
        user_info = {
            'user_id': row['user_id'],
            'user_url': row['user_url']
        }
        top_users_dict[index + 1] = user_info
    
    return top_users_dict

Examples

In [11]:
genre = 'Action'
userForGenre(genre)

{1: {'user_id': 'Sp3ctre', 'user_url': 'http://steamcommunity.com/id/Sp3ctre'},
 2: {'user_id': 'shinomegami',
  'user_url': 'http://steamcommunity.com/id/shinomegami'},
 3: {'user_id': 'REBAS_AS_F-T',
  'user_url': 'http://steamcommunity.com/id/REBAS_AS_F-T'},
 4: {'user_id': 'Terminally-Chill',
  'user_url': 'http://steamcommunity.com/id/Terminally-Chill'},
 5: {'user_id': 'DownSyndromeKid',
  'user_url': 'http://steamcommunity.com/id/DownSyndromeKid'}}

In [12]:
genre1 = 'Indie'
userForGenre(genre1)

{1: {'user_id': 'REBAS_AS_F-T',
  'user_url': 'http://steamcommunity.com/id/REBAS_AS_F-T'},
 2: {'user_id': 'jimmynoe',
  'user_url': 'http://steamcommunity.com/id/jimmynoe'},
 3: {'user_id': 'shinomegami',
  'user_url': 'http://steamcommunity.com/id/shinomegami'},
 4: {'user_id': 'Steamified',
  'user_url': 'http://steamcommunity.com/id/Steamified'},
 5: {'user_id': 'idonothack',
  'user_url': 'http://steamcommunity.com/id/idonothack'}}

# 04.2 BestDevYear Function

In [13]:
def best_developer_year(year: int):
    # Replace non-numeric 'release_year' values with NaN
    df_reviews['release_year'] = pd.to_numeric(df_reviews['release_year'], errors='coerce')
    
    # Filter DataFrame by the given year and positive recommendations
    df_filtered = df_reviews[(df_reviews['release_year'] == year) & (df_reviews['recommend'] == True) & (df_reviews['sentiment_analysis'] == 2)]
    
    # Group by developer and count the recommendations
    df_grouped = df_filtered.groupby('developer').size()
    
    # Get the top 3 developers
    top_developers = df_grouped.nlargest(3).index.tolist()
    
    return {"Top 3 Developers": top_developers}

Examples

In [14]:
year=2009
best_developer_year(year)

{'Top 3 Developers': ['Valve',
  'Tripwire Interactive',
  'Infinity Ward,Aspyr (Mac)']}

In [15]:
year1=2012
best_developer_year(year1)

{'Top 3 Developers': ['Valve',
  'Gearbox Software,Aspyr (Mac &amp; Linux)',
  'Daybreak Game Company']}

# 05.2 DevReviewsAnalysis Function

This function takes the release year of a game as a parameter and, based on that year, returns a list with the number of user review records that are categorized with a sentiment analysis, such as Negative, Neutral, and Positive. 😊

In [41]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49714 entries, 0 to 49713
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             49714 non-null  object 
 1   user_url            49714 non-null  object 
 2   sentiment_analysis  49714 non-null  int64  
 3   posted              49714 non-null  object 
 4   item_id             49714 non-null  int64  
 5   recommend           49714 non-null  bool   
 6   release_year        49430 non-null  float64
 7   developer           49714 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(4)
memory usage: 2.7+ MB


In [21]:
def dev_reviews_analysis(developer):
    # Filter the reviews for the specific developer
    reviews2 = df_reviews[df_reviews['developer'] == str(developer)]  # Convert developer name to a string
    
    # Initialize a dictionary to count the sentiment categories
    sentiment_counts = {'Negative': 0, 'Positive': 0}
    
    # Iterate through the reviews of the specified developer
    for index, row in reviews2.iterrows():
        sentiment = row['sentiment_analysis']
        sentiment_category = ''
        
        # Assign the corresponding sentiment category
        if sentiment == 0:
            sentiment_category = 'Negative'
        elif sentiment == 2:
            sentiment_category = 'Positive'
        else:
            # Skip reviews with missing or neutral sentiment
            continue
        
        # Increment the corresponding counter in the dictionary
        sentiment_counts[sentiment_category] += 1
    
    
    return {"developer": developer, "sentiment_counts": sentiment_counts}

EXAMPLE

In [22]:
developer = "Trion Worlds, Inc."
dev_reviews_analysis(developer)

{'developer': 'Trion Worlds, Inc.',
 'sentiment_counts': {'Negative': 9, 'Positive': 15}}