In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Read the Parquet file into a PyArrow Table

df_reviews= pd.read_parquet(r"C:\Users\m_evi\OneDrive\Desktop\PI01\CleanDatasets\df_reviews_l.parquet")
df_genre_ranking = pd.read_parquet(r"C:\Users\m_evi\OneDrive\Desktop\PI01\CleanDatasets\df_genre_ranking.parquet")
df_playtime = pd.read_parquet(r"C:\Users\m_evi\OneDrive\Desktop\PI01\CleanDatasets\df_playtime.parquet")
df_funct_dev = pd.read_parquet(r"C:\Users\m_evi\OneDrive\Desktop\PI01\CleanDatasets\df_funct_dev.parquet")
df_expenses_items = pd.read_parquet(r"C:\Users\m_evi\OneDrive\Desktop\PI01\CleanDatasets\df_expenses_items.parquet")

In [3]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49714 entries, 0 to 49713
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             49714 non-null  object
 1   user_url            49714 non-null  object
 2   sentiment_analysis  49714 non-null  int64 
 3   posted              49714 non-null  object
 4   item_id             49714 non-null  int64 
 5   recommend           49714 non-null  bool  
 6   release_year        49714 non-null  object
 7   developer           49714 non-null  object
dtypes: bool(1), int64(2), object(5)
memory usage: 2.7+ MB


# 01.2 Developer Function

This function returns the number of items and the percentage of free content per year according to the developing company.

In [22]:
df_funct_dev.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28851 entries, 0 to 71552
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         28851 non-null  float64
 1   release_year  28851 non-null  object 
 2   developer     28851 non-null  object 
 3   item_id       28851 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 1.1+ MB


In [27]:
def developer(developer_name: str):
    # Filter games by the specified developer
    filtered_developer = df_funct_dev[df_funct_dev['developer'] == developer_name]

    # Count the number of games released per year
    game_count_by_year = filtered_developer.groupby('release_year')['item_id'].count()

    # Calculate the percentage of free games released per year
    free_games_percentage = (filtered_developer[filtered_developer['price'] == 0.0]
                             .groupby('release_year')['item_id']
                             .count() / game_count_by_year * 100).fillna(0).astype(int)

    # Create a DataFrame from the results dictionary
    results_df = pd.DataFrame({
        'Year': game_count_by_year.index,
        'Number of games': game_count_by_year.values,
        '% Free games': free_games_percentage.values
    })

    return results_df


In [28]:
developer("Kotoshiro")

Unnamed: 0,Year,Number of games,% Free games
0,2018,1,0


In [29]:

developer("Laush Dmitriy Sergeevich")

Unnamed: 0,Year,Number of games,% Free games
0,2017,9,0
1,2018,1,0


Examples

In [30]:
# Call the function 
result = developer("Kotoshiro")
result2 = developer ("Laush Dmitriy Sergeevich")
result3 = developer("Poolians.com")

# Print the result
print(result)
print(result2)
print(result3)

   Year  Number of games  % Free games
0  2018                1             0
   Year  Number of games  % Free games
0  2017                9             0
1  2018                1             0
   Year  Number of games  % Free games
0  2017                1           100


# 02.2 User_Data Function


This function provides a summary of the amount of money spent by the user, the percentage of recommendation based on `reviews.recommend`, and the quantity of items.

In [32]:
def userdata(user_id):
    
    # Filter by the user of interest
    user = df_reviews[df_reviews['user_id'] == user_id]
    # Calculate the amount of money spent for the user of interest
    amount_money = df_expenses_items[df_expenses_items['user_id']== user_id]['price'].sum()
   
    # Search for the count_item for the user of interest    
    count_items = df_expenses_items[df_expenses_items['user_id']== user_id]['items_count'].iloc[0]
    
    # Calculate the total recommendations made by the user of interest
    total_recommendations = user['recommend'].sum()
    # Calculate the total reviews made by all users
    total_reviews = len(df_reviews['user_id'].unique())
    # Calculate the percentage of recommendations made by the user of interest
    percentage_recommendations = (total_recommendations / total_reviews) * 100
    
    return {
        'user_id': user_id,
        'amount_money': amount_money,
        'percentage_recommendation': round(percentage_recommendations, 2),
        'total_items': count_items.astype(int)
    }

EJEMPLOS

In [33]:
user_id = 'EchoXSilence'
userdata(user_id)

{'user_id': 'EchoXSilence',
 'amount_money': 189.84,
 'percentage_recommendation': 0.0,
 'total_items': 23}

In [34]:
user_id1 = "js41637"
userdata(user_id1)

{'user_id': 'js41637',
 'amount_money': 8489.14,
 'percentage_recommendation': 0.01,
 'total_items': 888}

In [35]:
# Replace "your user id here" with the ID of the user you want to query
user_id2 = "76561197970982479"

# Call the userdata function with the specific user ID
userdata(user_id2)


{'user_id': '76561197970982479',
 'amount_money': 3419.32,
 'percentage_recommendation': 0.01,
 'total_items': 277}

# 03.2 UserForGenre Function

Returns the user who has accumulated the most playtime for the given genre,
along with a list of playtime accumulation by release year.

In [36]:
df_playtime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9877304 entries, 0 to 9877303
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   playtime_forever  float64
 1   user_id           object 
 2   item_id           int32  
 3   genres            object 
 4   release_year      object 
dtypes: float64(1), int32(1), object(3)
memory usage: 339.1+ MB


In [37]:
def UserForGenre(genre: str) -> dict:
    # Filter the DataFrame for the specific genre
    genre_df = df_playtime[df_playtime['genres'] == genre]
    
    # Convert playtime from minutes to hours
    genre_df['playtime_forever'] = genre_df['playtime_forever'] / 60
    
    # Group by user_id and sum the playtime_forever
    user_playtime = genre_df.groupby('user_id')['playtime_forever'].sum()
    
    # Get the user with the most playtime
    top_user = user_playtime.idxmax()
    
    # Filter the DataFrame for the top user and the specific genre
    top_user_genre_df = genre_df[genre_df['user_id'] == top_user]
    
    # Group by release_year and sum the playtime_forever
    playtime_by_year = top_user_genre_df.groupby('release_year')['playtime_forever'].sum()
    
    # Prepare the playtime list
    playtime_list = []
    for year, playtime in playtime_by_year.items():
        try:
            # Try to convert the year to an integer
            year_int = int(year)
            # Convert playtime to a native Python float
            playtime_float = float(playtime)
            playtime_list.append({"Year": year_int, "Hours": playtime_float})
        except ValueError:
            # Skip rows where year is not a numeric value
            continue
    
    return {
        "User with most playtime for Genre {}".format(genre): str(top_user),
        "Playtime": playtime_list
    }

Examples

In [38]:
genre = 'Simulation'
UserForGenre(genre)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_df['playtime_forever'] = genre_df['playtime_forever'] / 60


{'User with most playtime for Genre Simulation': 'jimmynoe',
 'Playtime': [{'Year': 2003, 'Hours': 0.16666666666666666},
  {'Year': 2006, 'Hours': 9195.316666666668},
  {'Year': 2007, 'Hours': 0.0},
  {'Year': 2009, 'Hours': 0.25},
  {'Year': 2010, 'Hours': 0.0},
  {'Year': 2011, 'Hours': 6967.383333333334},
  {'Year': 2012, 'Hours': 26.533333333333335},
  {'Year': 2013, 'Hours': 29.183333333333334},
  {'Year': 2014, 'Hours': 13.983333333333333},
  {'Year': 2015, 'Hours': 1230.2},
  {'Year': 2016, 'Hours': 239.14999999999998},
  {'Year': 2017, 'Hours': 0.0}]}

In [39]:
genre1 = 'Indie'
UserForGenre(genre1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_df['playtime_forever'] = genre_df['playtime_forever'] / 60


{'User with most playtime for Genre Indie': 'REBAS_AS_F-T',
 'Playtime': [{'Year': 1999, 'Hours': 0.0},
  {'Year': 2001, 'Hours': 0.18333333333333332},
  {'Year': 2003, 'Hours': 31.05},
  {'Year': 2005, 'Hours': 0.0},
  {'Year': 2006, 'Hours': 27.883333333333333},
  {'Year': 2007, 'Hours': 17.833333333333332},
  {'Year': 2008, 'Hours': 22.766666666666666},
  {'Year': 2009, 'Hours': 483.2166666666667},
  {'Year': 2010, 'Hours': 358.1166666666667},
  {'Year': 2011, 'Hours': 1669.25},
  {'Year': 2012, 'Hours': 2474.3166666666666},
  {'Year': 2013, 'Hours': 2822.483333333333},
  {'Year': 2014, 'Hours': 5448.783333333334},
  {'Year': 2015, 'Hours': 12529.416666666666},
  {'Year': 2016, 'Hours': 13572.883333333333},
  {'Year': 2017, 'Hours': 564.7833333333333}]}

# 04.2 BestDevYear Function

Returns the top 3 developers with the HIGHEST user-recommended games for the given year.

In [40]:
def best_developer_year(year: int):
    # Replace non-numeric 'release_year' values with NaN
    df_reviews['release_year'] = pd.to_numeric(df_reviews['release_year'], errors='coerce')
    
    # Filter DataFrame by the given year and positive recommendations
    df_filtered = df_reviews[(df_reviews['release_year'] == year) & (df_reviews['recommend'] == True) & (df_reviews['sentiment_analysis'] == 2)]
    
    # Group by developer and count the recommendations
    df_grouped = df_filtered.groupby('developer').size()
    
    # Get the top 3 developers
    top_developers = df_grouped.nlargest(3).index.tolist()
    
    # Prepare the result in the desired format
    result = [{"Rank {}".format(i+1): dev} for i, dev in enumerate(top_developers)]
    
    return result

Examples

In [41]:
year=2009
best_developer_year(year)

[{'Rank 1': 'Valve'},
 {'Rank 2': 'Tripwire Interactive'},
 {'Rank 3': 'Infinity Ward,Aspyr (Mac)'}]

In [42]:
year1=2012
best_developer_year(year1)

[{'Rank 1': 'Valve'},
 {'Rank 2': 'Gearbox Software,Aspyr (Mac &amp; Linux)'},
 {'Rank 3': 'Daybreak Game Company'}]

# 05.2 DevReviewsAnalysis Function

Returns a dictionary with the developer's name as the key and a list of total review records
from users categorized with positive sentiment analysis as the value. Negative and Positive😊

In [43]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49714 entries, 0 to 49713
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             49714 non-null  object 
 1   user_url            49714 non-null  object 
 2   sentiment_analysis  49714 non-null  int64  
 3   posted              49714 non-null  object 
 4   item_id             49714 non-null  int64  
 5   recommend           49714 non-null  bool   
 6   release_year        49430 non-null  float64
 7   developer           49714 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(4)
memory usage: 2.7+ MB


In [44]:
def dev_reviews_analysis(developer):
    # Filter the reviews for the specific developer
    reviews2 = df_reviews[df_reviews['developer'] == str(developer)]  # Convert developer name to a string
    
    # Initialize a dictionary to count the sentiment categories
    sentiment_counts = {'Negative': 0, 'Positive': 0}
    
    # Iterate through the reviews of the specified developer
    for index, row in reviews2.iterrows():
        sentiment = row['sentiment_analysis']
        sentiment_category = ''
        
        # Assign the corresponding sentiment category
        if sentiment == 0:
            sentiment_category = 'Negative'
        elif sentiment == 2:
            sentiment_category = 'Positive'
        else:
            # Skip reviews with missing or neutral sentiment
            continue
        
        # Increment the corresponding counter in the dictionary
        sentiment_counts[sentiment_category] += 1
    
    
    return {"developer": developer, "sentiment_counts": sentiment_counts}

EXAMPLE

In [45]:
developer = "Trion Worlds, Inc."
dev_reviews_analysis(developer)

{'developer': 'Trion Worlds, Inc.',
 'sentiment_counts': {'Negative': 9, 'Positive': 15}}