In [1]:
import pandas as pd

# Hypothesis 3 



# Dealing with Game Genres
 A row for each game, a column for each game genre

In [2]:
# load the game-info dataframe and keep only appid and genres 
ga = pd.read_csv("./buruf-datasets/game-info/steam.csv")
ga = ga[['appid', 'genres']]

In [3]:
# split genres string into a list
ga['genres'] = ga.genres.str.split(';').apply(lambda x : x)

In [4]:
ga

Unnamed: 0,appid,genres
0,10,[Action]
1,20,[Action]
2,30,[Action]
3,40,[Action]
4,50,[Action]
...,...,...
27070,1065230,"[Adventure, Casual, Indie]"
27071,1065570,"[Action, Adventure, Indie]"
27072,1065650,"[Action, Casual, Indie]"
27073,1066700,"[Adventure, Casual, Indie]"


In [5]:
# get all unique genres
all_genres = ga.genres.apply(pd.Series).stack().reset_index(drop=True).unique()
len(all_genres)

29

In [6]:
# a dict for each game, each dict has all of the genres, val=1 if genre applies to game
gen_dicts = ga.genres.apply(lambda gens : { **{g:0 for g in all_genres}, **{ g:1 for g in gens } } )

In [7]:
# delete now useless column
ga = ga.drop(columns=['genres'])

In [8]:
# merge with original
ga = pd.concat( [ga.reset_index(drop=True), gen_dicts.apply(pd.Series).reset_index(drop=True)] , axis=1)

In [9]:
ga.head(2)

Unnamed: 0,appid,Action,Free to Play,Strategy,Adventure,Indie,RPG,Animation & Modeling,Video Production,Casual,...,Web Publishing,Education,Software Training,Sexual Content,Audio Production,Game Development,Photo Editing,Accounting,Documentary,Tutorial
0,10,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Dealing with Reviews
Only keep reviews by players who authored more than one.

In [10]:
df = pd.read_json("./small_reviews.json",  lines=True)

In [11]:
# Unbox Mongodb useless extra object
df['steamid'] = df.steamid.apply(lambda x : int(x['$numberLong']))

In [12]:
# Remove useless Mongodb _id
df = df.drop(columns=['_id'])

In [13]:
# only keep useful columns
df = df[['appid', 'steamid', 'voted_up']]

In [14]:
# how many unique players are there?
len(df.steamid.unique())

290380

In [15]:
# how many players that wrote at least more than one review?
df.steamid.duplicated().sum()

18144

In [16]:
df = df.reset_index(drop=True)

In [17]:
df.voted_up.value_counts()

True     268066
False     40458
Name: voted_up, dtype: int64

# Merge the two datasets

In [18]:
df = df.merge(ga, on='appid')

In [19]:
len(df) # check same as df.steamid.duplicated().sum() if all games present in ga

308524

In [20]:
len(df)

308524

In [21]:
df

Unnamed: 0,appid,steamid,voted_up,Action,Free to Play,Strategy,Adventure,Indie,RPG,Animation & Modeling,...,Web Publishing,Education,Software Training,Sexual Content,Audio Production,Game Development,Photo Editing,Accounting,Documentary,Tutorial
0,204100,76561198255525846,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,204100,76561198058159765,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,204100,76561198372464367,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,204100,76561198126769984,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,204100,76561198090877508,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308519,598980,76561198065227706,True,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
308520,598980,76561198153693572,True,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
308521,597970,76561198030747313,True,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
308522,597970,76561197985682990,True,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# (A parte)
#### apprzzamnto_genere = num_rev_pos_genere / num_rev_genere


In [32]:
tmp =  df[df.voted_up].sum() / df.sum() 
tmp  = tmp[3:]
tmp = 100*tmp.sort_values(ascending=False)
tmp

Action                   100.0
Free to Play             100.0
Game Development         100.0
Audio Production         100.0
Sexual Content           100.0
Software Training        100.0
Education                100.0
Web Publishing           100.0
Design & Illustration    100.0
Utilities                100.0
Gore                     100.0
Early Access             100.0
Sports                   100.0
Nudity                   100.0
Massively Multiplayer    100.0
Violent                  100.0
Racing                   100.0
Simulation               100.0
Casual                   100.0
Video Production         100.0
Animation & Modeling     100.0
RPG                      100.0
Indie                    100.0
Adventure                100.0
Strategy                 100.0
Photo Editing            100.0
Accounting                 NaN
Documentary                NaN
Tutorial                   NaN
dtype: float64

# Hypothesis 3

In [23]:
df = df[df.voted_up]

In [24]:
# players that have reviewed at least N games
N = 4
tm1 = df.steamid.value_counts()#.sort_values(ascending=False)
n_ga_players = tm1[df.steamid.value_counts() >= N].index
len(n_ga_players)

386

In [25]:
marchese =  df[df.steamid.apply(lambda x : x in n_ga_players)]

In [26]:
marchese = marchese.groupby('steamid').sum()
marchese = marchese.reset_index(drop=True)

In [27]:
marchese['max'] = marchese.iloc[:, 2:].max(axis=1)

In [28]:
marchese['ratio_'] = 100*marchese['max']/marchese.voted_up

In [29]:
marchese.ratio_

0       75.000000
1       83.333333
2       80.000000
3       62.500000
4       75.000000
          ...    
381     75.000000
382     60.000000
383    100.000000
384    100.000000
385    100.000000
Name: ratio_, Length: 386, dtype: float64

In [30]:
import scipy.stats as stats
import math

# Specify the sample mean (x_bar), the sample standard deviation (s), the mean claimed in the null-hypothesis (mu_null), and the sample size (n)
x_bar = marchese.ratio_.mean()
s = marchese.ratio_.std()
mu_null = 70
n = len(marchese.ratio_)

x_bar, s, mu_null, n


# Calculate the test statistic
test_stat = (x_bar - mu_null)/(s/math.sqrt(n))

# # Output the p-value of the test statistic (right tailed test)
p =  1-stats.t.cdf(test_stat, n-1)

test_stat, p

(8.259844186349728, 1.2212453270876722e-15)

In [31]:
# RECYCLED SNIPPETS
# df["playtime_after_review"] = df.playtime_forever - df.playtime_at_review
# from scipy.stats import spearmanr
# x = df.voted_up.astype(int)
# y = df.playtime_after_review
# r, p = spearmanr(x, y, alternative="greater")
# r, p
# df.playtime_after_review.quantile(q=0.75)