In [1]:
# imports
import pandas as pd, pymongo, pandas as pd, ipywidgets as widgets, scipy.stats as stats, math

# Third Hypotesis:

In the following notebook it will be analyzed the following hypotesis:
 - **Are people more loyal to a specific game genre** *(no matter which type of genre)***, or do they like changing it?**

## Local analysis

First we'll consider a small set of data, in order to locally perform the analysis using standard python and its libraries.<br>
If relevant results are obtained, the analysis will be replicated by using Big Data tools (HDFS, Spark, ...).

### Obtaining games genre/s

- **TODO:** Read the games data not directly from .csv but from a .json 

In [2]:
# load the games dataset and keep only appid and genres 
games_df = pd.read_csv("../../datasets/games/steam.csv")
games_df = games_df[['appid', 'genres']]

In [3]:
# split genres string into a list
games_df['genres'] = games_df.genres.str.split(';').apply(lambda x : x)

In [4]:
games_df

Unnamed: 0,appid,genres
0,10,[Action]
1,20,[Action]
2,30,[Action]
3,40,[Action]
4,50,[Action]
...,...,...
27070,1065230,"[Adventure, Casual, Indie]"
27071,1065570,"[Action, Adventure, Indie]"
27072,1065650,"[Action, Casual, Indie]"
27073,1066700,"[Adventure, Casual, Indie]"


#### Create a dataframe that contains for each game a 1 on the column of a specific genre if the game is of that genre

It is a sort of one hot encoding, but a game can at least have one genre

In [5]:
# get all unique genres
all_genres = games_df.genres.apply(pd.Series).stack().reset_index(drop=True).unique()
# a dict for each game, each dict has all of the genres, val=1 if genre applies to game
gen_dicts = games_df.genres.apply(lambda gens : { **{g:0 for g in all_genres}, **{ g:1 for g in gens } } )
# delete now useless column
del games_df['genres']
# merge with original
games_df = pd.concat( [games_df.reset_index(drop=True), gen_dicts.apply(pd.Series).reset_index(drop=True)] , axis=1)

In [6]:
games_df

Unnamed: 0,appid,Action,Free to Play,Strategy,Adventure,Indie,RPG,Animation & Modeling,Video Production,Casual,...,Web Publishing,Education,Software Training,Sexual Content,Audio Production,Game Development,Photo Editing,Accounting,Documentary,Tutorial
0,10,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27070,1065230,0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
27071,1065570,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27072,1065650,1,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
27073,1066700,0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Obtaining useful reviews data

In [7]:
# Connect to the mongo local and load as a dataframe the small_reviews collection
mongo = pymongo.MongoClient()
mongo_db = mongo.final_project
# only keep useful data
small_reviews_df = pd.DataFrame(list(mongo_db.small_reviews.find({}, {'_id': False})))
mongo.close() #Close the connection

In [8]:
# only keep useful data
small_reviews_df = small_reviews_df[['appid', 'steamid', 'voted_up']]

### Merge reviews and games genre/s datasets

In [9]:
entire_df = small_reviews_df.merge(games_df, on='appid')
entire_df

Unnamed: 0,appid,steamid,voted_up,Action,Free to Play,Strategy,Adventure,Indie,RPG,Animation & Modeling,...,Web Publishing,Education,Software Training,Sexual Content,Audio Production,Game Development,Photo Editing,Accounting,Documentary,Tutorial
0,204100,76561198058159765,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,204100,76561198126769984,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,204100,76561198372464367,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,204100,76561198255525846,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,204100,76561198090877508,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308519,598980,76561198945436777,False,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
308520,598980,76561198065227706,True,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
308521,597970,76561198030747313,True,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
308522,597970,76561197985682990,True,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### (ASIDE) Is a specific genre more appreciated with respect to others? 
The metric useful for that analysis is the following: 
 - **genre_appreciation = (num_pos_reviews_genre / num_reviews_genre) * 100**


In [10]:
genre_appreciation = round(((entire_df[entire_df.voted_up].sum() / entire_df.sum()[3:]).sort_values(ascending=False) * 100), 2)
genre_appreciation

Animation & Modeling     97.81
Design & Illustration    97.38
Utilities                96.07
Photo Editing            90.00
Indie                    89.27
Simulation               88.27
Casual                   88.12
RPG                      87.34
Web Publishing           87.21
Strategy                 86.63
Racing                   86.43
Early Access             86.31
Video Production         86.21
Sports                   86.06
Audio Production         85.71
Action                   85.71
Free to Play             85.31
Adventure                84.19
Nudity                   81.90
Gore                     81.04
Game Development         80.00
Violent                  79.56
Sexual Content           78.25
Massively Multiplayer    73.15
Education                71.70
Software Training        70.21
Accounting                 NaN
Documentary                NaN
Tutorial                   NaN
appid                      NaN
steamid                    NaN
voted_up                   NaN
dtype: f

### Analysis of the hypothesis

The first objective is to find for the users (that have written at least N positive reviews) which are their relative fidelity level to the more liked genre, which has been defined as:
 - **user_fidelity = (more_reviewed_genre / num_pos_reviews) * 100**

<br> Once obtained that fidelity level for each user, we want to know if at least a percentage of users that have written at least N positive reviews are all loyal to the same genre (no matter which one). <br>
In order to have a measure on the statistical significance of that hypotesis a *Mean Test* has been performed.<br>
This test (fixed M) works as follows:
 - H0: mu = M%
 - H1: mu >= M%
 
Whereas the significance level is a percentage probability of accidentally making the wrong conclusion.<br>
For more details read the following link:
 - https://www.w3schools.com/statistics/statistics_hypothesis_testing_mean.php

In [11]:
# Select only the positive reviews
entire_df = entire_df[entire_df.voted_up]

In [12]:
# players that have positively reviewed at least N games (N is selectable)
written_reviews_each_user = entire_df.steamid.value_counts()
n_pos_reviews_slider = widgets.IntSlider(min=1, max=(written_reviews_each_user.max()), value=4)

In [13]:
steamid_written_n_pos_reviews = entire_df.steamid.value_counts()[entire_df.steamid.value_counts() >= n_pos_reviews_slider.value].index
len(steamid_written_n_pos_reviews)

386

In [14]:
# small_entire_df contains the details of the users (review + game reviewed genre/s info) that have written at least N positive reviews
small_entire_df = entire_df[entire_df.steamid.apply(lambda x : x in steamid_written_n_pos_reviews)]

# Select for each user which is the number of reviews of the more reviewed genre
num_more_reviewed_genre = small_entire_df.groupby('steamid').sum().iloc[:, 2:].max(axis=1)

In [15]:
# User fidelity measures how much a user is loyal to the more played genre
num_pos_reviews = small_entire_df.groupby('steamid').sum().voted_up
user_fidelity = num_more_reviewed_genre / num_pos_reviews * 100
user_fidelity

steamid
76561197960271994     75.000000
76561197960319772     83.333333
76561197960344344     80.000000
76561197960373660     62.500000
76561197960396581     75.000000
                        ...    
76561198410169156     75.000000
76561198822076089     60.000000
76561198888079295    100.000000
76561199005984205    100.000000
76561199095484177    100.000000
Length: 386, dtype: float64

#### Statistical test

In [16]:
# Specify the sample mean (x_bar), the sample standard deviation (s), the mean claimed in the null-hypothesis (mu_null), and the sample size (n)
x_bar = user_fidelity.mean()
s = user_fidelity.std()
mu = 70
n = len(user_fidelity)

# Calculate the test statistic
statistics = (x_bar - mu)/(s / math.sqrt(n))

# # Output the p-value of the test statistic (right tailed test)
p_value =  1 - stats.t.cdf(statistics , n - 1)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
# Level of significance
alpha = 0.05
# conclusion
if p_value < alpha:
    # users tend to be loyal to the same genre, no matter which genre
    print('\nReject the null hypothesis')
else:
    # opposite of above
    print('\nDo not reject the null hypothesis')


Mean test coefficient: 8.260 
P-value: 0.000

Reject the null hypothesis


## Big Data analysis