In [1]:
import numpy as np
import math
from scipy.stats import chisquare
from collections import Counter
import DataProcessing as dp

In [2]:
# Load the Readers data set
from GR_Data import User

users_data = 'C:/docs/goodreads/data/users.txt'
ratings_data = 'C:/docs/goodreads/data/ratings.txt'

readers = User.load_users(users_data, ratings_data)
print('Number of Readers in data set: ' + str(len(readers)))

Number of Readers in data set: 1065


In [3]:
# Generate a list of the distinct books rated across readers

library = set()

for reader in readers:
    for title in reader.books:
        library.add(title[0])
        
# Change Fiction to General Fiction
for title in library:
    if title.genre == 'Fiction':
        title.genre = 'General Fiction'

In [4]:
# Calculate the standard error of Reader genre consumption estimation
se_matrix = []

for reader in readers:
    se = []
    n = len(reader.samples)
    for genre in dp.genres:
        p = reader.fiction_split_s[genre]
        se.append(math.sqrt( (p * (1 - p ) ) / n ))
    se_matrix.append(se)
se_matrix = np.array(se_matrix)
standard_error = se_matrix.mean(axis=0)
standard_deviation = se_matrix.std(axis=0)
print(standard_error)
print(standard_deviation)

[ 0.02907171  0.02128624  0.03024064  0.0198772   0.01209605  0.02503477
  0.02507449]
[ 0.01430627  0.01297633  0.01219064  0.01287864  0.01086629  0.01291104
  0.01283782]


In [9]:
print('mean standard error: ' + str(standard_error.mean()))
print('mean standard error deviation ' + str(standard_deviation.mean()))

mean standard error: 0.0232401577065
mean standard error deviation 0.0127095740504


### Significanc testing for differences in genre consumption within groups

__null hypothesis:__ Readers within each group randomly select genres to read. The observed frequencies reflect natural sampling fluctation.

__alternative hypothesis:__ Readers are not randomly selecting genres to read. Readers are showing genre preference.
               


In [33]:
# Chi square test

# Get a list of readers by genre preference
readers_by_genre = dp.div_by_genre(readers)

# Tally the observed frequencies within each group
genre_frequency = {}

for genre in genres:
    total_counts = Counter()
    for reader in readers_by_genre[genre]:
        total_counts += reader.genre_split_s
    genre_frequency[genre] = np.array([total_counts[genre] for genre in genres], dtype=int)
    
# Tally the total number of titles rated by each group
group_totals = {}

for genre in genres:
    # Subtract titles with unknown genre
    group_totals[genre] = sum(genre_frequency[genre])

# Calculate the expected values of each group given the null hypothesis

library_cnt = Counter() 
for title in library:
    library_cnt[title.genre] += 1
    
lib_proportions = []
lib_total = sum(library_cnt.values()) - library_cnt['none']

for genre in genres:
    lib_proportions.append( library_cnt[genre] / lib_total )
lib_proportions = np.array(lib_proportions)

expected_values = {}

for genre in genres:
    expected_values[genre] = np.array(lib_proportions * group_totals[genre], dtype=int)
    
# Chi square test

results = {}

for genre in genres:
    results[genre] = chisquare(genre_frequency[genre], f_exp=expected_values[genre])

print(results)

{'General Fiction': Power_divergenceResult(statistic=126.97569155269439, pvalue=5.5668442796046592e-25), 'Romance': Power_divergenceResult(statistic=97.951143426381989, pvalue=6.7117976737344926e-19), 'Fantasy': Power_divergenceResult(statistic=107.72128751302724, pvalue=6.113018397861703e-21), 'Science Fiction': Power_divergenceResult(statistic=125.0967580045569, pvalue=1.3831543077918578e-24), 'Horror': Power_divergenceResult(statistic=51.452148816976155, pvalue=2.4030111360643686e-09), 'Mystery': Power_divergenceResult(statistic=74.645827860117464, pvalue=4.539908435456227e-14), 'Young Adult': Power_divergenceResult(statistic=124.86646934095162, pvalue=1.5463305440250079e-24)}
