## Testing hypothesis 6

---

**_Hypothesis_**: The larger the number of books published in a category, the higher its review score (as a marketing strategy, publishers tend to publish books in the most-liked categories). Likewise, the larger the number of books published by a publisher, the higher its review score (books published by the best-known publishers are preferred).

   - **Metric**: correlation coefficients, Kolmogorov-Smirnov, Chi-Square, Wilcoxon (for future normalization)

**Missing Values**:

  - `publisher`: remove the entire sample
  - `review/score`: remove the entire sample
  - `categories`: remove the entire sample

**Data Transformation**:

  - `categories`: GroupBy categories.
  - `publisher`: GroupBy publisher.
  - `review/score`: Compute the average review/score for each publisher and category.

---

In [None]:
# Connect to MongoDB

import pymongo

# Open the local MongoDB instance and grab the joined books collection
client = pymongo.MongoClient('mongodb://localhost:27017/')
database = client.get_database('spark_db')
books = database.get_collection('books_joined')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Analysis of `categories`

In [None]:
# Deal with missing values: keep reviews that have a non-zero score,
# a non-null publisher and a non-null category.
pipeline_missing = {'$match': {
    'review/score': {'$exists': True, '$ne': 0.0},
    'publisher': {'$exists': True, '$ne': None},
    # `$exists: True` on its own still matches explicit nulls, so null
    # categories are excluded the same way as null publishers.
    'categories': {'$exists': True, '$ne': None},
}
}

# Compute average rating, number of reviews and the set of distinct titles
# for each category
pipeline_average_rating = {'$group': {
    '_id': '$categories',
    'avg_score': {'$avg': '$review/score'},
    'count': {'$sum': 1},
    'titles': {'$addToSet': '$Title'}
}
}

# Compute number of different titles for each category
pipeline_project = {'$project': {
    '_id': 0,
    'category': '$_id',
    'avg_score': 1,
    'count': 1,
    'num_titles': {'$size': '$titles'}
}
}

# Remove categories with a single review (count must be > 1).
# NOTE(review): confirm this cut-off; the num_titles filter applied below
# is stricter than this one.
pipeline_remove = {'$match': {
    'count': {'$gt': 1}
}
}

results_2 = books.aggregate(
    [pipeline_missing, pipeline_average_rating, pipeline_project, pipeline_remove])

df_results_2_category = pd.DataFrame(list(results_2))

# Weighted score: average rating scaled by sqrt(number of distinct titles)
df_results_2_category['avg_score_weighted'] = df_results_2_category['avg_score'] * \
    np.sqrt(df_results_2_category['num_titles'])

# Keep only categories with more than `min_titles_category` distinct titles.
# The explicit copy makes the result an independent frame rather than a
# selection of the unfiltered one.
min_titles_category = 50
df_results_2_category = df_results_2_category[
    df_results_2_category['num_titles'] > min_titles_category].copy()

In [None]:
# Preview the categories with the most distinct titles
categories_by_titles = df_results_2_category.sort_values(
    by='num_titles', ascending=False)
categories_by_titles.head()

In [None]:
# Top 20 categories ranked by weighted score (avg_score * sqrt(num_titles)).
# The frame is rebound to its sorted version, so later cells see the same
# row order as before.
df_results_2_category = df_results_2_category.sort_values(
    by='avg_score_weighted', ascending=False)
top_weighted_categories = df_results_2_category.head(20)
top_weighted_categories.plot(
    kind='bar',
    x='category',
    y='avg_score_weighted',
    figsize=(15, 5),
    xlabel='category',
    title='Average rating for each category * sqrt(num different titles)',
    legend=False)

# Top 20 categories ranked by number of reviews
df_results_2_category = df_results_2_category.sort_values(
    by='count', ascending=False)
most_reviewed_categories = df_results_2_category.head(20)
most_reviewed_categories.plot(
    kind='bar',
    x='category',
    y='count',
    figsize=(15, 5),
    xlabel='category',
    title='Number of reviews for each category',
    legend=False)

### Analysis of `publishers`

In [None]:
# Deal with missing values: keep reviews that have a non-zero score,
# a non-null publisher and a non-null category.
pipeline_missing = {'$match': {
    'review/score': {'$exists': True, '$ne': 0.0},
    'publisher': {'$exists': True, '$ne': None},
    # `$exists: True` on its own still matches explicit nulls, so null
    # categories are excluded the same way as null publishers.
    'categories': {'$exists': True, '$ne': None},
}
}

# Compute average rating, number of reviews and the set of distinct titles
# for each publisher
pipeline_average_rating = {'$group': {
    '_id': '$publisher',
    'avg_score': {'$avg': '$review/score'},
    'count': {'$sum': 1},
    'titles': {'$addToSet': '$Title'}
}
}

# Compute number of different titles for each publisher
pipeline_project = {'$project': {
    '_id': 0,
    'publisher': '$_id',
    'avg_score': 1,
    'count': 1,
    'num_titles': {'$size': '$titles'}
}
}

# Remove publishers with a single review (count must be > 1).
# NOTE(review): confirm this cut-off; the num_titles filter applied below
# is stricter than this one.
pipeline_remove = {'$match': {
    'count': {'$gt': 1}
}
}

results_2 = books.aggregate(
    [pipeline_missing, pipeline_average_rating, pipeline_project, pipeline_remove])

df_results_2_publisher = pd.DataFrame(list(results_2))

# Weighted score: average rating scaled by sqrt(number of distinct titles)
df_results_2_publisher['avg_score_weighted'] = df_results_2_publisher['avg_score'] * \
    np.sqrt(df_results_2_publisher['num_titles'])

# Keep only publishers with more than `min_titles_publisher` distinct titles.
# The explicit copy makes the result an independent frame rather than a
# selection of the unfiltered one.
min_titles_publisher = 20
df_results_2_publisher = df_results_2_publisher[
    df_results_2_publisher['num_titles'] > min_titles_publisher].copy()

In [None]:
# Preview the first rows of the per-publisher summary
df_results_2_publisher.head(n=5)

In [None]:
# Top 20 publishers ranked by weighted score (avg_score * sqrt(num_titles)).
# The frame is rebound to its sorted version, so later cells see the same
# row order as before.
df_results_2_publisher = df_results_2_publisher.sort_values(
    by='avg_score_weighted', ascending=False)
top_weighted_publishers = df_results_2_publisher.head(20)
top_weighted_publishers.plot(
    kind='bar',
    x='publisher',
    y='avg_score_weighted',
    figsize=(15, 5),
    xlabel='publisher',
    title='Average rating for each publisher * sqrt(num different titles)',
    legend=False)

# Top 20 publishers ranked by number of reviews
df_results_2_publisher = df_results_2_publisher.sort_values(
    by='count', ascending=False)
most_reviewed_publishers = df_results_2_publisher.head(20)
most_reviewed_publishers.plot(
    kind='bar',
    x='publisher',
    y='count',
    figsize=(15, 5),
    xlabel='publisher',
    title='Number of reviews for each publisher',
    legend=False)

## **Correlation** between `avg_score` and the number of different titles published

In [None]:
# Scatter plot: average score per category against its number of distinct titles.
# Categories with 2000 or more titles are treated as outliers and left out
# of this plot.
is_regular_category = df_results_2_category['num_titles'] < 2000
df_results_2_category_filtered = df_results_2_category.loc[is_regular_category]
df_results_2_category_filtered.plot(
    kind='scatter',
    x='num_titles',
    y='avg_score',
    figsize=(15, 5),
    xlabel='num_titles',
    title='Average rating for each category respect to number of different titles',
    legend=False)

In [None]:
# Import only the helper that is used; a wildcard import would pull every
# public name of the module into the notebook namespace.
from scipy_analyze import scipy_analize

# Statistical analysis of num_titles against avg_score for categories.
# NOTE(review): this runs on the full frame, not on
# df_results_2_category_filtered (outliers removed) - confirm that is intended.
scipy_analize(
    df_results_2_category['num_titles'], 'num_titles',
    df_results_2_category['avg_score'], 'avg_score',
    [1, 2, 3, 4, 5])

In [None]:
# Scatter plot: average score per publisher against its number of distinct titles.
# Publishers with 1100 or more titles are treated as outliers and left out
# of this plot.
is_regular_publisher = df_results_2_publisher['num_titles'] < 1100
df_results_2_publisher_filtered = df_results_2_publisher.loc[is_regular_publisher]
df_results_2_publisher_filtered.plot(
    kind='scatter',
    x='num_titles',
    y='avg_score',
    figsize=(15, 5),
    xlabel='num_titles',
    title='Average rating for each publisher respect to number of different titles',
    legend=False)

In [None]:
# Import only the helper that is used; a wildcard import would pull every
# public name of the module into the notebook namespace.
from scipy_analyze import scipy_analize

# Statistical analysis of num_titles against avg_score for publishers.
# NOTE(review): this runs on the full frame, not on
# df_results_2_publisher_filtered (outliers removed) - confirm that is intended.
scipy_analize(
    df_results_2_publisher['num_titles'], 'num_titles',
    df_results_2_publisher['avg_score'], 'avg_score',
    [1, 2, 3, 4, 5])

## **Conclusions**
#### There is no correlation between the number of books published and the average review score, for either categories or publishers.

---

## Further analysis: which are the best publishers?

#### Best = capable of getting very high ratings on their books, in a large number of categories.

In [None]:
# Deal with missing values: keep reviews that have a non-zero score,
# a non-null publisher and a non-null category.
pipeline_missing = {'$match': {
    'review/score': {'$exists': True, '$ne': 0},
    'publisher': {'$exists': True, '$ne': None},
    # `$exists: True` on its own still matches explicit nulls, so null
    # categories are excluded the same way as null publishers.
    'categories': {'$exists': True, '$ne': None},
}
}

# Compute average rating and number of reviews for each
# (category, publisher) pair
pipeline_average_rating = {'$group': {
    '_id': {
        'category': '$categories',
        'publisher': '$publisher',
    },
    'avg_score': {'$avg': '$review/score'},
    'count': {'$sum': 1}
}
}

# Remove (category, publisher) pairs with less than 'threshold' reviews
threshold = 0
pipeline_remove = {'$match': {
    'count': {'$gte': threshold}
}
}

# For each publisher, count the categories whose average rating is above
# `min_avg_score`. The pairs are counted directly while grouping, so there
# is no need to collect them into one array per publisher and unwind it again.
min_avg_score = 4.5
pipeline_sum = {'$group': {
    '_id': '$_id.publisher',
    'total': {
        '$sum': {
            '$cond': {
                'if': {'$gt': ['$avg_score', min_avg_score]},
                'then': 1,
                'else': 0
            }
        }
    }
}
}

# Highest totals first; `_id` breaks ties so the ranking is reproducible
pipeline_sort = {'$sort': {
    'total': -1,
    '_id': 1
}
}

results = books.aggregate([pipeline_missing, pipeline_average_rating,
                           pipeline_remove, pipeline_sum, pipeline_sort])

df_results = pd.DataFrame(list(results))

In [None]:
# Preview the publishers with the highest totals
df_results.head(n=5)

In [None]:
# Bar chart of the first 20 publishers in the ranking
top_publishers = df_results.head(20)
top_publishers.plot(
    kind='bar',
    x='_id',
    y='total',
    figsize=(15, 5),
    xlabel='publisher',
    title='Number of categories with average rating > 4.5 for each publisher',
    legend=False)

---

## Further analysis: which categories have the largest number of best publishers?

#### Best = capable of getting very high ratings on their books.

In [None]:
# Deal with missing values: keep reviews that have a non-zero score,
# a non-null publisher and a non-null category.
pipeline_missing = {'$match': {
    'review/score': {'$exists': True, '$ne': 0},
    'publisher': {'$exists': True, '$ne': None},
    # `$exists: True` on its own still matches explicit nulls, so null
    # categories are excluded the same way as null publishers.
    'categories': {'$exists': True, '$ne': None},
}
}

# Compute average rating and number of reviews for each
# (category, publisher) pair
pipeline_average_rating = {'$group': {
    '_id': {
        'category': '$categories',
        'publisher': '$publisher',
    },
    'avg_score': {'$avg': '$review/score'},
    'count': {'$sum': 1}
}
}

# Remove (category, publisher) pairs with less than 'threshold' reviews
threshold = 0
pipeline_remove = {'$match': {
    'count': {'$gte': threshold}
}
}

# For each category, count the publishers whose average rating is above
# `min_avg_score`. The pairs are counted directly while grouping, so there
# is no need to collect them into one array per category and unwind it again.
min_avg_score = 4.5
pipeline_sum = {'$group': {
    '_id': '$_id.category',
    'total': {
        '$sum': {
            '$cond': {
                'if': {'$gt': ['$avg_score', min_avg_score]},
                'then': 1,
                'else': 0
            }
        }
    }
}
}

# Highest totals first; `_id` breaks ties so the ranking is reproducible
pipeline_sort = {'$sort': {
    'total': -1,
    '_id': 1
}
}

results = books.aggregate([pipeline_missing, pipeline_average_rating,
                           pipeline_remove, pipeline_sum, pipeline_sort])

df_results = pd.DataFrame(list(results))

In [None]:
# Preview the categories with the highest totals
df_results.head(n=5)

In [None]:
# Bar chart of the first 20 categories in the ranking
top_categories = df_results.head(20)
top_categories.plot(
    kind='bar',
    x='_id',
    y='total',
    figsize=(15, 5),
    xlabel='category',
    title='Number of publishers with average rating > 4.5 for each category',
    legend=False)