# Basic Exploratory Analysis and Visualisation

In [1]:
import os, sys, inspect

# Put the parent of the notebook's working directory on the import path so the
# project-local `database_utils` module can be imported.
# The working directory is used rather than
# inspect.getfile(inspect.currentframe()): inside a kernel the "file" of a cell
# is a synthetic name, and recent ipykernel versions report a temporary path
# for it, which would make `parentdir` point at the wrong location.
# NOTE(review): assumes the kernel is started in the notebook's own directory
# (the Jupyter default) — confirm if the notebook is launched differently.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from database_utils import get_query_results

In [2]:
# Display helpers, numerics and plotting libraries used by the cells below.
from IPython.display import Image
from IPython.core.display import HTML 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objs as go
# NOTE(review): `plotly.plotly` presumably only exists in plotly < 4 (it was
# later split out into a separate package) — confirm the pinned plotly version.
import plotly.plotly as py
from plotly import tools
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable plotly's offline notebook mode so the iplot() calls further down can
# draw figures inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

## Summary table

The basic analysis was performed by Peter Lipp using the created database and SQL queries. Below is the summary table that was used for reference for further visualisations: 

In [3]:
# Display the reference summary table, loaded from an image via a relative URL.
Image(url= "goal_view.jpg")

Get the relevant data: 
- counts of books by club and genre  
- counts of books by book club (so that we can calculate percentages)  

In [7]:
# Query 1: number of books per (book club, genre) pair.
# NOTE(review): the RIGHT JOIN to `genre` is followed by inner joins on
# b.book_id, which presumably drop the genre-only rows the RIGHT JOIN would
# add — confirm whether a plain JOIN was intended.
q = """
SELECT bc.club_name, g.name, count(*) cnt
FROM book b
JOIN genre_book gb ON b.book_id = gb.book_id
RIGHT JOIN genre g ON gb.genre_id = g.genre_id
JOIN book_club_book bcb ON bcb.book_id = b.book_id
JOIN book_club bc ON bc.book_club_id = bcb.book_club_id
GROUP BY bc.club_name, g.name 
"""
# Query 2: total number of books per book club, ordered by club name. These
# totals are the denominators for the percentages computed later.
q2 = """
SELECT bc.club_name, count(*) cnt
FROM book b
JOIN book_club_book bcb ON bcb.book_id = b.book_id
JOIN book_club bc ON bc.book_club_id = bcb.book_club_id
GROUP BY bc.club_name
ORDER BY bc.club_name
"""
# Run both queries through the project helper.
# NOTE(review): get_query_results presumably returns a pandas DataFrame with
# one column per selected field (later cells use .club_name / .name / .cnt) —
# confirm in database_utils.
by_genre = get_query_results(q)
counts = get_query_results(q2)

Now we need to write a function that calculates percentages for any genre we select and accounts for the fact that some book club–genre pairs have no books at all. 

In [16]:
def get_percentages(genre_name: str, by_genre: pd.DataFrame, counts: pd.DataFrame) -> pd.Series:
    """Return, per book club, the rounded percentage of its books in one genre.

    Parameters
    ----------
    genre_name : str
        Genre to report on; compared against the ``name`` column of ``by_genre``.
    by_genre : pd.DataFrame
        Book counts per club and genre, with columns ``club_name``, ``name``
        and ``cnt``.
    counts : pd.DataFrame
        Total book counts per club, with columns ``club_name`` and ``cnt``.

    Returns
    -------
    pd.Series
        One value per row of ``counts``, in the same row order, named ``cnt``
        and indexed 0..n-1. Clubs with no book in the genre get 0.
    """
    # The set of clubs comes from `counts` itself, so the function does not
    # depend on a notebook-level variable being defined beforehand.
    totals = counts[['club_name', 'cnt']].rename(columns={'cnt': 'total'})
    genre_counts = by_genre.loc[by_genre['name'] == genre_name, ['club_name', 'cnt']]
    # Join on the club name so every count is divided by its own club's total,
    # whatever the row order or index of the two frames.
    merged = totals.merge(genre_counts, on='club_name', how='left')
    genre_cnt = merged['cnt'].fillna(0)  # club has no book in this genre
    return np.round(genre_cnt / merged['total'] * 100).rename('cnt')


# The cells of this notebook call the function as `get_list`; keep that name
# available so they run on a fresh kernel.
get_list = get_percentages
# Quick check: share of non-fiction books per club. Uses the name the function
# is actually defined under (`get_list` is not defined anywhere in the notebook).
get_percentages('non-fiction', by_genre, counts)

0    31.0
1     0.0
2    19.0
3    20.0
Name: cnt, dtype: float64

Also let's prepare a list of book clubs so that we have it at hand all the time:

In [17]:
# Club names, explicitly ordered by name. The plots below pair this list
# positionally with percentages that follow the `ORDER BY bc.club_name` order
# of `counts`, so the order must not be left to the database's discretion.
club_names = list(get_query_results("SELECT * FROM book_club ORDER BY club_name").club_name)
club_names

['Bertelsmann Data Science book readers',
 'Gone with a Book',
 "Pop Sugar's Annual Ultimate Reading Challenge",
 'Reading with Style']

## Visualisations 

### Fiction vs Non-fiction

The first visualisation we can make here is fiction vs non-fiction books across different book clubs. 

In [25]:
genres_list = ['fiction', 'non-fiction']
# One bar series per genre, one bar per club. The percentages come from
# get_percentages (the function defined above; `get_list` is not defined in
# this notebook).
data = [go.Bar(
        x=club_names,
        y=get_percentages(genre, by_genre, counts),
        name=genre) for genre in genres_list]
layout = go.Layout(
    title='Fiction vs non-fiction by book club',
    margin = dict(b=150),  # extra bottom margin for the long, rotated club names
    xaxis = dict(tickangle = 25),
    yaxis = dict(title='Share of club books (%)'),
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar',config={'showLink': False})
# show image result because Github doesn't render plotly figures 
Image(url='fiction_nonfiction.png')

### Classics, Fantasy, History, Science Fiction and Romance

Next, let's visualise the second section of the table for 5 genres: 
- classics
- fantasy
- history 
- science-fiction
- romance

In [34]:
genres_list = ['classics', 'fantasy', 'history', 'science-fiction', 'romance']
# One bar series per genre, one bar per club. The percentages come from
# get_percentages (the function defined above; `get_list` is not defined in
# this notebook).
data = [go.Bar(
        x=club_names,
        y=get_percentages(genre, by_genre, counts),
        name=genre) for genre in genres_list]
layout = go.Layout(
    title='Classics, fantasy, history, science fiction and romance by book club',
    margin = dict(b=150),  # extra bottom margin for the long, rotated club names
    xaxis = dict(tickangle = 25),
    yaxis = dict(title='Share of club books (%)'),
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar',config={'showLink': False})

# show image result because Github doesn't render plotly figures 
Image(url='five_genres.png')

## Bertelsmann vs other groups

Now let's try to summarize everything on one plot. First, we will select all the genres used in the summary table: 

In [41]:
# All genres used in the summary table. Kept as a list because a later cell
# sorts it in place.
genres_list = [
    'fiction',
    'non-fiction',
    'classics',
    'fantasy',
    'history',
    'science-fiction',
    'romance',
    'adult',
    'young-adult',
    'personal-development',
    'philosophy',
]

Create two subsets of each data frame by separating the Bertelsmann records from the non-Bertelsmann ones: 

In [42]:
# Name of the club that is compared against all the other clubs combined.
selvec = 'Bertelsmann Data Science book readers'
# Per-genre counts for the Bertelsmann club alone.
bert = by_genre[by_genre.club_name == selvec][['name','cnt']]
# Per-genre counts summed over every other club. Only `cnt` is aggregated:
# a bare .sum() would also try to "sum" the string column `club_name`, whose
# handling differs between pandas versions.
nonbert = by_genre[by_genre.club_name != selvec].groupby(by='name', as_index=False)['cnt'].sum()
# Total number of books in each of the two groups (the percentage denominators).
counts_bert = counts[counts.club_name == selvec].cnt.sum()
counts_nonbert = counts[counts.club_name != selvec].cnt.sum()

Now we can use these subsets to calculate percentages from the corresponding totals:

In [70]:
def _genre_percentages(subset, total):
    """Build a frame with one row per selected genre and its rounded share.

    `subset` holds per-genre counts (columns `name` and `cnt`); `total` is the
    number of books the counts are expressed as a percentage of. Genres that
    are missing from `subset` end up with a count, and hence a share, of 0.
    """
    selected = subset[subset.name.isin(genres_list)].sort_values(by='name')
    # Left join on the shared `name` column keeps every selected genre.
    table = genres.merge(selected, how='left').fillna(0)
    table['perc'] = np.round((table.cnt / total) * 100)
    return table


# One row per selected genre, in the current order of genres_list.
genres = pd.DataFrame({'name': genres_list})
bert_df = _genre_percentages(bert, counts_bert)
nonbert_df = _genre_percentages(nonbert, counts_nonbert)

And we can finally plot that: 

In [71]:
clubs = ['Bertelsmann DS book club', 'Other clubs']
# The x axis shows the genres alphabetically.
genres_list.sort()
# Look the percentages up by genre name so that each bar matches its x label.
# bert_df / nonbert_df were built in the unsorted genre order, and the names
# bert_list / nonbert_list were not defined anywhere before this cell.
bert_list = bert_df.set_index('name')['perc'].reindex(genres_list)
nonbert_list = nonbert_df.set_index('name')['perc'].reindex(genres_list)
trace1 = go.Bar(
    x=genres_list,
    y=list(bert_list),
    name=clubs[0])
trace2 = go.Bar(
    x=genres_list,
    y=list(nonbert_list),
    name=clubs[1])
data = [trace1, trace2]
layout = go.Layout(
    title='Genre shares: Bertelsmann DS book club vs other clubs',
    yaxis = dict(title='Share of books (%)'),
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar',config={'showLink': False})

# show image result because Github doesn't render plotly figures 
Image(url='final_barplot.png')