In [43]:
# This cell emits a small script that hides the notebook's code cells and a
# link that toggles them back on.
# HTML is imported here because this cell sits above the notebook's main
# import cell; without the import a fresh "Restart & Run All" stops with a
# NameError on HTML.
from IPython.display import HTML

# The HTML(...) object is the last expression of the cell, so it is displayed.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [150]:
# import 
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import HTML
from scipy import stats
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import datetime as dt
# Load the scraped episode data into `df` (the file name suggests it was already
# cleaned once upstream — TODO confirm).
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where that exact file exists; consider a path relative to a data directory.
df = pd.read_csv('c:/users/nanot/webscraping/scrapy/jre/data_clean/jre2_clean.csv')

In [151]:
# A few cleaning steps on the freshly loaded frame.

# Shorten the tag 'athletes-fighters-martial-arts'.  The text is a plain literal,
# so regex matching is switched off explicitly.
df['tag'] = df.tag.str.replace('athletes-fighters-martial-arts', 'athletes-fighters', regex=False)

# Strip the leading episode number (e.g. '#1234- ') from the title.
# regex=True must be explicit: newer pandas versions treat the pattern as a
# literal string by default, which would silently leave every title unchanged.
# The raw string carries the same pattern text without the invalid '\d' escape.
df['ep_title'] = df.ep_title.str.replace(r'\#\d{1,4}- ', '', regex=True)

# Drop 'Best of' episodes, episodes of 55 minutes or less, and Fight Companion
# episodes.  na=False makes a missing title count as "does not contain" instead of
# producing a NaN that the ~ operator cannot invert.
mask1 = ~df.ep_title.str.contains('Best of', na=False)
mask2 = df.runtime > 55
mask3 = ~df.ep_title.str.contains('Fight Companion', na=False)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered data (SettingWithCopyWarning).
df = df[mask1 & mask2 & mask3].copy()

# Parse the air date with pd.to_datetime() and use it as the index.
df['airdate'] = pd.to_datetime(df.airdate)
df = df.set_index('airdate')

# Keep two frames: `dft` retains the tag column, while `df` drops it and then
# removes the duplicate rows that remain once the tag is gone.
dft = df.copy()
df = df.drop('tag', axis=1)
df = df.drop_duplicates()



In [165]:
# Outlier masks: an entry is flagged when its absolute z-score exceeds `s`.
s = 3


def _beyond_threshold(values):
    """Return a boolean mask marking the entries of ``values`` with |z-score| > s."""
    return np.abs(stats.zscore(values)) > s


# Per-column masks for the tag-less frame `df`.
mask_outliers_views = _beyond_threshold(df.views)
mask_outliers_runtime = _beyond_threshold(df.runtime)
mask_outliers_likes = _beyond_threshold(df.likes)
mask_outliers_dislikes = _beyond_threshold(df.dislikes)
mask_outliers_ratio = _beyond_threshold(df.ratio)

# The same per-column masks for the frame that keeps the tag column, `dft`.
tmask_outliers_views = _beyond_threshold(dft.views)
tmask_outliers_runtime = _beyond_threshold(dft.runtime)
tmask_outliers_likes = _beyond_threshold(dft.likes)
tmask_outliers_dislikes = _beyond_threshold(dft.dislikes)
tmask_outliers_ratio = _beyond_threshold(dft.ratio)

# Outlier-removal masks: True where a row is NOT an outlier in views, likes,
# dislikes or ratio.  The runtime masks are not part of these combinations.
mask_or = ~(
    mask_outliers_views
    | mask_outliers_likes
    | mask_outliers_dislikes
    | mask_outliers_ratio
)
tmask_or = ~(
    tmask_outliers_views
    | tmask_outliers_likes
    | tmask_outliers_dislikes
    | tmask_outliers_ratio
)

# Tag-less frame with the flagged rows removed.
df_or = df[mask_or]

In [281]:
# functions

# Lookup from a slider position to the date string used as a slice bound.
def slide_switch(x):
    """Translate slider position ``x`` into a date string.

    Positions 1 through 7 give the year strings '2013' through '2019';
    position 8 gives the fixed date '2019-07-04'.  Any other position
    raises ``KeyError``.
    """
    bounds = {position: str(2012 + position) for position in range(1, 8)}
    bounds[8] = '2019-07-04'
    return bounds[x]

# Tags that have enough episodes inside a date window of the tagged frame `dft`.
def tags_with_thirty(yr_lower, yr_upper, min_count=30):
    """Return the tags with at least ``min_count`` episodes in a date window.

    Parameters
    ----------
    yr_lower, yr_upper : str
        Bounds of the label slice taken from the module-level frame ``dft``.
    min_count : int, default 30
        Minimum number of non-null ``ep_title`` entries a tag needs in order
        to be returned.

    Returns
    -------
    list
        The qualifying tags, in the order produced by the groupby.
    """
    # Count once and reuse the result; the original ran the same groupby twice.
    episodes_per_tag = dft.loc[yr_lower:yr_upper].groupby('tag').ep_title.count()
    return episodes_per_tag[episodes_per_tag >= min_count].index.tolist()


Analysis of The Joe Rogan Experience Podcast

Intro:
The Joe Rogan Experience is a popular and unusually structured podcast in which guests sit and casually converse with its host, Joe Rogan.  Because the show takes on a different flavor depending on the guest, I thought it would be interesting to see whether there are any obvious statistical differences among the many categories that may describe each episode.  But first, I wanted to take a general look at the data collected.

Dataset:
The dataset was scraped from JREpodcast.com--a third-party fan-made website with video links to every episode.  Nearly all entries included data such as title (which includes the guest), duration, number of views, number of likes, number of dislikes, and ratio.  This information wasn't too hard to scrape, but what I really wanted was the category, which proved to be more challenging to obtain.  When the scrape was complete, I had two datasets to clean: one with categories and one without.

Results

In [153]:
df.sample(10)

Unnamed: 0_level_0,ep,ep_title,runtime,views,likes,dislikes,ratio
airdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-06-06,67.0,Kevin Lee,126.95,953635,9756,1024,9.53
2018-02-12,1077.0,Johann Hari,152.42,1029507,14081,1611,8.74
2015-01-08,596.0,Brian Stann,175.67,350196,3424,152,22.53
2017-07-26,988.0,Nick Swardson,156.2,1926812,15580,1626,9.58
2015-07-29,675.0,Kirik Jenness & Chris Palmquist,167.5,162572,1221,86,14.2
2015-05-05,643.0,“Big” Jay Oakerson,178.67,277785,2018,140,14.41
2019-04-30,1286.0,Anthony Jeselnik,131.37,1818364,17576,1709,10.28
2019-02-20,1249.0,Donnell Rawlings,164.1,1249225,17408,1143,15.23
2019-05-14,1295.0,Tulsi Gabbard,154.8,2085641,48972,5773,8.48
2013-01-21,115.0,"Maz Jobrani, Brian Redban",130.3,10364,137,6,22.83


In [154]:
dft.sample(10)

Unnamed: 0_level_0,ep,ep_title,tag,runtime,views,likes,dislikes,ratio
airdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-05-23,28.0,Georges St-Pierre,athletes-fighters,182.37,3000670,48448,1553,31.2
2018-02-09,1075.0,Duncan Trussell,comedians,208.3,2831470,27584,2278,12.11
2014-09-10,547.0,Joe DeRosa,writers,170.72,78523,637,567,1.12
2016-05-17,800.0,Bobcat Goldthwait,actors,147.92,737560,5939,444,13.38
2019-04-06,1278.0,Kevin Hart,comedians,124.87,6471237,133235,4631,28.77
2013-06-01,255.0,Duncan Trussell,comedians,180.02,92135,922,47,19.62
2016-03-08,770.0,Michael Shermer,writers,179.4,342352,2798,377,7.42
2015-08-07,679.0,Barry Crimmins & Bobcat Goldthwait,actors,92.37,144148,1462,76,19.24
2013-12-04,424.0,Brody Stevens,actors,161.5,151461,1361,114,11.94
2017-11-30,1046.0,Owen Smith,actors,118.02,545842,6098,453,13.46


In [166]:
# Time series of views, likes and dislikes, one panel per column.
# TODO: formatting could still be improved (more tick marks, perhaps).
cols = ['views', 'likes', 'dislikes']
ylabels = ['Views (10 Mil)', 'Likes', 'Dislikes']
@interact
def ts(DropOutliers = False):
    """Draw one time-series panel for each column listed in ``cols``.

    DropOutliers: when True the outlier-free frame ``df_or`` is plotted,
    otherwise the full frame ``df``.
    """
    source = df_or if DropOutliers else df
    # A few spikes stand out.  Who are the guests receiving so many views?
    panels = source[cols].plot(
        marker='.',
        title='Number of Views Since the Beginning of the Podcast',
        figsize=(15, 10),
        subplots=True,
        legend=False,
        grid=True,
    )
    plt.subplots_adjust(top=.95, left=.25)
    # Pair each panel with its y-axis label; every panel gets the same x label.
    for panel, label in zip(panels, ylabels):
        panel.set_ylabel(label)
        panel.set_xlabel('Air Date')
    

interactive(children=(Checkbox(value=False, description='DropOutliers'), Output()), _dom_classes=('widget-inte…

In [156]:
# Histogram of a single column of the tagged frame.
col_list = ['views', 'likes', 'dislikes', 'ratio', 'runtime']
@interact
def histogram_by_col(DropOutliers = False, LastTwoYears = False, Columns=col_list):
    """Plot a histogram of one column of ``dft``.

    DropOutliers: when True, rows rejected by ``tmask_or`` are left out.
    LastTwoYears: when True the window starts at '2017-06' instead of
        '2013-01-15'; it always ends at '2019-07-04'.
    Columns: name of the column to plot; ``col_list`` supplies the choices.
    """
    start_date = '2017-06' if LastTwoYears else '2013-01-15'
    # Filter with tmask_or directly instead of reading dft_or: dft_or is only
    # assigned in a later cell, so this cell failed on a fresh top-to-bottom run.
    dfthist = dft[tmask_or] if DropOutliers else dft
    fs = 15
    axes = dfthist[start_date:'2019-07-04'][Columns].hist(color='c', figsize=(8, 8))
    axes.set_ylabel('Count', fontsize=fs)
    axes.set_xlabel(Columns, fontsize=fs)
    

interactive(children=(Checkbox(value=False, description='DropOutliers'), Checkbox(value=False, description='La…

In [201]:
@interact
def ep_views(x=(1,7)):
    """Show two side-by-side bar charts for the selected slider position.

    The left chart shows the ``n`` tags with the most episodes and the right
    chart the ``n`` tags with the highest mean views, both computed on the
    slice of ``dft`` between ``slide_switch(x)`` and ``slide_switch(x+1)``.

    NOTE(review): label slices include their end bound, so this window appears
    to reach through the following year as well, while the titles only name
    the lower bound -- confirm that is intended.
    """
    yr_lower = slide_switch(x)
    yr_upper = slide_switch(x+1)
    # `fs` was never assigned in this function or at module level, so the cell
    # raised NameError on a fresh kernel.  15 matches the base font size used by
    # the neighbouring plots -- adjust if a different size was intended.
    fs = 15
    n = 10

    fig, (ax, ax2) = plt.subplots(ncols=2, figsize=(20,10))

    window = dft[yr_lower:yr_upper].groupby('tag')
    ep_count = window.ep_title.count().nlargest(n)
    avg_views = window.views.mean().nlargest(n)

    ax.set_title('{} Top {} Number of Episodes by Category'.format(yr_lower,n), fontsize=fs+2)
    ax.set_xlabel('Number of Episodes', fontsize = fs)

    ax2.set_title('{} Top {} Average Number of Views by Category'.format(yr_lower,n), fontsize=fs+2)
    # Fix the x-axis exponent at 1e6 so the tick values read as millions.
    ax2.ticklabel_format(axis='x', style='sci', scilimits=(6,6))
    ax2.set_xlabel('Number of Views in Millions', fontsize=fs)

    # Sorted ascending so the largest bar ends up at the top of each chart.
    # The former x='LABEL' argument was dropped: it has no effect on a Series plot.
    ep_count.sort_values(ascending=True).plot(kind='barh', legend=False, ax=ax, fontsize = fs+5, color='b')
    avg_views.sort_values(ascending=True).plot(kind='barh', ax=ax2, fontsize = fs+5, color='c')
    plt.tight_layout()
    plt.show()

interactive(children=(IntSlider(value=4, description='x', max=7, min=1), Output()), _dom_classes=('widget-inte…

In [222]:
# Episode count per category for the selected slider position.
@interact
def ep_count_by_cat(x=(1,7)):
    """Draw a horizontal bar chart of the number of episodes per tag.

    The counts come from the slice of ``dft`` between ``slide_switch(x)``
    and ``slide_switch(x+1)``.
    """
    fs = 15
    yr_lower, yr_upper = slide_switch(x), slide_switch(x+1)
    episodes_per_tag = dft[yr_lower:yr_upper].groupby('tag').ep_title.count()
    # Ascending order puts the largest category at the top of the chart.
    axes = episodes_per_tag.sort_values(ascending=True).plot.barh(
        color='c',
        fontsize=fs,
        figsize=(10, 10),
    )
    axes.set_ylabel('Category', fontsize=fs)
    axes.set_xlabel('Episode Count', fontsize=fs)
    axes.set_title('{} Number of Episodes by Category'.format(yr_lower), fontsize=fs)

interactive(children=(IntSlider(value=4, description='x', max=7, min=1), Output()), _dom_classes=('widget-inte…

In [176]:
# Bar plot of the mean number of views per category.

@interact
def barplot_views_by_cat(DropOutliers = False, x=(1,7)):
    """Draw a horizontal bar chart of the mean views per tag.

    DropOutliers: when True, rows rejected by ``tmask_or`` are left out.
    x: slider position; the data is the slice of the chosen frame between
        ``slide_switch(x)`` and ``slide_switch(x+1)``.
    """
    yr_lower = slide_switch(x)
    yr_upper = slide_switch(x+1)

    # Filter with tmask_or directly instead of reading dft_or: dft_or is only
    # assigned in a later cell, so this cell failed on a fresh top-to-bottom run.
    dftbar = dft[tmask_or] if DropOutliers else dft
    fs = 15
    mean_views = dftbar[yr_lower:yr_upper].groupby('tag').views.mean()
    axes = mean_views.sort_values(ascending=True).plot.barh(color = 'c', fontsize = fs, figsize=(10,10))
    # The bars are tags, not guests, so the axis is labelled 'Category'.
    axes.set_ylabel('Category', fontsize = fs)
    axes.set_xlabel('Views', fontsize = fs)
    axes.set_title('{} Views by Category'.format(yr_lower), fontsize=fs)
    

interactive(children=(Checkbox(value=False, description='DropOutliers'), IntSlider(value=4, description='x', m…

In [275]:



# Box plots of one chosen column, with one box per tag on the same axes.
# Tagged frame with the rows rejected by tmask_or removed.
dft_or = dft[tmask_or]
col_list = ['views', 'likes', 'dislikes', 'ratio', 'runtime']


# Not referenced by the function below; kept because it is a module-level name.
tag = ['politics', 'athletes-fighters', 'authors', 'comedians', 'filmmakers', 'health', 'journalists', 'miscellaneous', 'musicians', 'politics', 'writers']

@interact
def box_plots_by_col(column=col_list, WithOutliers=False, ShowFliers = False, LastTwoYears = False, FirstTwoYears = False):
    """Draw per-tag box plots of ``column`` over a selectable date window.

    column: name of the column to plot; ``col_list`` supplies the choices.
    WithOutliers: when True use ``dft``, otherwise the filtered ``dft_or``.
    ShowFliers: passed to ``boxplot`` as ``showfliers``.
    LastTwoYears: restrict the window to '2017-06' .. '2019-07-04'.
    FirstTwoYears: restrict the window to '2013-01-15' .. '2015-01'.

    Only tags returned by ``tags_with_thirty`` for the window are shown.
    Returns the axes produced by ``boxplot``.
    """
    # The two options used to be handled by two independent if/else statements,
    # and the second one always overwrote the dates and title set by the first,
    # so FirstTwoYears never had any effect.  A single chain fixes that;
    # LastTwoYears still takes precedence when both boxes are ticked.
    if LastTwoYears:
        start_date = '2017-06'
        end_date = '2019-07-04'
        title_string = 'Distributions Within %s For Last Two Years'%(column.capitalize())
    elif FirstTwoYears:
        start_date = '2013-01-15'
        end_date = '2015-01'
        title_string = 'Distributions Within %s For First Two Years'%(column.capitalize())
    else:
        start_date = '2013-01-15'
        end_date = '2019-07-04'
        title_string = 'Distributions Within %s'%(column.capitalize())

    dftbox = dft if WithOutliers else dft_or
    cat = tags_with_thirty(start_date,end_date)

    fs = 17
    axes = dftbox[dftbox.tag.isin(cat)][start_date:end_date].boxplot(showfliers = ShowFliers, by = 'tag', column=column, figsize=(20,7), fontsize = fs)
    axes.set_ylim(0,)
    axes.set_title(title_string, fontsize = fs)
    axes.set_ylabel('Count', fontsize = fs)
    axes.set_xlabel('Category', fontsize = fs)
    return axes



interactive(children=(Dropdown(description='column', options=('views', 'likes', 'dislikes', 'ratio', 'runtime'…

In [279]:
# Compare per-tag box plots for one slider position at a time.
# Only tags that tags_with_thirty returns for the selected window are shown.
# NOTE(review): this reuses the name box_plots_by_col from an earlier cell and
# therefore rebinds it; both widgets are still created because @interact runs
# when each function is defined.


@interact
def box_plots_by_col(column=col_list, WithOutliers=False, ShowFliers = False, x=(1,7)):
    """Draw per-tag box plots of ``column`` for the selected slider position.

    column: name of the column to plot; ``col_list`` supplies the choices.
    WithOutliers: when True use ``dft``, otherwise the filtered ``dft_or``.
    ShowFliers: passed to ``boxplot`` as ``showfliers``.
    x: slider position; the data is the slice between ``slide_switch(x)``
        and ``slide_switch(x+1)``.

    Returns the axes produced by ``boxplot``.
    """
    yr_lower = slide_switch(x)
    yr_upper = slide_switch(x+1)
    cat = tags_with_thirty(yr_lower, yr_upper)

    dftbox = dft if WithOutliers else dft_or

    fs = 17
    axes = dftbox[dftbox.tag.isin(cat)][yr_lower:yr_upper].boxplot(showfliers = ShowFliers, by = 'tag', column=column, figsize=(20,7), fontsize = fs)
    axes.set_ylim(0,)
    # One str.format call replaces the former mix of .format and % on the same
    # literal, which would break if the formatted value ever contained a '%'.
    axes.set_title('{} Distributions Within {}'.format(yr_lower, column.capitalize()), fontsize = fs)
    axes.set_ylabel('Count', fontsize = fs)
    axes.set_xlabel('Category', fontsize = fs)
    return axes

interactive(children=(Dropdown(description='column', options=('views', 'likes', 'dislikes', 'ratio', 'runtime'…

In [162]:
# Box plots of every column in col_list for one chosen category.
dft_or = dft[tmask_or]
tag = ['politics', 'athletes-fighters', 'authors', 'comedians', 'filmmakers', 'health', 'journalists', 'miscellaneous', 'musicians', 'politics', 'writers']


@interact
def box_plots(category=dft_or.tag.unique().tolist(), WithOutliers=False, ShowFliers = False):
    """Draw one box plot per column of ``col_list`` for a single tag.

    category: the tag to show; the choices are the unique tags of ``dft_or``.
    WithOutliers: when True use ``dft``, otherwise the filtered ``dft_or``.
    ShowFliers: passed to the plot call as ``showfliers``.

    Returns whatever the pandas ``plot`` call returns.
    """
    dftbox = dft if WithOutliers else dft_or
    in_category = dftbox.tag.isin([category])
    return dftbox[in_category].plot(
        showfliers=ShowFliers,
        title='Distributions Within %s'%(category.capitalize()),
        fontsize=15,
        y=col_list,
        kind='box',
        layout=(2, 3),
        subplots=True,
        figsize=(20, 8),
    )

interactive(children=(Dropdown(description='category', options=('writers', 'musicians', 'politics', 'models', …

In [167]:
# Interactive ranking of episodes by a chosen column.
col_list = ['views', 'likes', 'dislikes', 'ratio', 'runtime']
@interact
def horz_bar_plot(column = col_list, n=(1, 50), DropOutliers = False):
    """Draw a horizontal bar chart of the top ``n`` rows by ``column``.

    column: name of the column to rank by; ``col_list`` supplies the choices.
    n: how many of the largest rows to take before grouping by ``ep_title``.
    DropOutliers: when True use ``df_or``, otherwise ``df``.
    """
    dfbar = df_or if DropOutliers else df
    # The font size (and with it the figure height) grows once n exceeds 30.
    fs = n/2 if n > 30 else 12
    top_rows = dfbar.nlargest(n, column)
    # Rows sharing a title collapse to a single bar holding their maximum.
    best_per_title = top_rows.groupby('ep_title')[column].max()
    axes = best_per_title.sort_values(ascending=True).plot.barh(
        fontsize=fs,
        color='b',
        figsize=(5, fs/2),
    )
    axes.set_ylabel('Guest', fontsize=fs)
    axes.set_xlabel(column.capitalize(), fontsize=fs)
    axes.set_title('Top {} Guests by Number of {}'.format(n, column.capitalize()), fontsize=fs)
    

interactive(children=(Dropdown(description='column', options=('views', 'likes', 'dislikes', 'ratio', 'runtime'…

Further work:
-Explore transformation techniques, especially when dealing with the left-skewed plots.
-Consolidate some of the categories.
-Add visualization to illustrate the increase in podcast listeners since the beginning of JRE
-Explore transcripts for NLP
-Explore the sentiment of social media comments for episodes with extreme numbers of likes or dislikes
-Implement statistical tests to compare the variables