# Raw fields descriptive stats for 2017 data  

#### Notebook objectives:  
- High-level review of the data, enough to understand which fields could be useful for analyses  
- Create HTML docs with basic descriptive stats and frequency plots for the raw fields, to be used as reference  
  - Note that this dataset is too large to easily use tools like ydata-profiling, so we're creating a much more lightweight version with just the necessary basics.  

#### Steps:  
1. [Load data](#Load-data)  
2. [Descriptive stats of raw fields](#Descriptive-stats-of-raw-fields)  
   - [object fields](#Object-fields)  
   - [non-object fields](#Non-object-fields)  
   
   

In [1]:
import pickle
import pandas as pd
import chime

from IPython.display import display, HTML
import ipywidgets as widgets
import plotly.graph_objects as go
import plotly

In [2]:
# Select the 'zelda' theme for chime; chime.success() is called at the end
# of stats_html() once a report has been written.
chime.theme('zelda')

In [3]:
# Directory the pickled input is read from, and directory the HTML reports
# are written to. Both are joined to file names with '+', so the trailing
# slash is required.
DATA_PATH = 'data/'
OUTPUT_PATH = 'output/'

## Load data

Load the 2017 data.

In [4]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this file must come from a trusted source.
with open(DATA_PATH + 'reddit_askdocs_submissions_2017.pkl', 'rb') as f:
    d_2017 = pickle.load(f)

In [5]:
# Number of items in the loaded object.
len(d_2017)

62438

In [6]:
# Build the working DataFrame from the loaded object.
df = pd.DataFrame(d_2017)

In [7]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62438 entries, 0 to 62437
Data columns (total 56 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   author                   62438 non-null  object 
 1   author_flair_css_class   43394 non-null  object 
 2   author_flair_text        43394 non-null  object 
 3   brand_safe               62438 non-null  bool   
 4   can_mod_post             35267 non-null  object 
 5   contest_mode             62438 non-null  bool   
 6   created_utc              62438 non-null  int64  
 7   domain                   62438 non-null  object 
 8   full_link                62438 non-null  object 
 9   id                       62438 non-null  object 
 10  is_crosspostable         20127 non-null  object 
 11  is_reddit_media_domain   14224 non-null  object 
 12  is_self                  62438 non-null  bool   
 13  is_video                 40593 non-null  object 
 14  locked                

In [8]:
# Preview the first rows.
df.head()

Unnamed: 0,author,author_flair_css_class,author_flair_text,brand_safe,can_mod_post,contest_mode,created_utc,domain,full_link,id,...,approved_at_utc,banned_at_utc,view_count,gilded,media_embed,secure_media_embed,author_created_utc,author_fullname,media,secure_media
0,[deleted],,,True,False,False,1514764452,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbw...,7nbwtn,...,,,,,,,,,,
1,XenonCSGO,default,This user has not yet been verified.,True,False,False,1514764122,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbv...,7nbvsv,...,,,,,,,,,,
2,[deleted],,,True,False,False,1514764055,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbv...,7nbvln,...,,,,,,,,,,
3,DavisTheMagicSheep,default,This user has not yet been verified.,True,False,False,1514763799,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbu...,7nburb,...,,,,,,,,,,
4,Dontgetscooped,default,This user has not yet been verified.,True,False,False,1514763188,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbs...,7nbsw2,...,,,,,,,,,,


## Descriptive stats of raw fields  

In [9]:
# List every column name.
df.columns.to_list()

['author',
 'author_flair_css_class',
 'author_flair_text',
 'brand_safe',
 'can_mod_post',
 'contest_mode',
 'created_utc',
 'domain',
 'full_link',
 'id',
 'is_crosspostable',
 'is_reddit_media_domain',
 'is_self',
 'is_video',
 'locked',
 'num_comments',
 'num_crossposts',
 'over_18',
 'parent_whitelist_status',
 'permalink',
 'pinned',
 'retrieved_on',
 'score',
 'selftext',
 'spoiler',
 'stickied',
 'subreddit',
 'subreddit_id',
 'subreddit_type',
 'thumbnail',
 'title',
 'url',
 'whitelist_status',
 'created',
 'post_hint',
 'preview',
 'banned_by',
 'edited',
 'crosspost_parent',
 'crosspost_parent_list',
 'thumbnail_height',
 'thumbnail_width',
 'author_cakeday',
 'distinguished',
 'removal_reason',
 'suggested_sort',
 'approved_at_utc',
 'banned_at_utc',
 'view_count',
 'gilded',
 'media_embed',
 'secure_media_embed',
 'author_created_utc',
 'author_fullname',
 'media',
 'secure_media']

In [10]:
# Count the raw fields (columns) in the dataset.
len(df.columns)

56

There are 56 raw fields in our data.  

Let's output some descriptive statistics to HTML, to use as reference.

In [11]:
def obj_freq_plot(df, field, n=20, abbr_charnum=40, color=''):
    '''Makes a plotly horizontal bar chart plot for the selected object type field,
    truncated to n top values.

    Parameters:
        df: DataFrame holding the field.
        field: name of the column to plot. Its values must be hashable,
            because the frequencies come from Series.value_counts.
        n: number of most frequent values to keep.
        abbr_charnum: y-axis labels are the values cut to this many
            characters; the full value stays available in the hover text.
        color: optional marker color applied to all bars; empty keeps the
            plotly default.

    Returns:
        plotly.graph_objects.Figure with one bar per kept value. Missing
        values are counted too and shown as '[None]'.
    '''
    freq_df = (
        df[field]
        .value_counts(dropna=False, normalize=True)
        .head(n)
        .reset_index()
    )
    freq_df.columns = ['value', 'pct']
    # value_counts(normalize=True) returns fractions in [0, 1]; the x-axis
    # title and the hover text label the number as a percentage, so scale it.
    freq_df['pct'] = freq_df['pct'] * 100
    # Assign the result back: fillna(inplace=True) on a column selection is
    # deprecated in pandas and has no effect once Copy-on-Write is enabled.
    freq_df['value'] = freq_df['value'].fillna('[None]')
    freq_df['value'] = freq_df['value'].astype(str) # to prevent plotly bugs
    freq_df['value_abbr'] = freq_df['value'].str[:abbr_charnum]

    # Values that share an abbreviated label with an earlier row go into a
    # separate trace, so that barmode='group' draws them as separate bars on
    # the shared y category.
    is_dup = freq_df.duplicated('value_abbr')
    parts = []
    if is_dup.any():
        parts.append(freq_df[is_dup])
    parts.append(freq_df[~is_dup])

    # plotly
    hovertemplate = (
        "Value: %{hovertext}<br>"
        "Pct: %{x:,.2f}%<br>"
        "<extra></extra>"
    )
    traces = [
        go.Bar(
            x=part['pct'],
            y=part['value_abbr'],
            orientation='h',
            hovertext=part['value'],
            hovertemplate=hovertemplate,
        )
        for part in parts
    ]

    fig = go.Figure(traces)

    if color:
        fig.update_traces(marker_color=color)

    fig.update_yaxes(categoryorder='total ascending')
    fig.update_xaxes(gridcolor='#eee', title='% frequency')
    fig.update_layout(
        title=f'<b>{field}</b> frequency distribution (top {n})',
        plot_bgcolor='#fff',
        barmode='group',
        showlegend=False,
        # a little more room once there are 10 or more bars
        height=500 if len(freq_df) >= 10 else 400,
    )

    return fig

In [12]:
def num_freq_plot(df, field, color=''):
    '''Builds a plotly vertical bar chart of value frequencies for a num type field.

    Each distinct non-null value of df[field] becomes one bar (x = value,
    y = number of rows holding it), ordered by value. If color is a
    non-empty string it is used as the marker color. Returns the Figure.
    '''
    counts_by_value = df[field].value_counts().sort_index()
    freq_table = counts_by_value.reset_index()
    freq_table.columns = ['value', 'count']

    hover_parts = (
        "Value: %{x:,}<br>",
        "Frequency: %{y:,}<br>",
        "<extra></extra>",
    )
    bars = go.Bar(
        x=freq_table['value'],
        y=freq_table['count'],
        hovertext=freq_table['value'],
        hovertemplate=''.join(hover_parts),
    )
    fig = go.Figure([bars])

    if len(color) > 0:
        fig.update_traces(marker_color=color)

    # taller canvas once there are 10 or more distinct values
    chart_height = 500 if len(freq_table) >= 10 else 400

    fig.update_yaxes(gridcolor='#eee', title='frequency')
    fig.update_layout(
        title=f'<b>{field}</b> frequency distribution',
        plot_bgcolor='#fff',
        showlegend=False,
        height=chart_height,
    )

    return fig

In [13]:
def univar_stats_HTML(df, field, nobs=20):
    '''Returns an HTML string with univariate stats and top nobs value counts 
    for the field, along with the full value counts dataframe.'''
    field_index = df.columns.to_list().index(field)
    field_num = field_index + 1
    
    field_dtype = df[field].dtypes
    
    # card 1 content: univar stats
    try:
        univar_stats_disp = pd.DataFrame(df[field].describe()).to_html()
    except Exception as exception:
        univar_stats_disp = f'''<p>Exception: {type(exception).__name__}
        <br>Exception message: {exception}</p>'''
                                            
    tmin = pd.to_datetime(df[(df[field].notna())]['created_utc'].min(), unit='s')
    tmax = pd.to_datetime(df[(df[field].notna())]['created_utc'].max(), unit='s')
    tmin_disp = f'<p>First used dt: <br>{tmin}</p>'
    tmax_disp = f'<p>Last used dt: <br>{tmax}</p>'
    

    # card 2 content: value counts table
    try:
        vc1 = df[field].value_counts(dropna=False)
        vc1.name = 'count'
        vc2 = df[field].value_counts(dropna=False, normalize=True)
        vc2.name = 'pct'
        vc = pd.concat([vc1, vc2], axis=1)
        vc_disp = vc.head(nobs).reset_index()\
            .to_html(index=False)
    except Exception as exception:
        vc_disp = f'''<p>Exception: {type(exception).__name__}
        <br>Exception message: {exception}</p>'''

        
    card_header_style = ('background-color:rgba(30, 144, 255, 1);color:white;'
                         + 'padding:1em;border-radius:8px 8px 0 0;')
    card_holder_style = ('background-color:rgba(30, 144, 255, 0.1);padding:1em;'
                         + 'border-radius:0 0 8px 8px;text-align: center;')
    card_style = (
        'display:inline-block;'
        + 'vertical-align:top;'
        + 'background-color:#fff;'
        + 'border-radius:8px;'
        + 'padding:2em;'
        + 'margin: 0 1em 1em 0;'
    )
    
    html_str = f'''
    <div style="{card_header_style}">
        <h2 id="{field}">{field}</h2>
        <h4>Field number in dataset: {field_num}</h4>
        <h4>Field data type: {field_dtype}</h4>
    </div>
    <div style="{card_holder_style}">
        <div style="{card_style}">
            <h4>Univariate stats</h4>
            {univar_stats_disp}
            {tmin_disp}
            {tmax_disp}
        </div>
        <div style="{card_style}">
            <h4>Value counts</h4>
            {vc_disp}
        </div>
    </div>'''
    
    return html_str, vc

In [14]:
def stats_html(fname, df, cols, report_title, nobs, plot_color):
    '''Write to HTML file the stats for the specified list of column names,
    with value counts table truncated to nobs.

    For every column in cols the report gets one "stats-block" holding the
    output of univar_stats_HTML, followed by a plotly frequency chart
    (obj_freq_plot for object and bool columns, num_freq_plot otherwise) or
    a short note saying why the chart was skipped. Progress is shown in the
    notebook through display(HTML(...)); when the file is complete a message
    is printed and chime.success() is called.

    Parameters:
        fname: path of the HTML file to create (overwritten if it exists).
        df: DataFrame holding the columns.
        cols: list of column names to report on, in output order.
        report_title: text used for the page title and the heading.
        nobs: rows shown in each value counts table; also the number of top
            values plotted for object and bool columns.
        plot_color: marker color passed to the plotting helpers.

    Returns:
        None. The result is the file written to fname.
    '''
    from html import escape  # local import keeps this cell self-contained

    style_tag = '''
    <style>
        body {font-family: Arial;}
        table {
            border: 0.5px solid lightgrey;
            border-collapse: collapse;
            text-align: right;
            border-color: #fff;
        }
        th {
            padding: 0.2em 1em;
        }
        td {
            padding: 0.2em 1em;
        }
        h1 {
            color:rgba(30, 144, 255, 1);
        }
        .toc {
            margin-bottom: 2em;
        }
        .stats-block {
            margin-bottom: 2em;
            border: 2px solid #1e90ff82;
            border-radius: 8px;
            padding: 4px;
        }
    </style>
    '''
    # The charset declaration matches the encoding the file is written with.
    html_start_str = f'''
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{report_title}</title>
        {style_tag}
    </head>
    <body>
    '''
    toc_str = (
        f'<div class="toc"><h1>{report_title}</h1><ul>'
        + ' '.join([f'<li><a href="#{x}">{x}</a></li>' for x in cols])
        + '</ul></div>'
    )

    # plotly.js is referenced from the CDN once, by the first chart written
    plotly_cdn_added = False

    # Explicit encoding: without it the platform default is used, which can
    # differ between machines and may fail on non-ASCII text.
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(html_start_str)
        f.write(toc_str)

        display(HTML(f'<p>Writing stats for {len(cols)} fields.</p>'))

        for i, c in enumerate(cols):
            display(HTML(f'<p>{i}: {c}</p>'))
            univar_stats_html_str, vc = univar_stats_HTML(df, c, nobs=nobs)

            f.write('<div class="stats-block">')
            f.write(univar_stats_html_str)

            # freq plot using plotly: exactly one of the two ends up set
            plotly_skip_plot = None
            plotly_fig = None
            col_dtype = df[c].dtypes

            if len(vc) <= 2:
                # too few distinct values for a chart to add anything
                plotly_skip_plot = f'''
                    <p>Skipping plotly chart for {c}, nunique = {len(vc)}.</p>'''
                display(HTML(plotly_skip_plot))
            elif col_dtype == 'O' or col_dtype == 'bool':
                # Hashability check (relevant for object dtype): a failure
                # here is reported in the file instead of stopping the run.
                try:
                    df[c].nunique(dropna=False)
                except Exception as exception:
                    plotly_skip_plot = f'''
                        <h4>Plotly data prep error for {c}:</h4>
                        <p>Exception: {type(exception).__name__}
                        <br>Exception message: {escape(str(exception))}</p>'''
                else:
                    plotly_fig = obj_freq_plot(
                        df, c, n=nobs, abbr_charnum=40, color=plot_color)
            else:
                plotly_fig = num_freq_plot(df, c, color=plot_color)

            if plotly_fig is not None:
                plotly_html = plotly.io.to_html(
                    plotly_fig,
                    full_html=False,
                    include_plotlyjs=False if plotly_cdn_added else 'cdn',
                )
                plotly_cdn_added = True
                f.write(plotly_html)
            else:
                f.write(plotly_skip_plot)
            f.write('</div>')

        f.write('</body>\n</html>')

    print('=== DONE! ====')
    chime.success()

### Object fields  

In [15]:
# Names of all columns stored with the object dtype.
object_columns = df.select_dtypes(include='O').columns
object_fields = object_columns.tolist()

Number of object fields in our dataset:

In [16]:
# How many of the raw fields have the object dtype.
len(object_fields)

34

In [17]:
# Passed to stats_html(): rows shown in each value counts table, and the
# number of top values plotted for object and bool fields.
nobs = 20 # number of top value counts records to display

In [18]:
# Write the descriptive-stats HTML report for the raw object fields.
stats_html(
    fname=OUTPUT_PATH + 'raw_descr_stats_2017_obj.html',
    df=df,
    cols=object_fields,
    report_title='Descriptive stats for raw object fields, 2017',
    nobs=nobs,
    plot_color='#47d65f',
)

=== DONE! ====


### Non-object fields

Number of non-object fields in our dataset:

In [19]:
# Every column that is not in object_fields, kept in DataFrame column order.
non_object_fields = [col for col in df.columns if col not in object_fields]
len(non_object_fields)

22

In [20]:
# Write the descriptive-stats HTML report for the raw non-object fields.
stats_html(
    fname=OUTPUT_PATH + 'raw_descr_stats_2017_non_obj.html',
    df=df,
    cols=non_object_fields,
    report_title='Descriptive stats for raw non-object fields, 2017',
    nobs=nobs,
    plot_color='#47d65f',
)

=== DONE! ====
