# Introduction

This script creates simple interactive plots to visualize how cosmetic brands are represented on social media, specifically on YouTube, by looking at the number of videos related to each brand and the statistics of those videos.

In [1]:
import json
import pandas as pd
import dateutil.parser
import plotly.tools as tls

In [2]:
import Config
# Register the Plotly credentials supplied by the local Config module.
plotly_username = Config.plotly_developer_name()
plotly_api_key = Config.plotly_developer_key()
tls.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

# Video data 

We first read the JSON files containing video and channel attributes into dataframes. For each video, we keep the following information and statistics:
* video_id: a unique string identifying a video
* video_channelId: a unique string identifying the channel of a video
* video_categoryId: a numeric ID (stored as a string, e.g. '26') identifying the category of a video
* video_title: displayed title of a video
* video_time: date and time when a video was published
* video_comment: the number of comment of a video
* video_view: the number of views of a video
* keyword: string used in search bar to retrieve the video
* video_like: the number of likes of a video
* video_dislike: the number of dislikes of a video
* video_year: year when video was published
* video_month: month when video was published

In [3]:
# Read the raw video metadata (a JSON document keyed by brand keyword) from disk.
with open('video_property.txt') as video_file:
    video_data = json.loads(video_file.read())

In [4]:
# Dataframe with video's attributes


def _dig(container, *path):
    """Follow `path` through nested dicts/lists and return the value found.

    Returns None as soon as a key or index is missing, or an intermediate
    value cannot be subscripted, so absent fields become None.
    """
    current = container
    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return None
    return current


def _to_int(value):
    """Convert a count to int; return None when it is missing or malformed."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _to_datetime(value):
    """Parse a timestamp with dateutil; return None when missing or unparseable."""
    try:
        return dateutil.parser.parse(value)
    except (TypeError, ValueError, OverflowError):
        return None


# Column order of the resulting dataframe.
VIDEO_COLUMNS = ['id', 'channel_id', 'category_id', 'keyword', 'title', 'time',
                 'comment', 'view', 'like', 'dislike', 'year', 'month']
video_columns = {name: [] for name in VIDEO_COLUMNS}

for brand, brand_videos in video_data.items():
    for video in brand_videos:
        item = _dig(video, 'items', 0)
        # Parse the timestamp once; time, year and month all derive from it.
        # (The original fallback for a missing month called `video.month.append`,
        # which raised instead of recording None.)
        published = _to_datetime(_dig(item, 'snippet', 'publishedAt'))

        video_columns['keyword'].append(brand)
        video_columns['id'].append(_dig(item, 'id'))
        video_columns['channel_id'].append(_dig(item, 'snippet', 'channelId'))
        video_columns['category_id'].append(_dig(item, 'snippet', 'categoryId'))
        video_columns['title'].append(_dig(item, 'snippet', 'title'))
        video_columns['time'].append(published)
        video_columns['comment'].append(_to_int(_dig(item, 'statistics', 'commentCount')))
        video_columns['view'].append(_to_int(_dig(item, 'statistics', 'viewCount')))
        video_columns['like'].append(_to_int(_dig(item, 'statistics', 'likeCount')))
        video_columns['dislike'].append(_to_int(_dig(item, 'statistics', 'dislikeCount')))
        video_columns['year'].append(published.year if published is not None else None)
        video_columns['month'].append(published.month if published is not None else None)

video_df = pd.DataFrame(video_columns, columns=VIDEO_COLUMNS)
# The same video can be returned for several keywords; keep its first occurrence.
video_df = video_df.drop_duplicates(subset='id')

# Only keeps video in categories 'Howto & Style', 'People & Blogs' or 'Entertainment'
cat_chosen = ['26', '22', '24']
video_df = video_df[video_df['category_id'].isin(cat_chosen)]

# Channel data

We read the channel information into a dataframe. It lets us determine what type of YouTube channel each one is, based on its number of subscribers.


In [5]:
# Read the raw channel metadata (a JSON document keyed by channel id) from disk.
with open('channel_info.txt') as channel_file:
    channel_data = json.loads(channel_file.read())

In [6]:
# DataFrame with channel's attributes


def _channel_field(channel, section, field):
    """Return channel['items'][0][section][field], or None when any part is missing."""
    try:
        return channel['items'][0][section][field]
    except (KeyError, IndexError, TypeError):
        return None


channel_sub = []
channel_country = []
channel_title = []
channelId = []
for channel_id, channel in channel_data.items():
    channelId.append(channel_id)
    # A missing or malformed subscriber count is recorded as None rather than
    # aborting the whole load; only the expected lookup/conversion errors are caught.
    try:
        channel_sub.append(int(_channel_field(channel, 'statistics', 'subscriberCount')))
    except (TypeError, ValueError):
        channel_sub.append(None)
    channel_country.append(_channel_field(channel, 'snippet', 'country'))
    channel_title.append(_channel_field(channel, 'snippet', 'title'))

# NOTE: channel_country is collected but not stored in channel_df.
channel_df = pd.DataFrame({'id': channelId, 'title': channel_title, 'subscription_count': channel_sub})
channel_df = channel_df.drop_duplicates(subset='id')


# Plotting
Next, we create plots showing how many views, likes and dislikes each brand gets on YouTube, month by month.

In [7]:
import ipywidgets as widgets
import plotly.plotly as py
from plotly.graph_objs import *
from plotly.widgets import GraphWidget
from IPython.display import display


The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.


IPython.utils.traitlets has moved to a top-level traitlets package.



<IPython.core.display.Javascript object>

We group view, like, and dislike counts by month and also compute the cumulative values for each brand.

In [8]:
# One list of plotly traces per chart; each brand contributes one trace to each list.
cummulative_view = []
cummulative_like = []
cummulative_dislike = []
ts_view = []
ts_like = []
ts_dislike = []

metrics = ['view', 'like', 'dislike']
series = pd.DataFrame(video_df[metrics + ['keyword']])
series.index = video_df.time
for brand in series.keyword.unique():

    # Time series of view, like, dislike counts by month.
    # Only the numeric columns are summed, and months without data are filled
    # with 0 via fillna (replacing the string 'NaN' never matched missing values).
    monthly = series.loc[series.keyword == brand, metrics].resample('M').sum().fillna(0)
    # `Scatter` comes from the notebook's `from plotly.graph_objs import *`;
    # the bare name `graph_objs` is not bound by any import shown here.
    ts_view.append(Scatter(x=monthly.index, y=monthly['view'], name=brand))
    ts_like.append(Scatter(x=monthly.index, y=monthly['like'], name=brand))
    ts_dislike.append(Scatter(x=monthly.index, y=monthly['dislike'], name=brand))

    # Cumulative time series of view, like, dislike counts by month
    running = monthly.cumsum()
    cummulative_view.append(Scatter(x=running.index, y=running['view'], name=brand))
    cummulative_like.append(Scatter(x=running.index, y=running['like'], name=brand))
    cummulative_dislike.append(Scatter(x=running.index, y=running['dislike'], name=brand))


#### Cumulative view counts

The graph shows cumulative view counts for each brand. View counts are grouped by the month and year in which each video was published. Note that we only have each video's total view count to date, so all of a video's views are attributed to its publication month rather than to when the views actually occurred.

In [43]:
# Cumulative view count per brand, one trace per brand.
cumulative_view_layout = Layout(title='Cumulative view per month')
figure = Figure(data=Data(cummulative_view), layout=cumulative_view_layout)
py.iplot(figure, filename='cumlative-view')

#### Cumulative like counts

Similarly, the next graph shows cumulative like counts for each brand.

In [44]:
# Cumulative like count per brand, one trace per brand.
cumulative_like_layout = Layout(title='Cumulative like per month')
figure = Figure(data=Data(cummulative_like), layout=cumulative_like_layout)
py.iplot(figure, filename='cumulative-like')

#### Cumulative dislike counts

And this graph shows cumulative dislike counts.

In [22]:
# Cumulative dislike count per brand, one trace per brand.
cumulative_dislike_layout = Layout(title='Cumulative dislike per month')
figure = Figure(data=Data(cummulative_dislike), layout=cumulative_dislike_layout)
py.iplot(figure, filename='cumulative-dislike')

CoverGirl and Real Techniques are the two brands that seem to have the highest view counts of all.

Real Techniques is an interesting case. In case you didn't know, it is the brand created by Samantha and Nicola Chapman, the creators of the Pixiwoo channel and part of the first generation of beauty gurus, with over two million subscribers to date. I knew they were popular but didn't anticipate this many views.

Besides, Colourpop and Morphe are two new names in the industry, and they seem to be quickly catching up with well-established brands such as Urban Decay and Maybelline.

CoverGirl has the highest cumulative view count, but its like-count curve is less steep than the others.

In [46]:
# Monthly (non-cumulative) view count per brand.
monthly_view_layout = Layout(title='Views per month')
figure = Figure(data=Data(ts_view), layout=monthly_view_layout)
py.iplot(figure, filename='ts-view')

In [15]:
# Monthly (non-cumulative) like count per brand.
monthly_like_layout = Layout(title='Likes per month')
figure = Figure(data=Data(ts_like), layout=monthly_like_layout)
py.iplot(figure, filename='ts-like')

In [16]:
# Monthly (non-cumulative) dislike count per brand.
monthly_dislike_layout = Layout(title='Dislikes per month')
figure = Figure(data=Data(ts_dislike), layout=monthly_dislike_layout)
py.iplot(figure, filename='ts-dislike')

#### Pie Plot: Small Youtuber vs. Influencer
Next, we look at how video counts and view counts are driven by different types of Youtubers. YouTube gives awards when a Youtuber reaches 100k, 1 million and 10 million subscribers. Therefore, we have four types of Youtuber.

* Small Youtuber: Youtubers with fewer than 100,000 subscribers
* Silver play: Youtubers with at least 100,000 but fewer than 1 million subscribers
* Gold play: Youtubers with at least 1 million but fewer than 10 million subscribers
* Diamond play: Youtubers with 10 million subscribers or more

We will look at the cases of Real Technique and Maybelline. There are two pie plots for each brand. The first shows the proportion of videos made by each Youtuber type. The second shows the proportion of views generated by each Youtuber type.

In [9]:
# set up figure
def _pie_trace(name, x_domain):
    """Build one pie trace (without values) occupying `x_domain` of the figure width."""
    return {
        'labels': ['Small Youtuber', 'Silver Play', 'Gold Play', 'Diamond Play'],
        'domain': {'x': x_domain},
        'type': 'pie',
        'direction': 'clockwise',
        'sort': False,
        'name': name,
    }


# Left pie: share of videos per Youtuber tier; right pie: share of views.
fig = {
    'data': [
        _pie_trace('Video count', [0, .45]),
        _pie_trace('View count', [.55, 1]),
    ]
}

In [13]:
# function plot
# input brand name
def pie_plot(key, filename):
    """Plot two pies for one brand: video count and view count per Youtuber tier.

    Parameters
    ----------
    key : str
        Brand keyword, matched against ``video_df.keyword``.
    filename : str
        Name passed through to ``py.iplot``.

    Returns
    -------
    Whatever ``display`` returns for the plot object.

    Notes
    -----
    Tiers are split at 100,000, 1,000,000 and 10,000,000 subscribers.
    Videos whose channel has no known subscriber count are skipped, and a
    missing view count contributes zero views, so incomplete data neither
    raises nor turns a total into NaN. The module-level ``fig`` template is
    copied rather than modified.
    """
    import copy

    tier_bounds = (100000, 1000000, 10000000)
    video_counts = [0.0] * (len(tier_bounds) + 1)
    view_totals = [0.0] * (len(tier_bounds) + 1)

    # One lookup table instead of re-filtering both dataframes for every video.
    subscribers_by_channel = channel_df.set_index('id')['subscription_count']
    brand_videos = video_df[video_df.keyword == key]
    subscriber_counts = brand_videos['channel_id'].map(subscribers_by_channel)

    for subscribers, views in zip(subscriber_counts, brand_videos['view']):
        if pd.isnull(subscribers):
            continue  # unknown channel or unknown subscriber count
        # Number of thresholds reached == tier index (0 small ... 3 diamond).
        tier = int(sum(subscribers >= bound for bound in tier_bounds))
        video_counts[tier] += 1
        if not pd.isnull(views):
            view_totals[tier] += float(views)

    figure = copy.deepcopy(fig)
    figure['data'][0]['values'] = video_counts
    figure['data'][1]['values'] = view_totals
    figure['layout'] = {'title': 'Pie plot for %s' % key}
    return display(py.iplot(figure, filename=filename))

In [14]:
pie_plot('real_technique', filename='pie-realtechnique')

In [15]:
pie_plot('maybelline', filename='pie-maybelline')

Even though small Youtubers produce the majority of videos, most of the views are generated by big Youtubers. Real Technique seems to get about 80% of its views from big Youtubers, even though they made less than 30% of the videos. The same applies to Maybelline.

#### Interactive widget for pie plot

Lastly, we create interactive widgets to show throughout the years:
1. The percentage of videos made by each type of Youtuber
2. The percentage of views generated by each type of Youtuber

Some brands have a longer history than others. Therefore, if no chart is generated for a given year for a brand, it means we have no data to display. For the two most recent years, 2015 and 2016, we should have data for all brands.

The widgets require a running Python kernel to show up.

In [35]:
# Define widgets
# Year slider spanning the publication years present in video_df, starting at the earliest.
year_min = video_df.year.min()
year_max = video_df.year.max()
year_slide = widgets.widget_int.IntSlider(
    min=year_min,
    max=year_max,
    value=year_min,
    description='Year',
)

# Radio buttons mapping the displayed brand name to the keyword stored in video_df.
brand_options = {
    'Colourpop': 'colourpop',
    'CoverGirl': 'cover_girl',
    'Maybelline': 'maybelline',
    'Morphe': 'morphe',
    'Real Technique': 'real_technique',
    'Urban Decay': 'urban_decay',
}
brand_button = widgets.widget_selection.RadioButtons(
    options=brand_options,
    description='Brand',
)

# Set up plot
def _empty_pie_trace(name, x_domain):
    """Build one pie trace with all-zero values occupying `x_domain` of the figure width."""
    return {
        'values': [0, 0, 0, 0],
        'labels': ['Small Youtuber', 'Silver Play', 'Gold Play', 'Diamond Play'],
        'domain': {'x': x_domain},
        'type': 'pie',
        'direction': 'clockwise',
        'sort': False,
        'name': name,
    }


# Left pie: share of videos per Youtuber tier; right pie: share of views.
fig = {
    'data': [
        _empty_pie_trace('Video count', [0, .45]),
        _empty_pie_trace('View count', [.55, 1]),
    ]
}

# Create the plot without opening a browser tab and wrap the returned URL in a widget.
plot_options = {'auto_open': False, 'filename': 'Pie_plot'}
url = py.plot(fig, **plot_options)
pie_count = GraphWidget(url)

# Define what happens when widget's values change
def on_trait_change(change):
    """Recompute both pies for the selected year and brand and restyle the widget.

    Parameters
    ----------
    change
        Change notification supplied by the widget observer; unused, the
        current slider and radio-button values are read directly.

    Notes
    -----
    Tiers are split at 100,000, 1,000,000 and 10,000,000 subscribers.
    Videos whose channel has no known subscriber count are skipped, and a
    missing view count contributes zero views, so incomplete data neither
    raises inside the callback nor turns a total into NaN. When no video
    matches the selection, all values sent to the widget are zero.
    """
    year_value = year_slide.value
    brand_value = brand_button.value

    tier_bounds = (100000, 1000000, 10000000)
    video_counts = [0.0] * (len(tier_bounds) + 1)
    view_totals = [0.0] * (len(tier_bounds) + 1)

    # One lookup table instead of re-filtering both dataframes for every video.
    subscribers_by_channel = channel_df.set_index('id')['subscription_count']
    selected = video_df[(video_df.year == year_value) & (video_df.keyword == brand_value)]
    subscriber_counts = selected['channel_id'].map(subscribers_by_channel)

    for subscribers, views in zip(subscriber_counts, selected['view']):
        if pd.isnull(subscribers):
            continue  # unknown channel or unknown subscriber count
        # Number of thresholds reached == tier index (0 small ... 3 diamond).
        tier = int(sum(subscribers >= bound for bound in tier_bounds))
        video_counts[tier] += 1
        if not pd.isnull(views):
            view_totals[tier] += float(views)

    # Replot the pie on input change (values are sent as strings, as before).
    pie_count.restyle({
        'values': [[str(count) for count in video_counts],
                   [str(total) for total in view_totals]],
    })
# Re-run the pie update whenever either control's value changes.
for control in (year_slide, brand_button):
    control.observe(on_trait_change, names='value')

In [36]:
# Show the two controls followed by the pie widget, in that order.
display(year_slide, brand_button, pie_count)