# Introduction

This script creates simple interactive plots to visualize how cosmetic brands are represented on social media, specifically on the YouTube platform, by looking at the number of videos related to each brand and their statistics.

In [44]:
import json
import pandas as pd
import dateutil.parser
import plotly.tools as tls

In [45]:
# Register the Plotly account used by the plotting calls in this notebook.
# The username and API key are read from the local Config module rather than
# being written in the notebook itself.
import Config

plotly_credentials = {
    'username': Config.plotly_developer_name,
    'api_key': Config.plotly_developer_key,
}
tls.set_credentials_file(**plotly_credentials)

# Video data 

We first read the JSON files containing the attributes of videos and channels into dataframes. For videos, we keep the following information and statistics about each video:
* video_id: a unique string identifying a video
* video_channelId: a unique string identifying the channel of a video
* video_categoryId: an integer identifying the category of a video
* video_title: displayed title of a video
* video_time: date and time when a video was published
* video_comment: the number of comments on a video
* video_view: the number of views of a video
* keyword: string used in search bar to retrieve the video
* video_like: the number of likes of a video
* video_dislike: the number of dislikes of a video
* video_year: year when video was published
* video_month: month when video was published

In [46]:
# Parse the JSON stored in video_property.txt. The resulting dict is keyed by
# the brand keyword; each value is the list of video responses for that brand.
with open('video_property.txt') as video_file:
    video_data = json.loads(video_file.read())

In [47]:
# Dataframe with video's attributes
#
# One entry is appended to every list below for every video, so all lists end
# up with the same length and can be used directly as dataframe columns.
video_id = []
video_channelId = []
video_categoryId = []
video_title = []
video_time = []
video_comment = []
video_view = []
keyword = []
video_like = []
video_dislike = []
video_year = []
video_month = []

# Errors that mean "this field is missing or malformed" for a single video.
# Catching only these (rather than a bare ``except``) keeps the best-effort
# behaviour without hiding unrelated problems such as a KeyboardInterrupt.
_VIDEO_FIELD_ERRORS = (KeyError, IndexError, TypeError, ValueError, OverflowError)


def _video_field(video, path, convert=None):
    """Return one field of a video response, or None when it is unavailable.

    video   -- parsed response for one video; the field is looked up below
               ``video['items'][0]``.
    path    -- sequence of nested keys to follow from that first item.
    convert -- optional callable applied to the raw value (e.g. ``int``).
    """
    try:
        value = video['items'][0]
        for key in path:
            value = value[key]
        return value if convert is None else convert(value)
    except _VIDEO_FIELD_ERRORS:
        return None


for brand, brand_videos in video_data.items():
    for video in brand_videos:
        # Parse the publication timestamp once; time, month and year all
        # derive from it and are None together when it cannot be parsed.
        published = _video_field(video, ('snippet', 'publishedAt'), dateutil.parser.parse)

        keyword.append(brand)
        video_id.append(_video_field(video, ('id',)))
        video_channelId.append(_video_field(video, ('snippet', 'channelId')))
        video_categoryId.append(_video_field(video, ('snippet', 'categoryId')))
        video_title.append(_video_field(video, ('snippet', 'title')))
        video_time.append(published)
        video_comment.append(_video_field(video, ('statistics', 'commentCount'), int))
        video_view.append(_video_field(video, ('statistics', 'viewCount'), int))
        video_like.append(_video_field(video, ('statistics', 'likeCount'), int))
        video_dislike.append(_video_field(video, ('statistics', 'dislikeCount'), int))
        video_month.append(None if published is None else published.month)
        video_year.append(None if published is None else published.year)

video_df = pd.DataFrame({'id': video_id, 'channel_id': video_channelId, 'category_id': video_categoryId, 'keyword': keyword,
                   'title': video_title, 'time': video_time, 'comment': video_comment, 'view': video_view,
                  'like': video_like, 'dislike': video_dislike, 'year': video_year, 'month': video_month})
# A video can be returned for more than one keyword; keep its first occurrence.
video_df = video_df.drop_duplicates(subset = 'id')

# Channel data

We read the channel information into a dataframe. The channel information helps us determine what type of YouTube channel it is, based on its number of subscribers. YouTube gives out awards when YouTubers reach 100k, 1 million and 10 million subscribers. Therefore, we have four types of YouTuber.

* Small Youtuber: YouTubers with fewer than 100,000 subscribers
* Silver play: YouTubers with at least 100,000 and fewer than 1 million subscribers
* Gold play: YouTubers with at least 1 million and fewer than 10 million subscribers
* Diamond play: YouTubers with 10 million subscribers or more

The last three types are considered YouTube influencers.

In [48]:
# Parse the JSON stored in channel_info.txt. The resulting dict is keyed by
# channel id; each value is the response describing that channel.
with open('channel_info.txt') as channel_file:
    channel_data = json.loads(channel_file.read())

In [49]:
# Dataframe with channel's attributes
#
# One entry is appended to every list for every channel, so the lists stay
# aligned and can be used directly as dataframe columns.
channel_sub = []
channel_country = []
channel_title = []
channelId = []

# Errors that mean "this field is missing or malformed" for a single channel.
# Catching only these (rather than a bare ``except``) keeps the best-effort
# behaviour without hiding unrelated problems such as a KeyboardInterrupt.
_CHANNEL_FIELD_ERRORS = (KeyError, IndexError, TypeError, ValueError)


def _channel_field(response, path, convert=None):
    """Return one field of a channel response, or None when it is unavailable.

    response -- parsed response for one channel; the field is looked up below
                ``response['items'][0]``.
    path     -- sequence of nested keys to follow from that first item.
    convert  -- optional callable applied to the raw value (e.g. ``int``).
    """
    try:
        value = response['items'][0]
        for key in path:
            value = value[key]
        return value if convert is None else convert(value)
    except _CHANNEL_FIELD_ERRORS:
        return None


for channel_id, channel_response in channel_data.items():
    channelId.append(channel_id)
    channel_sub.append(_channel_field(channel_response, ('statistics', 'subscriberCount'), int))
    channel_country.append(_channel_field(channel_response, ('snippet', 'country')))
    channel_title.append(_channel_field(channel_response, ('snippet', 'title')))

# channel_country is collected above but is not one of the dataframe's columns.
channel_df = pd.DataFrame({'id': channelId, 'title': channel_title, 'subscription_count': channel_sub})
channel_df = channel_df.drop_duplicates(subset = 'id')


# Plotting
Next, we create plots to show how many views, likes and dislikes each brand gets on the YouTube platform over the months. We will also create widgets to show how view counts are driven by different types of YouTuber. Specifically, we are interested in two types of YouTuber: influencers and small YouTubers.

In [50]:
import ipywidgets as widgets
import plotly.plotly as py
from plotly.graph_objs import *
from plotly.widgets import GraphWidget
from IPython.display import display

We group the view, like, and dislike counts by month and plot the cumulative values for each brand.

In [108]:
# One list of traces per metric; each list receives one Scatter per brand.
cummulative_view = []
cummulative_like = []
cummulative_dislike = []

metric_columns = ['view', 'like', 'dislike']

# Index the metrics by publication time so they can be binned by month.
series = video_df[metric_columns + ['keyword']].copy()
series.index = video_df.time
# A video without a publication time cannot be placed in any monthly bin.
series = series[pd.notnull(series.index)]

for brand in series.keyword.unique():
    # Aggregate only the numeric metric columns: the keyword column is text
    # and must not take part in the monthly sum.
    brand_metrics = series.loc[series.keyword == brand, metric_columns]
    # Monthly totals, with months that have no data counted as 0 so the running
    # total stays defined for every month. fillna is used because missing
    # values are NaN, not the string 'NaN'.
    tem = brand_metrics.resample('M').sum().fillna(0).cumsum()
    cummulative_view.append(graph_objs.Scatter(x=tem.index, y=tem['view'], name=brand))
    cummulative_like.append(graph_objs.Scatter(x=tem.index, y=tem['like'], name=brand))
    cummulative_dislike.append(graph_objs.Scatter(x=tem.index, y=tem['dislike'], name=brand))
    

#### Cumulative view counts

In [111]:
# Plot the cumulative view-count traces (one Scatter per brand) inline.
py.iplot(cummulative_view)

#### Cumulative like counts

In [112]:
# Plot the cumulative like-count traces (one Scatter per brand) inline.
py.iplot(cummulative_like)

#### Cumulative dislike counts

In [113]:
# Plot the cumulative dislike-count traces (one Scatter per brand) inline.
py.iplot(cummulative_dislike)

CoverGirl and Real Techniques are the two brands that seem to be dominant over the rest.

CoverGirl's presence on YouTube is quite a surprise to me. As a long-time follower of this community, I personally feel that I haven't heard or seen much about the brand from the gurus I subscribe to. Initially, I included CoverGirl and Maybelline in this set of brands because I wanted to see how they perform compared to newly established brands such as Morphe, Real Techniques and Colourpop. My initial expectation was that they would be slightly higher than the younger names, and that Maybelline would be higher than CoverGirl.

Real Techniques is another interesting case. In case you don't know, it is the brand created by Samantha and Nicola Chapman, the creators of the Pixiwoo channel and part of the first generation of beauty gurus, with over two million subscribers to date. I knew they were popular but didn't anticipate this many views.

For the rest of the brands, Colourpop and Morphe are two new names in the industry, and they seem to be quickly catching up with the two well-established brands Urban Decay and Maybelline.


In [115]:
# Define widgets

# Month selector covering January (1) to December (12), starting at 1.
month_slide = widgets.widget_int.IntSlider(min=1, max=12, value=1, description='Month')

# Year selector spanning the publication years present in video_df,
# starting at the earliest one.
year_min = video_df.year.min()
year_max = video_df.year.max()
year_slide = widgets.widget_int.IntSlider(min = year_min, max = year_max,
                                          value = year_min, description = 'Year')

# Brand selector: the keys are the labels shown to the user, the values are
# the keyword strings stored in video_df.keyword.
brand_options = {'Colourpop': 'colourpop',
                 'CoverGirl': 'cover_girl',
                 'Maybelline': 'maybelline',
                 'Morphe': 'morphe',
                 'Real Technique': 'real_technique',
                 'Urban Decay': 'urban_decay'}
brand_button = widgets.widget_selection.RadioButtons(description='Brand', options=brand_options)

# Set up plot
def _empty_pie_trace(name, x_domain):
    """Return a pie trace with one zero-valued slice per YouTuber tier.

    name     -- trace name ('Video count' or 'View count').
    x_domain -- [start, end] horizontal extent of the pie within the figure.
    """
    return {
        'values': [0, 0, 0, 0],
        'labels': ['Small Youtuber', 'Silver Play', 'Gold Play', 'Diamond Play'],
        'domain': {'x': x_domain},
        'type': 'pie',
        'direction': 'clockwise',
        # Keep the slices in tier order instead of sorting them by size.
        'sort': False,
        'name': name,
    }


# Two side-by-side pies: video counts on the left, view counts on the right.
fig = {
    'data': [
        _empty_pie_trace('Video count', [0, .45]),
        _empty_pie_trace('View count', [.55, 1]),
    ]
}

# Create the figure without opening a browser tab, then wrap the returned URL
# in a widget so the pies can be restyled from the notebook.
url = py.plot(fig, auto_open=False, filename = 'Pie_plot')
pie_count = GraphWidget(url)

# Define what happens when widget's values change
def on_trait_change(change):
    """Recompute both pies for the currently selected year and brand.

    Videos of the selected brand published in the selected year are grouped by
    the subscriber count of their channel into four tiers (below 100,000;
    below 1,000,000; below 10,000,000; 10,000,000 and above). The number of
    videos and the summed view counts per tier are pushed to ``pie_count``.

    change -- change notification passed by the widget; it is not inspected,
              the current widget values are read instead.

    The month slider is not part of the filter: the breakdown is per year.
    Videos whose channel is unknown or has no subscriber count are left out,
    and a missing view count contributes nothing to the view total.
    """
    year_value = year_slide.value
    brand_value = brand_button.value

    # Lower bounds of the Silver, Gold and Diamond tiers.
    tier_bounds = (100000, 1000000, 10000000)
    video_counts = [0, 0, 0, 0]
    view_totals = [0, 0, 0, 0]

    # Build the channel lookup once instead of filtering channel_df per video.
    subscribers_by_channel = dict(zip(channel_df['id'], channel_df['subscription_count']))

    selected = video_df[(video_df['year'] == year_value) & (video_df['keyword'] == brand_value)]
    for channel, view in zip(selected['channel_id'], selected['view']):
        subscribers = subscribers_by_channel.get(channel)
        if subscribers is None or pd.isnull(subscribers):
            continue
        # The tier index is the number of lower bounds the channel has reached.
        tier = sum(1 for bound in tier_bounds if subscribers >= bound)
        video_counts[tier] += 1
        if not pd.isnull(view):
            view_totals[tier] += int(view)

    # Replot the pie on input change
    pie_count.restyle({
            'values':[[str(count) for count in video_counts],
                     [str(total) for total in view_totals]],
        })

    
# Refresh the pies whenever the value of any of the three controls changes.
for control in (month_slide, year_slide, brand_button):
    control.observe(on_trait_change, names='value')

For each brand, we create interactive pie plots that show, for each year:
1. The percentage of videos made by each type of Youtuber (left)
2. The percentage of views generated by each type of Youtuber (right)

Some brands have a longer history than others. Therefore, if no chart is generated for a given year and brand, it means we have no data to display. For the two most recent years, 2015 and 2016, we should have data for all brands.

In [116]:
# Left pie: share of videos made by each type of youtuber.
# Right pie: share of views contributed by each type of youtuber.
# Show the year slider, then the brand selector, then the pie widget.
for element in (year_slide, brand_button, pie_count):
    display(element)