# 1. Import libraries and install prerequisites

In [1]:
import os
import warnings

# Read the YouTube Data API key from the environment instead of committing it
# to the notebook. The key that used to be hard-coded in this cell has been
# exposed and should be revoked and replaced.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    warnings.warn(
        "YOUTUBE_API_KEY is not set; set it before building the API client."
    )

In [2]:
pip install --upgrade google-api-python-client isodate wordcloud

Note: you may need to restart the kernel to use updated packages.


In [3]:
from googleapiclient.discovery import build
from dateutil import parser

import isodate
import pandas as pd
import json

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
# Configure seaborn's global plot style ("darkgrid") with color codes enabled.
sns.set(style="darkgrid", color_codes=True)

In [None]:
# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources at run time. 'stopwords' backs the `stopwords`
# corpus imported above; 'punkt' is presumably the data needed by
# `word_tokenize` — TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud

In [None]:
# Channels in scope for the analysis; the trailing comment names each channel.
channel_ids = [
    "UCW5YeuERMmlnqo4oq8vwUpg",  # Net Ninja
    "UCtYLUTtgS3k1Fg4y5tAhLbw",  # Statquest
    "UCCezIgC97PvUuR4_gbFUs5g",  # Corey Schafer
    "UCfzlCWGWYyIQ0aLC5w48gBQ",  # Sentdex
    "UCNU_lfiiWBdtULKOw6X0Dig",  # Krish Naik
    "UCzL_0nIe8B4-7ShhVPfJkgw",  # DatascienceDoJo
    "UCLLw7jmFsvfIVaUFsLs8mlQ",  # Luke Barousse
    "UCiT9RITQ9PW6BhXK0y2jaeg",  # Ken Jee
    "UC7cs8q-gJRlGwj4A8OmCmXg",  # Alex the analyst
    "UC2UXDak6o7rBm23k3Vv5dww",  # Tina Huang
]

In [None]:
# Build the API client object that the helper functions below take as `youtube`.
api_service_name, api_version = "youtube", "v3"
youtube = build(api_service_name, api_version, developerKey=api_key)

# 2. Data creation with Youtube API

I first created a project in the Google Developers Console and requested an authorization credential (API key). I then enabled the YouTube Data API for my application so that I can send requests to the YouTube API services. Next, I went to YouTube and looked up the channel ID of each channel that I wanted to include in my research scope (using their URLs). Finally, I created the functions for getting the channel statistics via the API.

In [None]:
def get_Channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist

    Params:

    youtube: the build object from googleapiclient.discovery
    channel_ids: list of channel IDs

    Returns:
    Dataframe with one row per channel returned by the API and the columns
    ChannelName, subscribers, views, totalVideos, playListId. The count columns
    hold the raw values from the response; a statistic that is absent from the
    response (presumably e.g. a hidden subscriber count) is stored as None.
    The dataframe has these columns even when no channel is returned.
    """
    columns = ["ChannelName", "subscribers", "views", "totalVideos", "playListId"]
    # Send the IDs in batches of 50, the same batch size get_video_details uses,
    # so a long ID list is not packed into a single request.
    batch_size = 50

    all_data = []
    for start in range(0, len(channel_ids), batch_size):
        request = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=",".join(channel_ids[start:start + batch_size])
        )
        response = request.execute()

        # A response without "items" simply contributes no rows.
        for item in response.get("items", []):
            statistics = item.get("statistics", {})
            all_data.append({
                "ChannelName": item["snippet"]["title"],
                "subscribers": statistics.get("subscriberCount"),
                "views": statistics.get("viewCount"),
                "totalVideos": statistics.get("videoCount"),
                "playListId": item["contentDetails"]["relatedPlaylists"]["uploads"]
            })

    return pd.DataFrame(all_data, columns=columns)

In [None]:
def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:

    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel

    Returns:
    List of video IDs of all videos in the playlist, in the order the API
    returns them (empty list if the playlist has no items)

    """

    video_ids = []
    next_page_token = None

    while True:
        request = youtube.playlistItems().list(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()

        video_ids.extend(
            item['contentDetails']['videoId']
            for item in response.get('items', [])
        )

        # Advance the token once per page, outside the per-item loop, so a page
        # with no items cannot leave the token unchanged and repeat the same
        # request forever.
        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            break

    return video_ids

In [None]:
def get_full_info_video(columns_name, video):
    """
    Get Full information about one video
    Params:
        columns_name: dict mapping a section of the video resource
            (e.g. 'snippet') to the list of field names to keep from it
        video: Data video (one item of a videos().list response)
    Returns:
    Dict with 'video_id' plus one entry per requested field, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    A field that is missing from the video data is stored as None.
    Raises:
        KeyError: if the video has no 'id'
    """
    video_info = {'video_id': video['id']}

    for col_parent_name, info_data_names in columns_name.items():
        for info_data_name in info_data_names:
            try:
                video_info[info_data_name] = video[col_parent_name][info_data_name]
            except (KeyError, TypeError):
                # Only a failed lookup (absent key, or a section that is None)
                # means "no value"; any other error should surface.
                video_info[info_data_name] = None

    return video_info

In [None]:
def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:

    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs

    Returns:
    Dataframe with one row per video returned by the API and the columns
    'video_id' followed by:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    The dataframe has these columns even when no video is returned.
    """
    # Sections of the video resource and the fields kept from each. This never
    # changes between videos, so it is built once instead of once per video.
    stats_to_keep = {
        "snippet": [
            "channelTitle",
            "title",
            "description",
            "tags",
            "publishedAt"
        ],
        "statistics": [
            "viewCount",
            "likeCount",
            "favoriteCount",
            "commentCount"
        ],
        "contentDetails": [
            "duration",
            "definition",
            "caption"
        ]
    }
    columns = ['video_id'] + [
        field for fields in stats_to_keep.values() for field in fields
    ]

    all_video_info = []

    # Request the videos in batches of 50 IDs
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_ids[i:i+50]
        )

        response = request.execute()

        # A response without 'items' simply contributes no rows
        for video in response.get('items', []):
            all_video_info.append(get_full_info_video(stats_to_keep, video))

    return pd.DataFrame(all_video_info, columns=columns)

# 3. Channel statistics 

Using the `get_Channel_stats` function defined above, we now obtain the channel statistics for the 10 channels in scope.

### A. Data Exploring (Pre-processing)

In [None]:
# Fetch the statistics for every channel in `channel_ids`, then display the frame.
channel_df = get_Channel_stats(youtube, channel_ids)

channel_df

In [None]:
# Show the dtype of each column of channel_df.
channel_df.dtypes

In [None]:
# Convert count columns to numeric columns.
# The width is spelled out as int64 because plain 'int' can resolve to a
# 32-bit integer on some platforms, which is too small for large view counts.
numeric_columns = ["views", "subscribers", "totalVideos"]

channel_df[numeric_columns] = channel_df[numeric_columns].astype("int64")

In [None]:
# Show the dtypes again to check the result of the conversion.
channel_df.dtypes

In [None]:
# Use the channel name as the row label. The guard keeps the cell safe to
# re-run: once "ChannelName" is the index it is no longer a column, and a
# second set_index call would raise a KeyError.
if "ChannelName" in channel_df.columns:
    channel_df = channel_df.set_index("ChannelName")

In [None]:
# Display the frame, now indexed by channel name.
channel_df

### B. Visualisation 

#### The number of subscribers per channel

Let's take a look at the number of subscribers per channel to have a view of how popular the channels are when compared with one another.

In [None]:
# Print the names of the available matplotlib styles, then switch to "fivethirtyeight".
print(plt.style.available)
plt.style.use("fivethirtyeight")

In [None]:
# Keep the Axes in its own variable. Binding it to `plt` would replace the
# `matplotlib.pyplot` alias imported at the top of the notebook, and the later
# cells that call plt.title(), plt.show() and plt.figure() would then fail.
ax_subscribers = channel_df["subscribers"].sort_values(ascending=False).plot(
    kind="bar",
    color=["orange", "pink", "blue", "green", "cyan", "red", "gray", "violet", "crimson"],
    title="the number of subscribers per channel ",
)

# Label the y-axis ticks in thousands, e.g. 250000 -> "250K"
ax_subscribers.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.0f}'.format(x/1000) + 'K'))
ax_subscribers

#### the total number of views of the channels

Next, we will look at the ranking by the total number of views of the channels. The ranking is fairly similar to the subscriber count ranking. Sentdex and Corey Schafer remain the two most popular channels considering both subscribers and views. Interestingly, some channels have more subscribers but fewer views, and vice versa. For example, the Ken Jee channel has significantly more subscribers than the Luke Barousse channel, but slightly fewer views in total.

In [None]:
# Bar chart of the total views per channel, largest first.
plt_views = channel_df["views"].sort_values(ascending=False).plot.bar(
    color=["orange", "pink", "blue", "green", "cyan", "red", "gray", "violet", "crimson"],
    title="The total number of views of the channels",
)

# Label the y-axis ticks in millions, e.g. 25000000 -> "25M"
plt_views.yaxis.set_major_formatter(
    ticker.FuncFormatter(lambda value, _pos: f"{value / 1000000:,.0f}M")
)

plt_views

In [None]:
# Horizontal bar chart of the number of videos per channel, sorted ascending.
channel_df["totalVideos"].sort_values(ascending=True).plot.barh(
    color=["orange", "pink", "blue", "green", "cyan", "red", "gray"],
    title="The total number of videos of the channels",
)

# 4. Video statistics for all the channels

### A. Exploring Data

In [None]:
# Two separate, empty frames to start from.
video_df, comments_df = pd.DataFrame(), pd.DataFrame()

In [None]:
def get_list_ids_from_channel_df(channel_df):
    """
    Get the upload playlist ID of every channel in the channel dataframe
    Params:

    channel_df: dataframe with a 'playListId' column, one row per channel

    Returns:
    List with the 'playListId' value of each row, in row order
    """
    # Read the column directly instead of looking rows up by index label:
    # a label shared by several rows would make .loc return a Series instead
    # of a single playlist ID.
    return channel_df['playListId'].tolist()

In [None]:
# Collect the 'playListId' value of each channel in channel_df.
list_play_list_id = get_list_ids_from_channel_df(channel_df)

In [None]:
%%time
for playListId in list_play_list_id:
    print("Getting information video")
    video_ids = get_video_ids(youtube, playListId)
    video_data = get_video_details(youtube, video_ids)
    
    video_df = pd.concat([video_data, video_df])

In [None]:
# Display the combined video frame.
video_df

In [None]:
# Convert the count columns to numbers; values that cannot be parsed become NaN.
# pd.to_numeric is applied once per column (apply's default axis) rather than
# once per row, which is the intended use and far fewer Python-level calls.
numeric_cols = ['viewCount', 'likeCount', 'favoriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

I want to enrich the data for further analyses, for example:

-  create published date column with another column showing the day in the week the video was published, which will be useful for later analysis.

-  convert video duration to seconds instead of the current default string format

-  calculate number of tags for each video

-  calculate comments and likes per 1000 view ratio

-  calculate title character length

In [None]:
# Parse the publication timestamp, then derive the weekday name from it
video_df['publishedAt'] = video_df['publishedAt'].map(parser.parse)
video_df['pushblishDayName'] = video_df['publishedAt'].map(
    lambda published: published.strftime("%A")
)

In [None]:
# Spot-check five random rows of the two date columns.
video_df[['publishedAt', 'pushblishDayName']].sample(5)

In [None]:
# convert duration to seconds
# Take the number of seconds from the parsed duration directly. Casting with
# astype('timedelta64[s]') only changes the timedelta resolution on pandas 2+,
# so the column would not be numeric and the later `durationSecs < 10000`
# filter would fail.
video_df['durationSecs'] = video_df['duration'].apply(
    lambda x: isodate.parse_duration(x).total_seconds()
)

In [None]:
# Number of tags per video; a video without tags (None) counts as zero
video_df['tagsCount'] = video_df['tags'].apply(
    lambda tags: len(tags) if tags is not None else 0
)

In [None]:
# Likes and comments per 1000 views
video_df['likeRatio'] = video_df['likeCount'].div(video_df['viewCount']).mul(1000)
video_df['commentRatio'] = video_df['commentCount'].div(video_df['viewCount']).mul(1000)

In [None]:
# Number of characters in each title
video_df['titleLength'] = video_df['title'].map(len)

In [None]:
# Scatter plots of view count against the absolute comment and like counts.
video_df.plot.scatter(x= "commentCount", y = "viewCount", subplots=True)
video_df.plot.scatter(x= "likeCount", y = "viewCount", subplots=True)

Now we will take a look at the correlation if we look at the comment ratio and like ratio instead of the absolute number.

In [None]:
# Same scatter plots, using the per-1000-views ratios on the x-axis.
video_df.plot.scatter( x = "commentRatio", y = "viewCount")
video_df.plot.scatter( x= "likeRatio", y="viewCount")

As can be seen in the histogram below, most videos are between 300 and 1,200 seconds long, which is about 5 to 20 minutes. Here I have to limit the duration to 10,000 seconds because of some really long videos (potentially streaming videos).



In [None]:
# Histogram (20 bins) of video duration, keeping only videos shorter than 10000 seconds.
sns.histplot(data=video_df[video_df['durationSecs'] < 10000], x="durationSecs", bins=20)

<b>Views distribution per channel</b>

With the video statistics for all channels, we can now see how the views are distributed per channel. Some channels might have a lot of views on one of their videos while the rest do not receive many views. Other channels might have more evenly distributed views per video. It can be observed that Corey Schafer, sentdex and Luke Barousse have quite a large variance in their views, suggesting that they have a few viral videos. Alex The Analyst, Krish Naik and Data Science Dojo have fewer views overall, but the views are more consistent across videos.

In [None]:
# Distribution of view counts per channel; the columns are referenced by name
# because the frame itself is passed as `data`.
sns.violinplot(data=video_df, x="channelTitle", y="viewCount")
plt.title('Views per channel', fontsize=14)
plt.show()

## Wordcloud for words in title

As I'm interested to see what the creators are making videos about and which terms most frequently appear in their video titles, I will create a wordcloud of the most common words. We first need to remove stopwords such as "you", "I", "the", etc., which do not contribute much to the meaning of the title.

In [None]:
stop_words = set(stopwords.words('english'))
# Compare in lower case so that capitalised stopwords in titles ("The", "How",
# "I") are removed too; the kept words retain their original casing.
video_df['title_no_stopwords'] = video_df['title'].apply(
    lambda x: [item for item in str(x).split() if item.lower() not in stop_words]
)

# Flatten the per-title word lists into one list, then into a single string
all_words = [word for words in video_df['title_no_stopwords'] for word in words]
all_words_str = ' '.join(all_words)

In [None]:
def plot_cloud(wordcloud):
    """Draw `wordcloud` on a new figure with figsize=(30, 20) and the axes turned off."""
    plt.figure(figsize=(30, 20))
    plt.imshow(wordcloud)
    plt.axis("off")

# Generate the word cloud from the stopword-free title words and plot it.
wordcloud = WordCloud(
    width=2000,
    height=1000,
    random_state=1,
    background_color='black',
    colormap='viridis',
    collocations=False,
).generate(all_words_str)
plot_cloud(wordcloud)