# YouTube Data API v3

In [1]:
import requests
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# api_key is stored in config.py
import config
import networkx as nx

import collections

### 1 - Youtube Search (query, n)
    Returns list of channel dictionaries
### 2 - Youtube Channel List
    Returns details on a specific channel
#### A - Request channel details (channelId)
    Returns a json dictionary for a specific channel Id
#### B - Run Channel List (list of channelIds)
    Returns a list of dictionaries per channel

The search method pages through results using page tokens, so that more than 50 results can be retrieved.

Uses `requests` to query the search endpoint with `type=channel` and `part=snippet`; the `order` parameter is not set.

Creates an empty list and extends it with the items of each JSON response (up to 50 at a time).

### Function to return a list of maxResults parameters, one for each API request

 For a given total number of requested results and a specified batch_size, the function returns a list of maxResults values, one per request.
 
 For example: (123, 50) returns [50,50,23]

In [2]:
def determine_max_result_list(n, batch_size):
    '''Return the list of per-request sizes needed to fetch ``n`` results.

    The list holds ``n // batch_size`` full batches followed by one smaller
    batch for the remainder, if there is one.

    For example: (123,50) returns [50,50,23]

    Parameters:
        n: total number of results wanted; values <= 0 give an empty list.
        batch_size: maximum number of results per request.

    Returns:
        A list of ints that sums to ``n`` (empty when ``n`` <= 0).

    Raises:
        ZeroDivisionError: if ``batch_size`` is 0.
    '''
    if n <= 0:
        return []

    # The remainder must be taken against batch_size, not a hard-coded 50;
    # otherwise (100, 30) loses its last batch and (60, 20) gains a 0-sized one.
    full_batches, remainder = divmod(n, batch_size)

    request_size_list = [batch_size] * full_batches
    if remainder:
        request_size_list.append(remainder)
    return request_size_list

### Function to request a youtube search response for a given query and number of requests

Returns a list of dictionaries; each dictionary represents a specific channel and its details.<br>
A channelid_list can then be built from each entry's ```['snippet']['channelId']``` field.

In [3]:
# Search for channels by a specific query
def youtube_request_search_channels(query, n):
    '''Returns a list of up to n channels that match the query.

    Uses /youtube/v3/search with part=snippet and type=channel, requesting
    at most 50 results per call and following the page token returned by
    each response.

    Parameters:
        query: search string sent as the ``q`` parameter.
        n: total number of channels wanted.

    Returns:
        A list of the ``items`` dictionaries collected from every response.
        It can be shorter than ``n`` when the API runs out of pages.

    Raises:
        requests.HTTPError: if a request returns an error status.
    '''
    request_size_list = determine_max_result_list(n, 50)

    # Collects the items of every response page
    channel_list = []

    # No token for the first request; later requests use the token returned
    # by the previous response
    next_page_token = None

    for request_size in request_size_list:
        params = dict(part='snippet',
                      type='channel',
                      maxResults=request_size,
                      q=query,
                      key=config.api_key)
        if next_page_token:
            params['pageToken'] = next_page_token

        resp = requests.get('https://www.googleapis.com/youtube/v3/search',
                            params=params)
        # Raise explicitly instead of asserting: asserts vanish under -O
        resp.raise_for_status()

        # Parse the body once and reuse it
        payload = resp.json()
        channel_list.extend(payload.get('items', []))

        # A response without nextPageToken has no further page, so stop
        # rather than failing with a KeyError
        next_page_token = payload.get('nextPageToken')
        if not next_page_token:
            break

    return channel_list
#CHANNEL_LIST = youtube_request_search_channels('Minecraft',23)

# Extract Channel Ids from the list of dictionaries
#CHANNELID_LIST = [channel['snippet']['channelId'] for channel in CHANNEL_LIST]

### Function to request details on a list of channelIds

Returns a list of channel_resp json dictionaries stored in ```json.content['items']``` across all json responses (if requesting a list larger than 50)

In [4]:
# Get channel details and snippet
def youtube_request_channel_list(channelid_list):
    '''Request channel details for a list of channel ids.

    Uses /youtube/v3/channels, sending the ids in batches of at most 50.
    Duplicate ids are requested only once (first occurrence wins).

    Parameters:
        channelid_list: sequence of channel id strings.

    Returns:
        A list of the ``items`` dictionaries collected from every response;
        empty when ``channelid_list`` is empty.

    Raises:
        requests.HTTPError: if a request returns an error status.
    '''
    # Drop duplicates while keeping order, so the batches below cover every
    # distinct id (slicing the raw list by the unique count lost trailing ids)
    unique_ids = list(dict.fromkeys(channelid_list))
    if len(unique_ids) != len(channelid_list):
        print("There are non-unique elements in this list")

    batch_size = 50

    # Build the part list without embedded whitespace
    parts = ','.join(['contentDetails', 'snippet', 'statistics',
                      'brandingSettings', 'topicDetails', 'status', 'id',
                      'contentOwnerDetails'])

    channel_resp_list = []

    for start_index in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start_index:start_index + batch_size]

        resp = requests.get(
            'https://www.googleapis.com/youtube/v3/channels',
            params=dict(part=parts,
                        # ids are sent as one comma-separated value
                        id=','.join(batch),
                        maxResults=batch_size,
                        key=config.api_key)
        )
        resp.raise_for_status()

        # Tolerate a response that carries no 'items' key
        channel_resp_list.extend(resp.json().get('items', []))

    return channel_resp_list
#CHANNEL_RESP = youtube_request_channel_list(CHANNELID_LIST)

### Function to flatten dictionary nesting of channel resp for dataframe format

Returns a list of dictionaries where each key in the dictionary maps to a column of interest.<br>
As input takes a list of dictionaries where each dictionary is a specific 'item' from the original json.

In [5]:
def extract_channel_details(channel_resp):
    '''Flatten each channel response item into a single-level dictionary.

    For every item the result starts with its ``id`` and is then updated, in
    this order, with the contents of ``snippet``, ``contentDetails``,
    ``status``, ``statistics`` and ``brandingSettings['channel']``; a key
    that appears in several parts keeps the value from the last one.

    Parameters:
        channel_resp: list of channel item dictionaries.

    Returns:
        A list with one flat dictionary per input item.

    Raises:
        KeyError: if an item lacks one of the parts listed above.
    '''
    # Parts merged verbatim, in merge order
    merged_parts = ('snippet', 'contentDetails', 'status', 'statistics')

    flattened = []
    for item in channel_resp:
        row = {'id': item['id']}
        for part in merged_parts:
            row.update(item[part])
        # Only the 'channel' sub-dictionary of the branding settings is kept
        row.update(item['brandingSettings']['channel'])
        flattened.append(row)
    return flattened

#CHANNEL_DETAILS_LIST = extract_channel_details(CHANNEL_RESP)

# Insert list of dictionaries into pandas dataframe
#df = pd.DataFrame(CHANNEL_DETAILS_LIST)

# YouTube Channel Details by Search

Returns a details list of channels for a specific search query and number of requested results.

In [7]:
def youtube_channel_details_by_search(query, n):
    '''Returns a details list of channels for a specific search query and number of requested results.

    Parameters:
        query: search string passed to youtube_request_search_channels.
        n: number of channels to request from the search.

    Returns:
        The list of flat dictionaries produced by extract_channel_details.
    '''
    # Request n number of channels from a youtube search query
    channel_list = youtube_request_search_channels(query, n)

    # Retrieve Channel Ids from the list of dictionaries
    channelid_list = [channel['snippet']['channelId'] for channel in channel_list]

    # Request details for all channels in a list of channelIds
    channel_response = youtube_request_channel_list(channelid_list)

    # Flatten the responses once and return that result
    return extract_channel_details(channel_response)

# Search for 20 channels matching "corridor digital" and keep their flattened details
CHANNEL_DETAILS_LIST = youtube_channel_details_by_search("corridor digital", 20)

In [13]:
# Step-by-step version of the search pipeline: the same calls that
# youtube_channel_details_by_search makes, but with each intermediate result
# kept in a module-level name (CHANNEL_RESP is used again further down).

# Request n number of channels from a youtube search
CHANNEL_LIST = youtube_request_search_channels('Corridor Digital',20)

# Retrieve Channel Ids from the list of dictionaries
CHANNELID_LIST = [channel['snippet']['channelId'] for channel in CHANNEL_LIST]

# Request details from each channel
CHANNEL_RESP = youtube_request_channel_list(CHANNELID_LIST)

# Extract channel details to a list of dictionaries for pandas
CHANNEL_DETAILS_LIST = extract_channel_details(CHANNEL_RESP)

In [None]:
# Crawl two degrees of featured channels starting from one channel.
# The crawler takes channel ids, so pass the response's 'id' field rather
# than the whole response dictionary (a dict is unhashable and would fail
# in the uniqueness check of youtube_request_channel_list).
# Requires CHANNEL_RESP to hold at least 10 entries.
youtube_channel_details_by_network([CHANNEL_RESP[9]['id']], 2)

# Youtube Channel Details by Network

In [36]:
def youtube_channel_details_by_network(channelid_list, max_degree):
    '''Collect channel details for a set of channels and their featured-channel network.

    Starting from ``channelid_list``, repeatedly follows the featured
    channels of the most recently fetched channels (breadth first), up to
    ``max_degree`` steps away. Each channel id is requested at most once,
    and the walk stops early when a step discovers no new channel.

    Parameters:
        channelid_list: list of channel id strings to start from.
        max_degree: maximum number of featured-channel steps to follow.

    Returns:
        A list of channel response dictionaries: the origin channels first,
        then the channels found at each successive degree.
    '''
    # Get a response for the list of origin channel ids
    channel_resp = youtube_request_channel_list(channelid_list)

    # Output list; copied so the origin response list is not mutated
    network_channels_resp = list(channel_resp)

    # Ids already requested or received; without this, channels that feature
    # each other are fetched and appended again at every degree
    seen_ids = set(channelid_list)
    seen_ids.update(channel['id'] for channel in channel_resp)

    # Channels whose featured channels are expanded in the next step
    neighbors_channels_resp = channel_resp

    # Loop over each degree of separation (breadth first search)
    for _ in range(max_degree):

        # Featured channel ids of the current frontier, minus known ones
        featured_ids = extract_featured_channels(neighbors_channels_resp)
        new_ids = [channel_id for channel_id in featured_ids
                   if channel_id not in seen_ids]

        # Nothing new to fetch: deeper degrees would be empty too
        if not new_ids:
            break
        seen_ids.update(new_ids)

        # Request channel details from Youtube using list of channel ids
        neighbors_channels_resp = youtube_request_channel_list(new_ids)

        # Add n-degree channel details response
        network_channels_resp.extend(neighbors_channels_resp)

    return network_channels_resp

In [11]:
def extract_featured_channels(channel_response):
    '''Return the distinct featured channel ids found in a list of channel responses.

    Reads ``brandingSettings['channel']['featuredChannelsUrls']`` of every
    response, and prints how many distinct ids were found and how many
    responses had no such entry.

    Parameters:
        channel_response: list of channel response dictionaries.

    Returns:
        A list of the distinct featured channel ids, in no particular order.
    '''
    distinct_ids = set()
    without_features = 0

    for response in channel_response:
        branding = response['brandingSettings']['channel']
        if 'featuredChannelsUrls' not in branding:
            without_features += 1
            continue
        distinct_ids.update(branding['featuredChannelsUrls'])

    print(f'{len(distinct_ids)} neighbors; {without_features} out of {len(channel_response)} channels do not feature channels')
    return list(distinct_ids)

## NetworkX

In [None]:
# Map each channel id to the ids it features (empty list when it features none)
CHANNEL_NETWORK = {
    details['id']: details.get('featuredChannelsUrls', [])
    for details in CHANNEL_DETAILS_LIST
}

# Map each channel id to its title (empty string when no title is present)
CHANNEL_NAMES = {
    details['id']: details.get('title', '')
    for details in CHANNEL_DETAILS_LIST
}

In [None]:
def graph_channels(channel_details_list):
    '''Build and draw the featured-channel graph from flattened channel details.

    NOTE: a later cell in this file defines another ``graph_channels`` that
    takes ``(channel_network, channel_names)``; once that cell has run, it
    replaces this definition.

    Parameters:
        channel_details_list: list of flat channel dictionaries, each with
            an 'id' and optionally 'featuredChannelsUrls' and 'title'.

    Returns:
        The networkx DiGraph with one edge per (channel, featured channel).
    '''
    featured_by_channel = {}
    title_by_channel = {}
    for details in channel_details_list:
        channel_id = details['id']
        # Channels without featured channels contribute a node with no out-edges
        featured_by_channel[channel_id] = details.get('featuredChannelsUrls', [])
        title_by_channel[channel_id] = details.get('title', '')

    digraph = nx.DiGraph(featured_by_channel)

    plt.figure(figsize=(12, 12))
    # Only ids present in title_by_channel are given a label
    nx.draw_networkx(
        digraph,
        with_labels=True,
        labels=title_by_channel,
        font_size=12,
        font_color='red',
    )
    return digraph

In [None]:
def graph_channels(channel_network, channel_names):
    '''Draw the channel network as a directed graph and return it.

    Parameters:
        channel_network: dict mapping a channel id to the list of ids it
            points to; passed directly to ``nx.DiGraph``.
        channel_names: dict mapping node ids to the label drawn for them.

    Returns:
        The networkx DiGraph built from ``channel_network``.
    '''
    digraph = nx.DiGraph(channel_network)

    # Open a new 12x12 figure so the drawing does not reuse an earlier one
    plt.figure(figsize=(12, 12))

    draw_options = dict(
        with_labels=True,
        labels=channel_names,
        font_size=12,
        font_color='red',
    )
    nx.draw_networkx(digraph, **draw_options)
    return digraph

In [None]:
G = graph_channels(CHANNEL_NETWORK, CHANNEL_NAMES)

In [None]:
def simple_page_rank():
    '''Run 20 power-iteration steps on the adjacency matrix of the global graph G.

    Starts from a uniform vector, multiplies it by the adjacency matrix,
    rescales the result to sum to 1, and prints the norm of the change at
    every step.

    Returns:
        The score vector after the last step, one entry per node of G.
    '''
    adjacency = nx.adjacency_matrix(G)
    node_count = adjacency.shape[0]

    # Uniform starting scores that sum to 1
    scores = np.ones(node_count) / node_count

    for _ in range(20):
        # NOTE(review): adjacency @ scores gives each node the summed scores of
        # the nodes it points to; classic PageRank propagates along incoming
        # edges (the transpose) — confirm which direction is intended.
        updated = adjacency @ scores
        # NOTE(review): if every entry becomes 0 this division yields NaN — confirm
        updated /= updated.sum(0)
        # Size of the change in this step, as a convergence indicator
        print(np.linalg.norm(updated - scores))
        scores = updated
    return updated

In [None]:
V1 = simple_page_rank()

In [None]:
# Node ids of G, in the graph's node iteration order
CHANNELIDS = list(G.nodes().keys())
# Label each score in V1 with a node id; presumably V1 follows the same node
# order because it is derived from nx.adjacency_matrix(G) — TODO confirm
vector = pd.Series(data = V1, index = CHANNELIDS)
# Attach the score of each channel to its row in DF, looked up by channel id
# (DF is created in the "Exploring the data" cell further down in this file)
DF['v1'] = DF['id'].map(lambda x:vector[x])

In [None]:
len(CHANNELIDS)

In [None]:
DF[['title','id','subscriberCount','viewCount','v1','featuredChannelsCount']].sort_values('v1', ascending=False).head(20)

# Connected Components

In [None]:
import collections
def run():
    '''Return the connected components of the global graph G, ignoring edge direction.

    Prints the sorted list of component sizes as a side effect.

    Returns:
        A list with one set of node ids per connected component.
    '''
    components = list(nx.connected_components(G.to_undirected()))
    print(sorted(len(component) for component in components))
    return components

CCS = run()

In [None]:
plt.figure(figsize = (12,12))
nx.draw_networkx(G.subgraph(max(nx.connected_components(G.to_undirected()), key=len)),
                 with_labels=False)
                 #labels=CHANNEL_NAMES,
                 #font_size=12, font_color = 'red')

In [None]:
nx.draw_networkx(G.subgraph(max(nx.connected_components(G.to_undirected()), key=len)))

# Exploring the data

In [None]:
def create_dataframe(channel_details_list):
    '''Build a DataFrame from flattened channel details with numeric count columns.

    The columns 'videoCount', 'commentCount', 'viewCount' and
    'subscriberCount' are converted to integers where they exist; values
    that are missing or not numeric become 0. A 'featuredChannelsCount'
    column holds the number of featured channels per row (0 when a row has
    none).

    Parameters:
        channel_details_list: list of flat channel dictionaries.

    Returns:
        A pandas DataFrame with one row per channel.
    '''
    df = pd.DataFrame(channel_details_list)

    # Not every channel reports every statistic, so convert only the columns
    # that exist and coerce missing values instead of failing on int(NaN)
    for column in ('videoCount', 'commentCount', 'viewCount', 'subscriberCount'):
        if column in df.columns:
            df[column] = (pd.to_numeric(df[column], errors='coerce')
                          .fillna(0)
                          .astype('int64'))

    if 'featuredChannelsUrls' in df.columns:
        # Rows without featured channels hold NaN rather than a list
        df['featuredChannelsCount'] = df['featuredChannelsUrls'].apply(
            lambda urls: len(urls) if isinstance(urls, (list, tuple, set)) else 0)
    else:
        df['featuredChannelsCount'] = 0
    return df

features = ['id','title','description','customUrl','publishedAt','country','isLinked', 'viewCount', 'commentCount', 'subscriberCount',
           'hiddenSubscriberCount','keywords','showRelatedChannels','featuredChannelsUrls', 'featuredChannelsCount']

DF = create_dataframe(CHANNEL_DETAILS_LIST)
DF.shape

In [None]:
DF.info()

In [None]:
DF[features].sort_values(by='subscriberCount', ascending = False).head(5)

In [None]:
sns.distplot(DF['subscriberCount'].map(lambda x: np.log(x+1)))

In [None]:
sns.distplot(DF['viewCount'].map(lambda x: np.log(x+1)))

In [None]:
plt.scatter(DF['subscriberCount'], DF['viewCount'])

In [None]:
plt.barh(DF['title'], DF['subscriberCount'])