# A) Importing Required Modules & Packages

In [1]:
import pandas as pd
import numpy as np
from dateutil import parser 

# Modules for Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

import time
import requests # To make API calls

* Installing Google API Client

In [2]:
pip install --upgrade google-api-python-client

Collecting google-api-python-client
  Downloading google_api_python_client-2.76.0-py2.py3-none-any.whl (11.0 MB)
Installing collected packages: google-api-python-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 2.74.0
    Uninstalling google-api-python-client-2.74.0:
      Successfully uninstalled google-api-python-client-2.74.0
Successfully installed google-api-python-client-2.76.0
Note: you may need to restart the kernel to use updated packages.


* Installing WordCloud Package

In [3]:
pip install wordcloud

Note: you may need to restart the kernel to use updated packages.


* Installing isodate Package

In [4]:
pip install isodate

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Importing Modules

# Google API Module
from googleapiclient.discovery import build

# Module to display JSON Object
from IPython.display import JSON 

# Module to Perform Date Operations

import isodate

# NLP Modules

import nltk
from wordcloud import WordCloud

In [6]:
# Youtube API Key
api_key='AIzaSyDFefM7HhGhV9XidV-X3CzEM7RAnqeEX00'

# B) Extracting Data from Youtube API

In [9]:
# Specifying Channel ID
channel_ids= ["UCYO_jab_esuFRV4b17AJtAw", # 3Blue 1Brown
             "UCNU_lfiiWBdtULKOw6X0Dig",  # Krish Naik
             "UCiT9RITQ9PW6BhXK0y2jaeg",  # Ken Jee
             "UCtYLUTtgS3k1Fg4y5tAhLbw", # StatQuest with Josh Starmer
             "UC2UXDak6o7rBm23k3Vv5dww", # Tina Huang
             "UC7cs8q-gJRlGwj4A8OmCmXg", # Alex the Analyst
             "UCFp1vaKzpfvoGai0vE5VJ0w", # Guy in a Cube 
            "UCLLw7jmFsvfIVaUFsLs8mlQ",  # Luke Barousse
            "UCCezIgC97PvUuR4_gbFUs5g", # Corey Schafer
            "UCfzlCWGWYyIQ0aLC5w48gBQ", # Sentdex
            ] 

In [10]:
# Creating an API Client
api_service_name = "youtube"
api_version = "v3"

youtube = build(
api_service_name, api_version, developerKey=api_key)

## Defining User Defined Functions to Extract Required Data

In [11]:
# User Defined Function to Extract Channel Statistics
def get_channel_stats(youtube, channel_ids):
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(Channel_Name = response['items'][i]['snippet']['title'],
                    Subscribers = response['items'][i]['statistics']['subscriberCount'],
                    Total_Views = response['items'][i]['statistics']['viewCount'],
                    Total_Videos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)
    
    return pd.DataFrame(all_data)

In [12]:
# User Defined Function to Extract Video Ids
def get_video_ids(youtube, playlist_id):
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids

In [13]:
# User Defined Function to Extract Video Statistics
def get_video_stats(youtube, video_ids):
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            video_stats = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']
            for k in video_stats.keys():
                for v in video_stats[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
    return all_video_info    

In [14]:
# User Defined Function to Extract Comments
def get_comments_in_videos(youtube, video_ids):
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # We use 'except' to consider situations when error occurs - most likely due to comments being disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     

## Extracting Data Using User-Defined Functions

In [15]:
# Extracting Channel Statistics
channel_stats = get_channel_stats(youtube,channel_ids)

In [16]:
# Printing Data Science Channel Statistics
channel_stats

Unnamed: 0,Channel_Name,Subscribers,Total_Views,Total_Videos,playlistId
0,Ken Jee,234000,7529810,253,UUiT9RITQ9PW6BhXK0y2jaeg
1,3Blue1Brown,4950000,319695906,127,UUYO_jab_esuFRV4b17AJtAw
2,Luke Barousse,305000,13824681,111,UULLw7jmFsvfIVaUFsLs8mlQ
3,Alex The Analyst,364000,14251266,192,UU7cs8q-gJRlGwj4A8OmCmXg
4,sentdex,1210000,107732602,1234,UUfzlCWGWYyIQ0aLC5w48gBQ
5,Guy in a Cube,321000,25044467,873,UUFp1vaKzpfvoGai0vE5VJ0w
6,Tina Huang,453000,18733093,116,UU2UXDak6o7rBm23k3Vv5dww
7,StatQuest with Josh Starmer,874000,45115018,237,UUtYLUTtgS3k1Fg4y5tAhLbw
8,Krish Naik,715000,69655370,1620,UUNU_lfiiWBdtULKOw6X0Dig
9,Corey Schafer,1050000,82339730,231,UUCezIgC97PvUuR4_gbFUs5g


In [17]:
# Checking Datatypes of the Variables
channel_stats.dtypes

Channel_Name    object
Subscribers     object
Total_Views     object
Total_Videos    object
playlistId      object
dtype: object

In [18]:
# Converting Variables to Numeric Datatype
num_cols=['Subscribers','Total_Views','Total_Videos']
channel_stats[num_cols]=channel_stats[num_cols].apply(pd.to_numeric,errors='coerce')

In [19]:
# Extracting Video Statistics & Comments

video_df = pd.DataFrame()
comments_df = pd.DataFrame()

for c in channel_stats['Channel_Name'].unique():
    print("Getting Video Information from the channel: " + c)
    playlist_id = channel_stats.loc[channel_stats['Channel_Name']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    # Extracting Video Data
    video_data = get_video_stats(youtube, video_ids)
    # Extracting Comments Data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # Appending Video data together and Comments data toghether
    video_df = video_df.append(video_data, ignore_index=True)
    comments_df = comments_df.append(comments_data, ignore_index=True)

Getting Video Information from the channel: Ken Jee
Getting Video Information from the channel: 3Blue1Brown
Getting Video Information from the channel: Luke Barousse
Getting Video Information from the channel: Alex The Analyst
Getting Video Information from the channel: sentdex
Could not get comments for video XfzbZm89z3o
Getting Video Information from the channel: Guy in a Cube
Getting Video Information from the channel: Tina Huang
Getting Video Information from the channel: StatQuest with Josh Starmer
Could not get comments for video BgxBEKhaqyQ
Getting Video Information from the channel: Krish Naik
Could not get comments for video sjq1OhMzSSU
Getting Video Information from the channel: Corey Schafer
Could not get comments for video ZB7c00QgmdQ


In [20]:
# Printing Video Data
video_df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,5AwbPAV8FS0,Ken Jee,How Much Money Does a Data Scientist Make? #sh...,#DataScience #KenJee \nWatch the Full Video He...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-02-03T12:00:03Z,2026,58,,2,PT40S,hd,False
1,A2OQgU0IRNo,Ken Jee,These 3 Things Make Data Science Special #shorts,#DataScience #KenJee \nWatch the full video he...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-02-01T12:00:12Z,1784,117,,3,PT59S,hd,False
2,x5Q_nrIqFik,Ken Jee,Will Data Science Be Automated? #shorts,#DataScience #KenJee \nFull Video: https://www...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-01-27T11:00:44Z,4570,166,,4,PT37S,hd,False
3,7ja2ELBVMlU,Ken Jee,Is Data Science A Good Fit For You? #shorts,#DataScience #KenJee \n\nFull Video: https://w...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-01-25T11:00:05Z,3860,160,,6,PT39S,hd,False
4,Z2MyLQa_vVw,Ken Jee,5 Books I Wish I Read Earlier In My Career,#datascience #KenJee \n\nThese 5 books change...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-01-23T15:31:48Z,5619,357,,44,PT3M10S,hd,False


In [21]:
# Printing Comments Data
comments_df.head()

Unnamed: 0,video_id,comments
0,5AwbPAV8FS0,"[Faker Scientist < ML Engineer, Could you make..."
1,A2OQgU0IRNo,"[Great short, Thanks for your advice, Thanks f..."
2,x5Q_nrIqFik,[Accounting is literally at high risk of autom...
3,7ja2ELBVMlU,"[What do you mean by number 1?, Number 1 is ac..."
4,Z2MyLQa_vVw,[Can you find the secret papaya we hid in the ...


# C) Pre-Processing & Feature Engineering

In [22]:
# Checking for Missing Values
video_df.isnull().sum()

video_id             0
channelTitle         0
title                0
description          0
tags               347
publishedAt          0
viewCount            0
likeCount            0
favouriteCount    4930
commentCount         1
duration             0
definition           0
caption              0
dtype: int64

In [23]:
# Checking for Missing Values
comments_df.isnull().sum()

video_id    0
comments    0
dtype: int64

In [25]:
# Dropping the Variable 'favouriteCount' with NULL values
video_df=video_df.drop(['favouriteCount'],axis=1)

In [26]:
# Sorting the DataFrame based on the dates the videos were Published
video_df.publishedAt.sort_values()

1916    2013-05-10T19:05:44Z
1915    2013-05-16T21:24:52Z
1914    2013-05-24T18:05:43Z
1913    2013-06-08T21:13:31Z
1912    2013-06-10T14:57:25Z
                ...         
380     2023-02-03T16:00:22Z
3141    2023-02-03T16:13:26Z
3140    2023-02-04T13:39:17Z
1917    2023-02-04T16:52:06Z
3139    2023-02-05T06:37:51Z
Name: publishedAt, Length: 4930, dtype: object

In [27]:
# Checking Datatypes of the Variables
video_df.dtypes

video_id        object
channelTitle    object
title           object
description     object
tags            object
publishedAt     object
viewCount       object
likeCount       object
commentCount    object
duration        object
definition      object
caption         object
dtype: object

In [28]:
# Converting Variables to Numeric Datatype
num_cols=['viewCount', 'likeCount', 'commentCount']
video_df[num_cols]=video_df[num_cols].apply(pd.to_numeric,errors='coerce')

### Feature Engineering

In [29]:
# Creating a Variable to store which day of the week the video was published
video_df['publishedAt'] =  video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['published_on'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A"))

In [30]:
# Creating a Variable to store the Duration of the Videos in Seconds
video_df['duration_in_Secs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['duration_in_Secs'] = video_df['duration_in_Secs'].astype('timedelta64[s]')

In [31]:
# Creating a Variable to store the Number of Tags
video_df['tagsCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

In [32]:
# Creating a Variable to Store the Length of the Title (in terms of number of characters)
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

In [33]:
# Creating a Variable to Store the Year (when the Video was published)
video_df['year'] = video_df['publishedAt'].dt.year

In [34]:
# Printing the DataFrame
video_df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,commentCount,duration,definition,caption,published_on,duration_in_Secs,tagsCount,titleLength,year
0,5AwbPAV8FS0,Ken Jee,How Much Money Does a Data Scientist Make? #sh...,#DataScience #KenJee \nWatch the Full Video He...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-02-03 12:00:03+00:00,2026,58,2.0,PT40S,hd,False,Friday,40.0,8,50,2023
1,A2OQgU0IRNo,Ken Jee,These 3 Things Make Data Science Special #shorts,#DataScience #KenJee \nWatch the full video he...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-02-01 12:00:12+00:00,1784,117,3.0,PT59S,hd,False,Wednesday,59.0,7,48,2023
2,x5Q_nrIqFik,Ken Jee,Will Data Science Be Automated? #shorts,#DataScience #KenJee \nFull Video: https://www...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-01-27 11:00:44+00:00,4570,166,4.0,PT37S,hd,False,Friday,37.0,9,39,2023
3,7ja2ELBVMlU,Ken Jee,Is Data Science A Good Fit For You? #shorts,#DataScience #KenJee \n\nFull Video: https://w...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-01-25 11:00:05+00:00,3860,160,6.0,PT39S,hd,False,Wednesday,39.0,9,43,2023
4,Z2MyLQa_vVw,Ken Jee,5 Books I Wish I Read Earlier In My Career,#datascience #KenJee \n\nThese 5 books change...,"[Data Science, Ken Jee, Machine Learning, Spor...",2023-01-23 15:31:48+00:00,5619,357,44.0,PT3M10S,hd,False,Monday,190.0,16,42,2023


In [35]:
# Summary of the DataFrame
video_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4930 entries, 0 to 4929
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype                  
---  ------            --------------  -----                  
 0   video_id          4930 non-null   object                 
 1   channelTitle      4930 non-null   object                 
 2   title             4930 non-null   object                 
 3   description       4930 non-null   object                 
 4   tags              4583 non-null   object                 
 5   publishedAt       4930 non-null   datetime64[ns, tzutc()]
 6   viewCount         4930 non-null   int64                  
 7   likeCount         4930 non-null   int64                  
 8   commentCount      4929 non-null   float64                
 9   duration          4930 non-null   object                 
 10  definition        4930 non-null   object                 
 11  caption           4930 non-null   object                 
 12  publis

In [36]:
# Exporting DataFrame as a .CSV file
video_df.to_csv('Data_Science_Channels.csv')

In [38]:
channel_stats.to_csv('Channel_Stats.csv')