In [None]:
import os
import numpy as np
import pandas as pd
from IPython.display import JSON
from dateutil import parser
import isodate

# Environment file
from dotenv import load_dotenv
load_dotenv()

# Google API client (used to build the YouTube Data API client)
from googleapiclient.discovery import build

# Data visualisation packages
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Natural Language Processing
import ssl
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

# SSL verification is switched off ONLY while the NLTK corpora are downloaded
# (presumably a workaround for local certificate errors -- delete this workaround
# if the downloads succeed without it). The previous HTTPS context factory is
# saved and restored in a `finally`, so verification is re-enabled even when a
# download raises.
_previous_https_context = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context
try:
    nltk.download('stopwords')
    nltk.download('punkt')
finally:
    ssl._create_default_https_context = _previous_https_context

# Data Gathering using Youtube API

In [None]:
# YouTube API key, read from the environment variable 'api_key' (populated by
# load_dotenv() in the imports cell). os.getenv returns None when it is not set.
api_key = os.getenv('api_key')

# Known channel IDs:
# Pewdiepie youtube Channel ID = UC-lHJZR3Gqxm24_Vd_AJ5Yw
# Ali-Abdaal youtube Channel ID = UCoOae5nYA7VqaXzerajD0lg
# David Dobrik youtube Channel ID = UCmh5gdwCx6lN7gEC20leNVA
channel_ids = ['UCmh5gdwCx6lN7gEC20leNVA',
               # more channels can be added here if you want
               ]

# Service name and version passed to googleapiclient's build() to select the
# YouTube Data API v3.

api_service_name = "youtube"
api_version = "v3"

# Create the API client, authenticated with the developer key above.
# This `youtube` object is passed to every helper function below.
youtube = build(
    api_service_name, api_version, developerKey=api_key)

In [None]:
# Function that fetches the statistics of every channel and stores them in a DataFrame
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:

    youtube: the build object from googleapiclient.discovery
    channel_ids: list of channel IDs (a single ID given as a string is also accepted)

    Returns:
    Dataframe with one row per channel returned by the API and the columns
    'ChannelName', 'subscribers', 'views', 'totalVideos', 'playlistID'.
    Values are stored exactly as returned by the API (no numeric conversion).
    A field missing from the response (e.g. a statistic the API did not return
    for that channel) is stored as None. When no channel is returned at all,
    the dataframe is empty but still has these columns.
    """
    columns = ['ChannelName', 'subscribers', 'views', 'totalVideos', 'playlistID']

    # A bare string would otherwise be joined character by character below.
    if isinstance(channel_ids, str):
        channel_ids = [channel_ids]
    channel_ids = list(channel_ids)

    all_data = []

    # IDs are sent in batches of 50, the same batch size get_video_details uses.
    # With 50 IDs or fewer this is a single request, as before.
    for start in range(0, len(channel_ids), 50):
        request = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=','.join(channel_ids[start:start + 50])
        )
        response = request.execute()

        # 'items' can be absent when none of the requested IDs matched a channel.
        for item in response.get('items', []):
            snippet = item.get('snippet', {})
            statistics = item.get('statistics', {})
            playlists = item.get('contentDetails', {}).get('relatedPlaylists', {})

            all_data.append({
                'ChannelName': snippet.get('title'),
                'subscribers': statistics.get('subscriberCount'),
                'views': statistics.get('viewCount'),
                'totalVideos': statistics.get('videoCount'),
                'playlistID': playlists.get('uploads'),
            })

    return pd.DataFrame(all_data, columns=columns)

def get_video_ids(youtube, playlist_id):
    """
    Collect the video IDs of every item in a playlist.
    Params:

    youtube: the build object from googleapiclient.discovery
    playlist_id: ID of the playlist to walk through

    Returns:
    List of video IDs, in the order the API returned them across all pages
    """
    collected_ids = []
    page_token = None

    # Each request asks for 50 items (maxResults=50); keep following
    # 'nextPageToken' until the response no longer carries one.
    while True:
        list_kwargs = {
            'part': "snippet,contentDetails",
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        # The first request is sent without a page token.
        if page_token is not None:
            list_kwargs['pageToken'] = page_token

        page = youtube.playlistItems().list(**list_kwargs).execute()
        collected_ids.extend(entry['contentDetails']['videoId'] for entry in page['items'])

        page_token = page.get('nextPageToken')
        if page_token is None:
            break

    return collected_ids


def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:

    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs

    Returns:
    Dataframe with one row per video returned by the API and the columns:
        'video_id',
        'channelTitle', 'title', 'description', 'tags', 'publishedAt',
        'viewCount', 'likeCount', 'favouriteCount', 'commentCount',
        'duration', 'definition', 'caption'
    Values are stored as returned by the API; a field that is missing for a
    video is stored as None. With no videos the dataframe is empty but still
    has these columns.

    Note: the 'favouriteCount' column keeps its existing (British) spelling so
    that code selecting that column keeps working, but it is now filled from
    the API field 'favoriteCount'. Previously the misspelt key was looked up in
    the response, so the column was always None.
    """
    # Output column -> (part of the video resource, field name inside that part).
    field_sources = {
        'channelTitle': ('snippet', 'channelTitle'),
        'title': ('snippet', 'title'),
        'description': ('snippet', 'description'),
        'tags': ('snippet', 'tags'),
        'publishedAt': ('snippet', 'publishedAt'),
        'viewCount': ('statistics', 'viewCount'),
        'likeCount': ('statistics', 'likeCount'),
        'favouriteCount': ('statistics', 'favoriteCount'),
        'commentCount': ('statistics', 'commentCount'),
        'duration': ('contentDetails', 'duration'),
        'definition': ('contentDetails', 'definition'),
        'caption': ('contentDetails', 'caption'),
    }
    columns = ['video_id'] + list(field_sources)

    video_ids = list(video_ids)
    all_video_info = []

    # The IDs are requested in batches of 50 per call.
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i + 50])
        )
        response = request.execute()

        for video in response.get('items', []):
            video_info = {'video_id': video['id']}
            for column, (part, field) in field_sources.items():
                # Missing parts/fields become None instead of raising.
                video_info[column] = (video.get(part) or {}).get(field)
            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info, columns=columns)


def get_comments_in_videos(youtube, video_ids, max_comments=10):
    """
    Get top level comments as text from all videos with given IDs
    (only the first `max_comments` comments per video, to save API quota)
    Params:

    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    max_comments: maximum number of top level comments kept per video
        (default 10). Only one page is requested per video and the requested
        page size is capped at 100, so larger values return at most 100.

    Returns:
    Dataframe with the columns 'video_id' and 'comments' (a list of comment
    texts). Videos whose comments could not be fetched are skipped and
    reported on stdout. With no successful video the dataframe is empty but
    still has both columns.
    """
    columns = ['video_id', 'comments']
    # Requested page size, kept within 1..100.
    page_size = max(1, min(int(max_comments), 100))

    all_comments = []

    for video_id in video_ids:
        try:
            # Only the thread snippet is read below, so replies are not requested.
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=page_size
            )
            response = request.execute()

            comments_in_video = [
                comment['snippet']['topLevelComment']['snippet']['textOriginal']
                for comment in response.get('items', [])[:max_comments]
            ]
            all_comments.append({'video_id': video_id, 'comments': comments_in_video})

        except Exception as exc:
            # Best effort: skip the video (most likely comments are disabled) but
            # show the actual reason. `Exception` rather than a bare `except`, so
            # that interrupting the kernel still stops this long-running loop.
            print('Could not get comments for video ' + video_id + ': ' + repr(exc))

    return pd.DataFrame(all_comments, columns=columns)

## Using the functions to get the channel stats

In [None]:
channel_stats = get_channel_stats(youtube, channel_ids)

Displaying the `channel_stats` DataFrame to view what has been pulled

In [None]:
channel_stats

Choosing a playlist from the overview

In [None]:
# Choose the playlist you would like to analyse.
# The value below is an uploads-playlist ID, not a channel ID: it is the Ali-Abdaal
# channel ID from the configuration cell with the leading 'UC' replaced by 'UU'.
# The 'playlistID' column of channel_stats holds the uploads playlist of each channel.
# NOTE(review): channel_ids currently contains only the David Dobrik channel, so the
# channel overview above and the videos analysed below belong to different channels
# -- confirm this is intended.
playlist_id = 'UUoOae5nYA7VqaXzerajD0lg'

In [None]:
# Create a DataFrame with the video stats and one with the comments of all videos in the playlist
video_ids = get_video_ids(youtube, playlist_id)
video_df = get_video_details(youtube, video_ids)

# Fetching the comments takes the longest (one API request per video); comment the
# next line out to save time. Note that the comments word cloud cell needs comments_df.
comments_df = get_comments_in_videos(youtube, video_ids)

In [None]:
video_df.head()

In [None]:
comments_df.head()

In [None]:
# Write the video and comment data to CSV files (relative to the current working
# directory) for future reference.
# NOTE(review): list-valued columns ('tags', 'comments') are presumably written as
# their string representation -- confirm before relying on a reload of these files.
video_df.to_csv('video_data.csv')
comments_df.to_csv('comments_data.csv')

## Data pre-processing & Feature Engineering

In [None]:
video_df.isnull().any()

In [None]:
video_df.dtypes

In [None]:
# Convert the count columns from object dtype to numeric.
# pd.to_numeric is applied once per column (the default axis) instead of once per
# row: that is far fewer Python-level calls and lets each column get its own dtype.
# Values that cannot be parsed (or are missing) become NaN via errors='coerce'.
numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Parse the publication timestamp and derive the day of the week it falls on.
# Only strings are parsed, so re-running this cell (when the column already holds
# parsed datetimes) or a missing value no longer raises.
video_df['publishedAt'] = video_df['publishedAt'].apply(
    lambda x: parser.parse(x) if isinstance(x, str) else x
)
# Day name (e.g. 'Monday'); left empty where the publication date is missing.
video_df['publishedDayName'] = video_df['publishedAt'].apply(
    lambda x: x.strftime("%A") if pd.notna(x) else None
)

In [None]:
# Number of tags per video; a video without tags (None) counts as 0.
video_df['tagCount'] = video_df['tags'].map(
    lambda tags: len(tags) if tags is not None else 0
)

In [None]:
# Likes and comments per 1000 views.
video_df['likeRatio'] = video_df['likeCount'].div(video_df['viewCount']).mul(1000)
video_df['commentRatio'] = video_df['commentCount'].div(video_df['viewCount']).mul(1000)

In [None]:
# Title length in characters. The vectorised .str.len() yields NaN for a missing
# title instead of raising TypeError the way len(None) does.
video_df['titleLength'] = video_df['title'].str.len()

In [None]:
# Convert the duration from its encoded string form into a number of seconds.
# isodate (imported in the first cell) parses the string into a duration object and
# .total_seconds() already returns a plain float, so no further timedelta
# conversion is needed. Missing durations become NaN instead of raising.
video_df['durationSecs'] = video_df['duration'].apply(
    lambda x: isodate.parse_duration(x).total_seconds() if isinstance(x, str) else np.nan
)

In [None]:
# Checking the output to see if the duration was converted to seconds
video_df[['durationSecs', 'duration']]

In [None]:
video_df.head()

# Exploratory Data Analysis

### Best Performing Videos:

In [None]:
# Bar chart of the 9 most viewed videos.
fig, ax = plt.subplots()
top_videos = video_df.sort_values('viewCount', ascending=False).head(9)
sns.barplot(x='title', y='viewCount', data=top_videos, ax=ax)
# Rotate the (long) video titles so they do not overlap.
ax.tick_params(axis='x', rotation=90)
# Show the view counts in thousands, e.g. 1,500K.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.0f}'.format(x/1000) + 'K'))
ax.set(title='Best performing videos', xlabel='Video title', ylabel='Views');

### Worst Performing Videos

In [None]:
# Bar chart of the 9 least viewed videos.
fig, ax = plt.subplots()
bottom_videos = video_df.sort_values('viewCount', ascending=True).head(9)
sns.barplot(x='title', y='viewCount', data=bottom_videos, ax=ax)
# Rotate the (long) video titles so they do not overlap.
ax.tick_params(axis='x', rotation=90)
# Show the view counts in thousands, e.g. 15K.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.0f}'.format(x/1000) + 'K'))
ax.set(title='Worst performing videos', xlabel='Video title', ylabel='Views');

## View Distribution Per Video

In [None]:
# Distribution of views per video, one violin per channel title.
sns.violinplot(data=video_df, x='channelTitle', y='viewCount')


## Views vs Likes and Comments

In [None]:
# Two side-by-side scatter plots: views against comments (left) and likes (right).
fig, ax = plt.subplots(nrows=1, ncols=2)
left_panel, right_panel = ax
sns.scatterplot(x='commentCount', y='viewCount', data=video_df, ax=left_panel)
sns.scatterplot(x='likeCount', y='viewCount', data=video_df, ax=right_panel)

## Video Duration

In [None]:
# Histogram of the video durations in seconds, split into 30 bins.
sns.histplot(video_df, x='durationSecs', bins=30)

### Title length vs views
#### Checking to see if the title length is an indicator of views


In [None]:

# Title length against views, one point per video.
# The previous version forced a y tick every 100 views and inverted the y axis.
# On a view-count axis that asks matplotlib for a tick per 100 views (unreadable
# and slow) and puts the most viewed videos at the bottom. The default locator
# with a thousands formatter is used instead.
ax = sns.scatterplot(x='titleLength', y='viewCount', data=video_df)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.0f}'.format(x/1000) + 'K'))
ax.set(xlabel='Title length (characters)', ylabel='Views');

## Wordcloud for video Titles

In [None]:

# Remove English stop words from every title.
stop_words = set(stopwords.words('english'))
# The stop-word list is lower-case, so each token is lower-cased for the lookup
# (otherwise capitalised words such as "The" or "I" are never removed). The
# original casing of the kept words is preserved.
video_df['title_no_stopwords'] = video_df['title'].apply(
    lambda x: [item for item in str(x).split() if item.lower() not in stop_words]
)

# Flatten the per-title word lists into one string for the word cloud.
all_words = [word for words in video_df['title_no_stopwords'] for word in words]
all_words_str = ' '.join(all_words)

def plot_cloud(wordcloud):
    """Draw the given word cloud image on a new 30x20 figure with the axes hidden."""
    canvas_size = (30, 20)
    plt.figure(figsize=canvas_size)
    plt.imshow(wordcloud)
    plt.axis("off")

# Build the word cloud from the filtered title words and draw it.
# random_state is fixed so the layout is reproducible between runs.
cloud_options = dict(
    width=2000,
    height=1000,
    random_state=1,
    background_color='black',
    colormap='viridis',
    collocations=False,
)
wordcloud = WordCloud(**cloud_options).generate(all_words_str)
plot_cloud(wordcloud)


## Upload Schedule

In [None]:
# Number of uploads per weekday, shown in calendar order.
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_df = (
    video_df['publishedDayName']
    .value_counts()
    # Weekdays without any upload get an explicit 0 instead of NaN.
    .reindex(weekdays, fill_value=0)
    # Fixed column name, independent of how value_counts() names its result.
    .rename('Count')
    .to_frame()
)
ax = day_df.plot.bar(rot=0, legend=False)
ax.set_xlabel('Day')
ax.set_ylabel('Count');



## WordCloud for Video Comments

In [None]:
# Remove English stop words from the comments of every video.
stop_words = set(stopwords.words('english'))
# Each 'comments' cell holds a LIST of comment texts. Every comment is split on its
# own: calling str() on the whole list would leave list punctuation such as "['" and
# "'," glued to the words. Tokens are lower-cased for the stop-word lookup only, so
# the kept words retain their original casing.
comments_df['comments_no_stopwords'] = comments_df['comments'].apply(
    lambda x: [
        item
        for comment in (x if isinstance(x, list) else [x])
        for item in str(comment).split()
        if item.lower() not in stop_words
    ]
)

# Flatten the per-video word lists into one string for the word cloud.
all_words = [word for words in comments_df['comments_no_stopwords'] for word in words]
all_words_str = ' '.join(all_words)

wordcloud = WordCloud(width = 2000, height = 1000, random_state=1, background_color='black',
                      colormap='viridis', collocations=False).generate(all_words_str)
plot_cloud(wordcloud)