# Generating a YouTube PDF Report

# Necessary packages 

In [349]:
# importing packages 

import numpy as np
import pandas as pd 

from datetime import datetime
import seaborn as sns
from matplotlib import pyplot as plt, dates as mdates
from wordcloud import WordCloud

from googleapiclient.discovery import build
from fpdf import FPDF

# Getting Youtube channel statistics with API 

In [359]:
# Id of the channel the report is generated for.
channel_id = 'UCq6XkhO5SZ66N04IcPbqNcw'

# Placeholder value: replace it with a real API key before running the notebook.
api_key = 'YOUR API KEY'

# Client object for version 'v3' of the 'youtube' service, authenticated with api_key.
youtube = build('youtube', 'v3', developerKey=api_key)

In [361]:
def _to_number(container, key):
    """Return ``container[key]`` converted to a number, or NaN if the key is absent.

    The API response omits some counters entirely (for example when likes are
    hidden or comments are disabled), so a plain ``container[key]`` lookup
    would raise KeyError for those videos/channels.
    """
    raw = container.get(key)
    return np.nan if raw is None else pd.to_numeric(raw)


def _list_upload_ids(youtube, playlist_id):
    """Return the video ids of every item in ``playlist_id``, following pagination."""
    video_ids = []
    page_token = None
    while True:
        params = dict(part='contentDetails', playlistId=playlist_id, maxResults=50)
        # Only send pageToken once the API has handed us one.
        if page_token is not None:
            params['pageToken'] = page_token
        response = youtube.playlistItems().list(**params).execute()

        video_ids.extend(item['contentDetails']['videoId']
                         for item in response.get('items', []))

        page_token = response.get('nextPageToken')
        if page_token is None:
            return video_ids


def get_info(youtube, channel_id):
    """Fetch general channel information and per-video statistics.

    Parameters
    ----------
    youtube : API client exposing ``channels()``, ``playlistItems()`` and
        ``videos()`` (as returned by ``build('youtube', 'v3', ...)``).
    channel_id : str
        Id of the channel to inspect.

    Returns
    -------
    (general_info, panda_all_video_stats)
        ``general_info`` is a dict with the keys ``Date_of_extraction``,
        ``Channel_name``, ``Date_created``, ``Subscribers``, ``Views`` and
        ``Number_of_videos``.
        ``panda_all_video_stats`` is a DataFrame with the columns ``Title``,
        ``Published_date`` (timezone-naive, truncated to the day), ``Views``,
        ``Likes``, ``Comments`` and ``Tags``. The columns are always present,
        even when the channel has no videos or no video has tags; counters
        missing from the API response are stored as NaN.

    Raises
    ------
    KeyError
        If the API returns no channel for ``channel_id``.
    """
    # requesting general info about the channel
    request = youtube.channels().list(
                        part="snippet,contentDetails,statistics,topicDetails,status",
                        id=channel_id)
    response = request.execute()

    items = response.get('items')
    if not items:
        raise KeyError(f"no channel found for id {channel_id!r}")
    channel = items[0]

    # dictionary with the needed information (+ changing formats of data)
    general_info = dict(Date_of_extraction = datetime.now().strftime("%d/%m/%Y %H:%M"),
               Channel_name = channel['snippet']['title'],
               Date_created = channel['snippet']['publishedAt'],
               Subscribers = _to_number(channel['statistics'], 'subscriberCount'),
               Views = _to_number(channel['statistics'], 'viewCount'),
               Number_of_videos = _to_number(channel['statistics'], 'videoCount'))

    # the "uploads" playlist lists every video of the channel
    playlist_id = channel['contentDetails']['relatedPlaylists']['uploads']
    video_ids = _list_upload_ids(youtube, playlist_id)

    # with each video id, video statistics can be requested (50 ids per call)
    all_video_stats = []
    for start in range(0, len(video_ids), 50):
        request = youtube.videos().list(
                    part='snippet,statistics',
                    id=','.join(video_ids[start:start + 50]))
        response = request.execute()

        for video in response.get('items', []):
            snippet = video['snippet']
            statistics = video.get('statistics', {})
            video_stats = dict(Title = snippet['title'],
                               Published_date = pd.to_datetime(snippet['publishedAt']),
                               Views = _to_number(statistics, 'viewCount'),
                               Likes = _to_number(statistics, 'likeCount'),
                               Comments = _to_number(statistics, 'commentCount'))
            # not every video has tags
            if 'tags' in snippet:
                video_stats['Tags'] = snippet['tags']

            all_video_stats.append(video_stats)

    # Explicit columns keep the frame's shape stable: 'Tags' (and every other
    # column) exists even if no video supplied it; missing cells become NaN.
    panda_all_video_stats = pd.DataFrame(
        all_video_stats,
        columns=['Title', 'Published_date', 'Views', 'Likes', 'Comments', 'Tags'])

    # utc=True yields a tz-aware column even for an empty frame, so dropping
    # the timezone afterwards always works.
    panda_all_video_stats['Published_date'] = (
        pd.to_datetime(panda_all_video_stats['Published_date'], utc=True)
          .dt.tz_localize(None)
          .dt.normalize())

    return general_info, panda_all_video_stats

In [362]:
# Fetch the channel summary (info) and the per-video DataFrame (vid_info) once;
# the report functions defined below use these two names as default arguments.
info, vid_info = get_info(youtube, channel_id)

{'Date_of_extraction': '20/07/2022 09:06',
 'Channel_name': 'Keith Galli',
 'Date_created': '2016-12-27T20:18:51Z',
 'Subscribers': 166000,
 'Views': 11175307,
 'Number_of_videos': 74}

# Creating PDF

In [363]:
# title
def title(pdf):
    """Write the report heading to ``pdf`` in bold Times, size 24.

    A line break of height 10 is inserted before and after the heading.
    """
    heading = 'Report Analysis of Youtube Channel'
    gap = 10

    pdf.set_font('Times', 'B', 24)
    pdf.ln(gap)
    pdf.write(5, heading)
    pdf.ln(gap)

In [None]:
#general statistics     
def main_stat(pdf, info = info):
    """Write the general channel statistics to ``pdf``.

    ``info`` is a dict with the keys ``Date_of_extraction``, ``Channel_name``,
    ``Date_created``, ``Subscribers``, ``Views`` and ``Number_of_videos``.
    Its default is the module-level ``info`` as it existed when this function
    was defined.
    """
    # extraction timestamp, in italics
    pdf.set_font('Times', 'I', 12)
    pdf.write(5, 'The data was extracted on ' + str(info['Date_of_extraction']))
    pdf.ln(9)

    # one line per statistic: (text, height of the line break that follows it)
    rows = [
        ('Name of the Channel: ' + str(info['Channel_name']), 6),
        # only the first 10 characters of the creation timestamp are shown
        ('The Creation Date: ' + str(info['Date_created'][:10]), 6),
        ('Total Subscribers: ' + str(info['Subscribers']), 6),
        ('Total Amount of Views: ' + str(info['Views']), 6),
        ('Total Number of Videos: ' + str(info['Number_of_videos']), 10),
    ]

    pdf.set_font('Times', '', 12)
    for text, break_height in rows:
        pdf.write(5, text)
        pdf.ln(break_height)

In [None]:
#top 10 videos views 
def view_graph(pdf, vid_info = vid_info):
    """Add a bar chart of the 10 most-viewed videos to ``pdf``.

    ``vid_info`` must provide the columns ``Views`` and ``Title``; its default
    is the module-level ``vid_info`` as it existed when this function was
    defined. The chart is saved as ``chart1.png`` in the working directory and
    then placed on the current page.
    """
    # Width used by the other chart functions in this file when placing
    # images. The original referenced an undefined name WIDTH here.
    page_width = 210

    #creating graph
    top_10_view = vid_info.sort_values('Views', ascending = False).head(10)
    chart1 = sns.barplot(x = top_10_view['Views'], y = top_10_view['Title'], color = 'red')
    chart1.set(title = 'Top 10 Videos by Views')
    plt.savefig('chart1.png',  bbox_inches='tight')
    plt.close()

    #placing it on the page
    pdf.image('chart1.png', x = 0, w = page_width)
    pdf.ln(2)

In [None]:
def like_graph(pdf, vid_info = vid_info):
    """Add a bar chart of the 10 most-liked videos to ``pdf``.

    ``vid_info`` must provide the columns ``Likes`` and ``Title``; its default
    is the module-level ``vid_info`` as it existed when this function was
    defined. The chart is saved as ``chart2.png`` in the working directory and
    then placed on the current page.
    """
    # Width used by the other chart functions in this file when placing
    # images. The original referenced an undefined name WIDTH here.
    page_width = 210

    #creating graph
    top_10_likes = vid_info.sort_values('Likes', ascending = False).head(10)
    chart2 = sns.barplot(x = top_10_likes['Likes'], y = top_10_likes['Title'], color = 'red')
    chart2.set(title = 'Top 10 Videos by Likes')
    plt.savefig('chart2.png',  bbox_inches='tight')
    plt.close()

    #placing it on the page
    pdf.image('chart2.png', x = 0, w = page_width)
    pdf.ln(5)

In [None]:
def comm_graph(pdf, vid_info = vid_info):
    """Add a bar chart of the 10 most-commented videos to ``pdf``.

    ``vid_info`` must provide the columns ``Comments`` and ``Title``; its
    default is the module-level ``vid_info`` as it existed when this function
    was defined. The chart is saved as ``chart3.png`` in the working directory
    and placed at a fixed vertical position (y = 225) on the current page.
    """
    #creating graph
    top_10_comments = vid_info.sort_values('Comments', ascending = False).head(10).reset_index(drop = True)
    chart3 = sns.barplot(x = top_10_comments['Comments'], y = top_10_comments['Title'], color = 'red')
    chart3.set(title = 'Top 10 Videos by Comments')
    plt.savefig('chart3.png',  bbox_inches='tight')
    # One close() releases the current figure. The former extra
    # plt.close('fig3') targeted a label no figure in this file ever gets.
    plt.close()

    #placing it on the page
    pdf.image('chart3.png', x = 0, y = 225, w = 210)

In [364]:
def _monthly_video_counts(published_dates):
    """Return the number of videos per calendar month, with 0 for idle months.

    The result is indexed by every month from the earliest to the latest
    upload, both inclusive.
    """
    months = published_dates.dt.to_period('M')
    per_month = published_dates.groupby(months).size()
    # A period range built from the months themselves always contains the
    # first month; date_range(..., freq='MS') would skip it unless the first
    # upload happened on the 1st.
    every_month = pd.period_range(months.min(), months.max(), freq='M')
    return per_month.reindex(every_month, fill_value=0)


def month_graph(pdf, vid_info = vid_info):
    """Add a bar chart of the number of videos published per month to ``pdf``.

    ``vid_info`` must provide a datetime column ``Published_date``; its default
    is the module-level ``vid_info`` as it existed when this function was
    defined. ``vid_info`` is not modified. Months without uploads are shown as
    0 and are included in the dashed mean line. The chart is saved as
    ``chart4.png`` in the working directory and then placed on the current page.
    """
    #frequency of posting videos [how active the youtuber was in each month]
    counts = _monthly_video_counts(vid_info['Published_date'])
    positions = np.arange(len(counts))

    #plotting the graph
    fig, ax = plt.subplots(1,1)
    fig.set_figheight(5)
    fig.set_figwidth(20)

    #plotting bars
    ax.bar(positions, counts.to_numpy(), align='center', color = 'red', width = 0.4)
    #plotting mean line
    ax.plot(positions, [counts.mean()] * len(counts), color = 'black', linestyle = '--', label = 'mean')

    #formatting
    ax.legend(loc='upper right')
    ax.set_xticks(positions)
    ax.set_xticklabels(counts.index.astype(str))
    ax.set_ylabel('Number of videos')
    ax.set_title('Number of videos that Youtuber posted every month')
    plt.xticks(rotation = 90)
    plt.savefig('chart4.png',  bbox_inches='tight')
    plt.close()

    pdf.image('chart4.png', 0, w = 210)
    pdf.ln(5)

In [365]:
#creating WordCloud of tags 
def _tag_lists(vid_info):
    """Return the tag list of every video that has one; other videos are skipped."""
    if 'Tags' not in vid_info.columns:
        return []
    # Videos without tags hold NaN/None in this column, not a list.
    return [tags for tags in vid_info['Tags'] if isinstance(tags, (list, tuple))]


def tag_stat(pdf, vid_info = vid_info):
    """Add the tag summary, a tag word cloud and a tags/no-tags pie chart to ``pdf``.

    ``vid_info`` is the per-video DataFrame; its default is the module-level
    ``vid_info`` as it existed when this function was defined. ``vid_info`` is
    not modified. The reported total is the number of tags over all videos,
    and the average is taken over the videos that have tags. The word cloud is
    skipped when no video has tags, and the pie chart is skipped when there
    are no videos. Charts are saved as ``chart5.png`` and ``chart6.png`` in the
    working directory.
    """
    tagged = _tag_lists(vid_info)
    videos_with_tags = len(tagged)
    total_videos = len(vid_info)

    #flat list of every tag from all videos
    tags = [tag for video_tags in tagged for tag in video_tags]
    count_tags = len(tags)
    average = count_tags / videos_with_tags if videos_with_tags else 0

    pdf.set_font('Times', 'B', 12)
    pdf.write(5, 'Tags:')
    pdf.ln(6)
    pdf.set_font('Times', '', 12)
    pdf.write(5, 'Youtuber has used in total ' + str(count_tags) + ' tags for all videos. Average amount of tags (for videos with tags) is ' + str(round(average, 2)))

    if tags:
        #counting tags: dictionary of unique tags and how often each occurs
        values, counts = np.unique(tags, return_counts=True)
        dic_tags = dict(zip(values, counts))

        #creating wordcloud
        wordcloud = WordCloud(width = 600, height = 300, colormap='Reds', background_color = 'white',collocations=False,
                              max_words = len(dic_tags)).generate_from_frequencies(dic_tags)

        #saving wordcloud
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.savefig('chart5.png',  bbox_inches='tight')
        plt.close()
        #placing it on the page
        pdf.image('chart5.png', 10,90, h = 80,  w = 210/2+10)

    if total_videos:
        #pie chart of tags/no tags
        y = np.array([videos_with_tags, total_videos - videos_with_tags])
        plt.pie(y, colors = ['tab:red','darkred'])
        plt.legend(['Tags','No Tags'], loc="lower right")
        plt.title('Proportion of Videos with/without Tags')
        plt.savefig('chart6.png',  bbox_inches='tight')
        plt.close()
        pdf.image('chart6.png', 130,95, h = 70,  w = 70)

In [366]:
# creating pdf 
def create_report(filename = 'Youtube channel report.pdf'):
    """Build the two-page channel report and write it to ``filename``.

    The first page holds the title, the general statistics and the three
    top-10 charts; the second page holds the monthly upload chart and the
    tag statistics.
    """
    pages = (
        (title, main_stat, view_graph, like_graph, comm_graph),   # first page
        (month_graph, tag_stat),                                  # second page
    )

    report = FPDF()
    for sections in pages:
        report.add_page()
        for render_section in sections:
            render_section(report)

    report.output(filename)

In [367]:
# Generate the report using the default filename 'Youtube channel report.pdf'.
create_report()