In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sqlmodel import create_engine, SQLModel, Session
from dotenv import load_dotenv

load_dotenv()

db_host = os.environ.get("DB_HOST")
db_user = os.environ.get("DB_USER")
db_password = os.environ.get("DB_PASSWORD")
db_database = os.environ.get("DB_DATABASE")
db_port = os.environ.get("DB_PORT")


postgres_uri = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_database}"

engine = create_engine(postgres_uri, echo=False)

videosTable = "youtube_videos"
videoStatsTable = "youtube_video_stats"
channelsTable = "youtube_channels"
statsTable = "youtube_stats"

## Video scraping stats
- Number of videos at about page
- Number of videos scrapped
- Number of videos with stats


In [30]:
query = f'''
SELECT 
    {channelsTable}.*,
    MAX({statsTable}.videos) as about_videos,
    COUNT(DISTINCT {videosTable}.uuid) as n_videos,
    COUNT(DISTINCT {videoStatsTable}.uuid) as n_stats,
    COUNT(DISTINCT CASE WHEN {videoStatsTable}.uuid IS NULL THEN {videosTable}.uuid ELSE NULL END) as videos_no_stat,
    (MAX({statsTable}.videos) - COUNT(DISTINCT {videosTable}.uuid)) as not_found_videos
from {videosTable}
LEFT JOIN {channelsTable} ON {channelsTable}.uuid = {videosTable}.youtube_channel
LEFT JOIN {statsTable} ON {statsTable}.youtube_channel = {channelsTable}.uuid
LEFT JOIN {videoStatsTable} ON {videoStatsTable}.video = {videosTable}.uuid
GROUP BY
    {channelsTable}.uuid
'''
df = pd.read_sql(query, engine, index_col="uuid")
df

Unnamed: 0_level_0,username,about_videos,n_videos,n_stats,videos_no_stat,not_found_videos
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1775cf1c-7f48-434b-89b7-6e6fde8c2dfb,ProgramadorX,164.0,146,146,0,18.0
67c82473-cf02-45c2-8022-834ba436e1ff,CarlosAzaustre,405.0,232,124,136,173.0
6e306e97-463f-4ba2-8d64-fa0c39f3965f,CodelyTV,554.0,353,117,236,201.0
7b0dc0ff-8d42-4e5f-94c0-536bf4a9d428,AMPTech,92.0,79,85,0,13.0
83d7f202-88b1-4761-a32a-fab990e18227,mouredev,507.0,335,335,0,172.0
844f783d-93e0-4451-8ea2-30054b3742f5,HolaMundoDev,334.0,260,260,0,74.0
d97bb78c-1c14-4bb3-a921-728222390e1f,BettaTech,167.0,153,153,0,14.0


## Channel stats
- Subs
- Time from start
- Views / total subs
- View / total videos
- Video with more views
- Video with more likes
- Video with more comments
- Avg views
- Avg likes
- Avg comments
- Video with less views
- Video with less likes
- Video with less comments

In [31]:
videoStatsQuery = f'''
SELECT
    {videoStatsTable}.video,
    MAX({videoStatsTable}.views) as views,
    MAX({videoStatsTable}.likes) as likes,
    MAX({videoStatsTable}.comments) as comments
FROM {videoStatsTable}
WHERE {videoStatsTable}.timestamp = (
    SELECT MAX(timestamp) FROM {videoStatsTable} AS latest
    WHERE latest.video = {videoStatsTable}.video
)
GROUP BY {videoStatsTable}.video
'''


channelStatsQuery = f'''
SELECT
    {statsTable}.youtube_channel,
    MAX({statsTable}.subs) as subs,
    MAX({statsTable}.videos) as about_videos
FROM {statsTable}
WHERE {statsTable}.timestamp = (
    SELECT MAX(timestamp) FROM {statsTable} AS latest
    WHERE latest.youtube_channel = {statsTable}.youtube_channel
)
GROUP BY {statsTable}.youtube_channel
'''




query = f'''
SELECT 
    {channelsTable}.username,
    COUNT({videosTable}.uuid) as videos,
    AVG(channel_stats.about_videos) as about_videos,
    AVG(channel_stats.subs) as subs,
    SUM(latest_video_stats.views) as views,
    SUM(latest_video_stats.likes) as likes,
    SUM(latest_video_stats.comments) as comments,
    AVG(latest_video_stats.views) as avg_views,
    AVG(latest_video_stats.likes) as avg_likes,
    AVG(latest_video_stats.comments) as avg_comments,
    MIN(latest_video_stats.views) as min_views,
    MIN(latest_video_stats.likes) as min_likes,
    MIN(latest_video_stats.comments) as min_comments,
    (SUM(latest_video_stats.views)/AVG(channel_stats.subs)) as views_per_sub,
    (SUM(latest_video_stats.likes) / SUM(latest_video_stats.views)) as likes_per_views,
    (SUM(latest_video_stats.comments) / SUM(latest_video_stats.views)) as comments_per_views
FROM {videosTable}
    LEFT JOIN {channelsTable} ON {channelsTable}.uuid = {videosTable}.youtube_channel
    LEFT JOIN (
        {videoStatsQuery}
    ) latest_video_stats ON latest_video_stats.video = {videosTable}.uuid
    LEFT JOIN (
        {channelStatsQuery}
    ) channel_stats ON channel_stats.youtube_channel = {videosTable}.youtube_channel
GROUP BY {channelsTable}.username
'''


df = pd.read_sql(query, engine)
df

Unnamed: 0,username,videos,about_videos,subs,views,likes,comments,avg_views,avg_likes,avg_comments,min_views,min_likes,min_comments,views_per_sub,likes_per_views,comments_per_views
0,AMPTech,79,92.0,50200.0,2152875.0,71221.0,2135.0,27251.582278,901.531646,27.025316,0.0,0.0,0.0,42.885956,0.033082,0.000992
1,BettaTech,153,167.0,226000.0,9966405.0,421024.0,25671.0,65139.901961,2751.79085,167.784314,1835.0,0.0,0.0,44.099137,0.042244,0.002576
2,CarlosAzaustre,232,405.0,131000.0,1518028.0,82581.0,3231.0,15812.791667,860.21875,33.65625,0.0,55.0,0.0,11.588,0.0544,0.002128
3,CodelyTV,353,554.0,139000.0,1436005.0,70482.0,2294.0,12273.547009,602.410256,19.606838,0.0,29.0,0.0,10.330971,0.049082,0.001597
4,HolaMundoDev,260,334.0,672000.0,31902038.0,595327.0,90363.0,122700.146154,2289.719231,347.55,0.0,0.0,0.0,47.473271,0.018661,0.002833
5,mouredev,335,507.0,407000.0,14916776.0,370783.0,12647.0,44527.689552,1106.814925,37.752239,0.0,0.0,0.0,36.650555,0.024857,0.000848
6,ProgramadorX,146,164.0,291000.0,10140648.0,397490.0,21845.0,69456.493151,2722.534247,149.623288,0.0,0.0,0.0,34.847588,0.039198,0.002154


## Videos stats

In [34]:
query = f'''
SELECT
    {videosTable}.title,
    MAX({channelsTable}.username) as channel,
    MAX({videoStatsTable}.views) as views,
    MAX({videoStatsTable}.likes) as likes,
    MAX({videoStatsTable}.comments) as comments,
    MAX({videoStatsTable}.date) as date,
    CASE
        WHEN MAX({statsTable}.subs) > 0 THEN MAX({videoStatsTable}.views) / MAX({statsTable}.subs)
        ELSE 0.0 -- Handle division by zero by returning 0.0 (you can use NULL or another value as well)
    END as views_per_subs,
    CASE
        WHEN MAX({videoStatsTable}.views) > 0 THEN MAX({videoStatsTable}.likes) / MAX({videoStatsTable}.views)
        ELSE 0.0 -- Handle division by zero by returning 0.0 (you can use NULL or another value as well)
    END as likes_per_views,
    CASE
        WHEN MAX({videoStatsTable}.views) > 0 THEN MAX({videoStatsTable}.comments) / MAX({videoStatsTable}.views)
        ELSE 0.0 -- Handle division by zero by returning 0.0 (you can use NULL or another value as well)
    END as comments_per_views,
    MAX({statsTable}.subs) as subs
FROM {videosTable}
    LEFT JOIN {videoStatsTable} ON {videoStatsTable}.video = {videosTable}.uuid
    LEFT JOIN {statsTable} ON {statsTable}.youtube_channel = {videosTable}.youtube_channel
    LEFT JOIN {channelsTable} ON {channelsTable}.uuid = {videosTable}.youtube_channel
GROUP BY {videosTable}.title
'''
df = pd.read_sql(query, engine, index_col='title')
df

Unnamed: 0_level_0,channel,views,likes,comments,date,views_per_subs,likes_per_views,comments_per_views,subs
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
¿Por qué debes usar TYPESCRIPT inmediatamente? ⌨️,ProgramadorX,31848.0,2779.0,148.0,2021-03-26,0.109443,0.087258,0.004647,291000.0
Tensorflow: Cómo clasificar números escritos a mano,AMPTech,17000.0,560.0,0.0,,0.338645,0.032941,0.000000,50200.0
The Elephant in the Room - Greg Young interview at #BuildStuffES ☀️😎,CodelyTV,,,,,,0.000000,0.000000,139000.0
"Especial Black Friday! 😱 JavaScript, Python y React 90% off!",HolaMundoDev,11581.0,754.0,61.0,2022-11-25,0.017234,0.065107,0.005267,672000.0
Anti-patrones de test: 🎅 Obese test,CodelyTV,,,,,,0.000000,0.000000,139000.0
...,...,...,...,...,...,...,...,...,...
CURSO de SwiftUI | #5 - COMBINACIÓN de Vistas,mouredev,7200.0,371.0,0.0,,0.017690,0.051528,0.000000,407000.0
¿Qué hacen los PROGRAMADORES en NAVIDAD? | Advent of Code 2020 #1,BettaTech,16579.0,1814.0,119.0,2020-12-07,0.073358,0.109416,0.007178,226000.0
¿Amazon deja los MICROSERVICIOS?,BettaTech,51178.0,3025.0,120.0,2023-05-16,0.226451,0.059107,0.002345,226000.0
Mi consejo si quieres venir a trabajar a Nueva Zelanda,HolaMundoDev,91719.0,9338.0,834.0,2020-10-09,0.136487,0.101811,0.009093,672000.0


In [35]:
df.sort_values(by=["views_per_subs"], ascending=False).head(20)

Unnamed: 0_level_0,channel,views,likes,comments,date,views_per_subs,likes_per_views,comments_per_views,subs
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
KOTLIN: Curso ANDROID desde CERO para PRINCIPIANTES,mouredev,2952652.0,0.0,1365.0,2020-03-23,7.254673,0.0,0.000462,407000.0
Lógica de Programación 👩‍💻 Aprende a programar en 10 minutos,ProgramadorX,1977477.0,0.0,2340.0,2021-03-12,6.795454,0.0,0.001183,291000.0
Qué necesitas para hacer Inteligencia Artificial / Aprendizaje automático,AMPTech,303000.0,0.0,0.0,,6.035857,0.0,0.0,50200.0
ANDROID STUDIO: COMO Crear una APP (para Principiantes) 📲 [Tutorial],mouredev,2300000.0,0.0,0.0,,5.651106,0.0,0.0,407000.0
"Especificaciones de una computadora (RAM, Memoria y Procesador)",AMPTech,264804.0,0.0,421.0,2017-04-26,5.27498,0.0,0.00159,50200.0
💥 MEJOR PLUGIN VSCODE - Autocompletado con MACHINE LEARNING - AI,CarlosAzaustre,588000.0,2490.0,0.0,,4.48855,0.004235,0.0,131000.0
Curso de PYTHON desde CERO para PRINCIPIANTES,mouredev,1100000.0,0.0,0.0,,2.702703,0.0,0.0,407000.0
Aprende HTML ahora! curso completo GRATIS desde cero,HolaMundoDev,1700000.0,0.0,0.0,,2.529762,0.0,0.0,672000.0
Los SECRETOS para APRENDER a PROGRAMAR RÁPIDO (y desde CERO),BettaTech,558891.0,0.0,524.0,2020-08-02,2.472969,0.0,0.000938,226000.0
(pt. 1/2) Cómo hacer un clasificador de imagenes desde cero con Tensorflow,AMPTech,122000.0,2496.0,0.0,,2.430279,0.020459,0.0,50200.0
