## Import packages

In [5]:
from googleapiclient.discovery import build
import pandas as pd
import uuid
import sys
import os

## Add project path

In [6]:
# Add project directory to the path
sys.path.append('../../data_analysis_project')

## Data collection

### Download data

In [7]:
# Import required functions
from utils.fuctions import collect_youtube_video_data


# Define a YT API Key
youtube_api_key = "AIzaSyAfoJ94EZmxu-GdSrDfuH9lQoZigkGd7aY"

# Build the YouTube API client
youtube = build("youtube", "v3", developerKey=youtube_api_key)

# Youtube videos IDs (pre-selected) 
videos_ids = ["kZaucITWv00", "0osEeTQLk3Q", "DEbALmrsZs8"]

# Collect data
videos_data, comments_data = collect_youtube_video_data(videos_ids=videos_ids, youtube=youtube)

# Show data
comments_data.head(3)

Unnamed: 0,video_id,user_name,comment,n_likes,updated_at
0,kZaucITWv00,@AvelinaBautistadeJesús,Q bueno q quitaron la Esc. de tiempo completo ...,1,2024-10-08T02:21:17Z
1,kZaucITWv00,@MrJaguzman01,Para la botarga Galvez los precios no son etic...,1,2024-09-22T11:09:54Z
2,kZaucITWv00,@HnoDanielelamigodelosbom-px5is,Mentiras del debate,1,2024-09-03T05:32:21Z


### Save data into a db (SQlite)

In [8]:
# Import required functions
from utils.fuctions import save_to_db


# Create a new column with 4 digits-unique IDs for comments
comments_data['comment_id'] = [str(uuid.uuid1()) for _ in range(comments_data.shape[0])]

# Get project directory
project_dir = os.getcwd()

# Get/Create a data directory (all data will be saved here)
data_dir = os.path.join(project_dir, "data")
os.makedirs(data_dir, exist_ok=True) # check if directory exists, if not, create it

# Load data into a DataBase
save_to_db(dataframe=videos_data, db_dir=data_dir, db_name="youtube_data_analysis.db", table_name="videos")
save_to_db(dataframe=comments_data, db_dir=data_dir, db_name="youtube_data_analysis.db", table_name="comments")

comments_data.head(3)

Unnamed: 0,video_id,user_name,comment,n_likes,updated_at,comment_id
0,kZaucITWv00,@AvelinaBautistadeJesús,Q bueno q quitaron la Esc. de tiempo completo ...,1,2024-10-08T02:21:17Z,c9f683d9-8a62-11ef-9538-38d57ad0d964
1,kZaucITWv00,@MrJaguzman01,Para la botarga Galvez los precios no son etic...,1,2024-09-22T11:09:54Z,c9f683da-8a62-11ef-9947-38d57ad0d964
2,kZaucITWv00,@HnoDanielelamigodelosbom-px5is,Mentiras del debate,1,2024-09-03T05:32:21Z,c9f683db-8a62-11ef-be5a-38d57ad0d964


### Results

In [9]:
# Import required functions
from utils.fuctions import make_sql_query


# Define db directory, where db is stored
db_dir = os.path.join(os.path.dirname(os.getcwd()), "data")

# Define a sql query
query = """
        SELECT comment_id, comment
        FROM comments
        LIMIT 3
        """

# Retrieve raw data from db
make_sql_query(db_dir=db_dir, db_name="youtube_data_analysis.db", query=query)

Unnamed: 0,comment_id,comment
0,86753b70-aea2-4c5f-9dc8-a10738d463e3,Q bueno q quitaron la Esc. de tiempo completo ...
1,6ce327ba-7204-4baa-911f-085f8d2d9d87,Para la botarga Galvez los precios no son etic...
2,d2f1d132-23f9-4582-83e8-3443d485f5db,Mentiras del debate


In [10]:
# Define a sql query
query = """
        SELECT COUNT(comment) AS Number_of_comments
        FROM comments
        WHERE video_id = "kZaucITWv00"
        """

# Retrieve raw data from db
make_sql_query(db_dir=db_dir, db_name="youtube_data_analysis.db", query=query)

Unnamed: 0,Number_of_comments
0,2979


## Data preprocessing

### Import data

In [11]:
# Query
query = """
        SELECT *
        FROM comments
        """

# Comments data
comments = make_sql_query(db_dir=db_dir, db_name="youtube_data_analysis.db", query=query)

### Preprocess text

In [12]:
# Import required functions
from utils.fuctions import preprocess_text, correct_datetime


# Make a copy of comments df
comments_preprocessed = comments.copy()[['comment_id']]  

 # Clean text
comments_preprocessed['comment_preprocessed'] = comments['comment'].apply(preprocess_text) 

# Correct datetime 
comments_preprocessed['updated_at_corrected'] =  comments['updated_at'].apply(correct_datetime)
 
# Load preprocessed data into a db
save_to_db(dataframe=comments_preprocessed, db_dir=db_dir, db_name="youtube_data_analysis.db", table_name="comments_preprocessed")

### Results

In [13]:
# Query
query = """
        SELECT *
        FROM comments_preprocessed
        LIMIT 3
        """

# Comments data
make_sql_query(db_dir=db_dir, db_name="youtube_data_analysis.db", query=query)

Unnamed: 0,comment_id,comment_preprocessed,updated_at_corrected
0,86753b70-aea2-4c5f-9dc8-a10738d463e3,que bueno que quitaron la escuela de tiempo co...,2024-10-08 02:21:17+00:00
1,6ce327ba-7204-4baa-911f-085f8d2d9d87,para la botarga galvez los precios no son etic...,2024-09-22 11:09:54+00:00
2,d2f1d132-23f9-4582-83e8-3443d485f5db,mentiras del debate,2024-09-03 05:32:21+00:00


In [18]:
query = """
        SELECT *
        FROM comments
        LIMIT 3
        """

# Comments data
comments = make_sql_query(db_dir=db_dir, db_name="youtube_data_analysis.db", query=query)
comments['comment'][0]

'Q bueno q quitaron la Esc. de tiempo completo 😢😊 <a href="https://www.youtube.com/watch?v=kZaucITWv00&amp;t=2203">36:43</a>'

In [19]:
# Query
query = """
        SELECT *
        FROM comments_preprocessed
        LIMIT 3
        """

# Comments data
comments = make_sql_query(db_dir=db_dir, db_name="youtube_data_analysis.db", query=query)
comments['comment_preprocessed'][0]

'que bueno que quitaron la escuela de tiempo completo'