In [1]:
# Import the logging module, which is used for tracking events that happen when code runs.
# Import the sys module, which provides access to some variables used or maintained by the Python interpreter.
import logging
import sys

# Configure the logging module to output log messages to stdout (standard output, usually the terminal).
# The level parameter sets the threshold for what messages will be logged. INFO means that all messages of level INFO and above will be logged.
# force=True ensures that the configuration is applied even if there are other handlers already configured.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

# Get the root logger and add a handler to it. This handler will also output log messages to stdout.
# This is done to ensure that log messages are displayed in the environment where the script is running.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Import the Markdown class from the IPython.display module, which is used to display Markdown formatted text in Jupyter notebooks.
from IPython.display import Markdown, display

## Establish connection with PostgreSQL database

In [3]:
# Import the create_engine and text functions from the sqlalchemy module.
# create_engine is used to create a connection to the database.
# text is used to create SQL expressions.
from sqlalchemy import create_engine, text

# Define the database credentials and connection details.
db_user = "postgres"
db_password = "postgres"
db_host = "localhost"
db_port = "5432"
db_name = "youtube_data"

# Construct the connection string using the defined credentials and connection details.
# The format is specific to PostgreSQL databases.
connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"

# Create an engine instance using the connection string.
# The engine is the starting point for any SQLAlchemy application.
# It's "home base" for the actual database and its DBAPI, delivered to the SQLAlchemy application through a connection pool and a Dialect.
engine = create_engine(connection_string)

# Test the connection to the database by executing a raw SQL query.
# This is done within a context manager to ensure that the connection is properly closed after use.
with engine.connect() as connection:
    # Execute a SQL query to select the first 3 rows from the 'cities_chart' table.
    # The text function is used to create a SQL expression from the provided string.
    result = connection.execute(text("select * from cities_chart limit 3"))
    
    # Iterate over the result set, printing each row.
    for row in result:
        print(row)

(datetime.date(2020, 6, 28), '0x164b85cef5ab402d:0x8467b6b037a24d49', 'Addis Ababa', 0)
(datetime.date(2020, 6, 29), '0x164b85cef5ab402d:0x8467b6b037a24d49', 'Addis Ababa', 0)
(datetime.date(2020, 6, 30), '0x164b85cef5ab402d:0x8467b6b037a24d49', 'Addis Ababa', 0)


In [None]:
## Description of tables

In [4]:
# Descriging each table for accurate SQL
table_details = {
"cities_chart": "",
"cities_table": "",
"content_type_chart": "",
"content_type_table ": "",
"device_type_chart ": "",
"device_type_table ": "",
"geography_chart": "",
"geography_table": "",
"new_and_returning_viewers_chart": "",
"new_and_returning_viewers_table": "",
"operating_system_chart": "",
"operating_system_table": "",
"sharing_service_chart": "",
"sharing_service_table": "",
"subscription_source_chart": "",
"subscription_source_table": "",
"subscription_status_chart": "",
"subscription_status_table": "", 
"subtitles_and_cc_chart": "",
"subtitles_and_cc_table": "",
"traffic_source_chart": "",
"traffic_source_table": "",
"viewer_age_table": "",
"viewer_gender_table": "", 
"viewership_by_age_table": "",
"viewership_by_date_table": ""
}

In [None]:
## Listing all tables

In [8]:
# Importing the SQLDatabase class from the llama_index.core module
from llama_index.core import SQLDatabase

# Creating an instance of SQLDatabase with the specified engine and sample_rows_in_table_info parameter set to 2
# Note: The include_tables parameter is commented out, so only the tables specified in the tables list will be included
sql_database = SQLDatabase(engine, sample_rows_in_table_info=2)

# Listing all tables in the SQL database
list(sql_database._all_tables)

['traffic_source_table',
 'viewership_by_date_table',
 'traffic_source_chart',
 'content_type_chart',
 'cities_chart',
 'device_type_table',
 'viewership_by_age_table',
 'device_type_chart',
 'new_and_returning_viewers_table',
 'subscription_status_table',
 'sharing_service_chart',
 'cities_chart_data',
 'operating_system_table',
 'new_and_returning_viewers_chart',
 'subscription_status_chart',
 'subtitles_and_cc_table',
 'geography_table',
 'viewer_gender_table',
 'sharing_service_table',
 'cities_table',
 'geography_chart',
 'operating_system_chart',
 'subtitles_and_cc_chart',
 'subscription_source_chart',
 'viewer_age_table',
 'subscription_source_table',
 'content_type_table']

In [None]:
## Displaying metadata - each column in each table 

In [9]:
# Importing necessary modules from SQLAlchemy
from sqlalchemy import MetaData, create_engine

# Creating an engine instance for connecting to the PostgreSQL database
# Replace the placeholders with actual database credentials
engine = create_engine(f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}")

# Creating a MetaData instance
metadata = MetaData()

# Reflecting the database schema to load table metadata
metadata.reflect(bind=engine)

# Iterating through each table in the database
for table_name, table in metadata.tables.items():
    # Printing the name of the current table
    print(f"Table Name: {table_name}")
    
    # Printing the names of the columns in the current table
    print(f"Columns: {table.columns.keys()}")

Table Name: cities_table
Columns: ['cities', 'cityname', 'geography', 'geography1', 'views', 'watchtimehours', 'averageviewduration']
Table Name: cities_chart
Columns: ['date', 'cities', 'cityname', 'views']
Table Name: viewership_by_age_table
Columns: ['date', 'views', 'watchtimehours', 'averageviewduration']
Table Name: content_type_table
Columns: ['Content type', 'Views', 'Watch time (hours)', 'Average view duration']
Table Name: content_type_chart
Columns: ['Date', 'Content type', 'Views']
Table Name: device_type_table
Columns: ['Device type', 'Views', 'Watch time (hours)', 'Average view duration']
Table Name: device_type_chart
Columns: ['Date', 'Device type', 'Views']
Table Name: geography_table
Columns: ['Geography', 'Views', 'Watch time (hours)', 'Average view duration']
Table Name: geography_chart
Columns: ['Date', 'Geography', 'Views']
Table Name: new_and_returning_viewers_table
Columns: ['New and returning viewers', 'Views', 'Watch time (hours)', 'Average view duration']
Tabl