In [3]:
import os
import re
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

load_dotenv()
USERNAME = os.getenv("PGUSERNAME")
PASSWORD = os.getenv("PGPASSWORD")
HOST = os.getenv("HOST")
PORT = os.getenv("PORT")
DATABASE = os.getenv("DATABASE")
URL = f"postgresql://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}"

ENGINE = create_engine(URL)

In [23]:
import pandas as pd

In [2]:
import pymongo
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

In [19]:
import pymongo
USERNAME = "reader"
PASSWORD = "readerpass"
HOST = "localhost"
PORT = 27017               
DATABASE = "mflix"


connection_string = f"mongodb://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}"


client = pymongo.MongoClient(connection_string)
db = client[DATABASE]

In [20]:
print(db.list_collection_names())

['sessions', 'movies', 'theaters', 'comments', 'users']


In [17]:
client.close()

In [25]:
pipeline = [
    {
        "$group": {
            "_id": "$year",
            "film_count": {"$sum": 1}
        }
    },
    {
        "$sort": {"_id": 1}
    }
]

answer = db.movies.aggregate(pipeline)
print(pd.DataFrame(answer))

       _id  film_count
0     1891           1
1     1893           1
2     1894           1
3     1896           2
4     1903           1
..     ...         ...
130  2010è           4
131  2011è           2
132  2012è           3
133  2014è           2
134  2015è           1

[135 rows x 2 columns]


In [41]:
for collection in db.list_collection_names():
     print(db[collection].find_one())

{'_id': ObjectId('5a9427648b0beebeb69579cc'), 'name': 'Andrea Le', 'email': 'andrea_le@fakegmail.com', 'movie_id': ObjectId('573a1390f29313caabcd418c'), 'text': 'Rem officiis eaque repellendus amet eos doloribus. Porro dolor voluptatum voluptates neque culpa molestias. Voluptate unde nulla temporibus ullam.', 'date': datetime.datetime(2012, 3, 26, 23, 20, 16)}
{'_id': ObjectId('573a1390f29313caabcd4135'), 'plot': 'Three men hammer on an anvil and pass a bottle of beer around.', 'genres': ['Short'], 'runtime': 1, 'cast': ['Charles Kayser', 'John Ott'], 'num_mflix_comments': 1, 'title': 'Blacksmith Scene', 'fullplot': 'A stationary camera looks at a large anvil with a blacksmith behind it and one on either side. The smith in the middle draws a heated metal rod from the fire, places it on the anvil, and all three begin a rhythmic hammering. After several blows, the metal goes back in the fire. One smith pulls out a bottle of beer, and they each take a swig. Then, out comes the glowing met

# Fetching constraints of database

In [4]:
query = """
SELECT 
    conname AS constraint_name,
    contype AS constraint_type,
    conrelid::regclass AS table_name,
    CASE 
        WHEN contype IN ('f', 'c') THEN confrelid::regclass::text
        ELSE NULL
    END AS referenced_table,
    conkey AS constraint_columns,
    confkey AS referenced_columns
FROM 
    pg_constraint
WHERE 
    conrelid::regclass::text NOT LIKE 'pg_%'
ORDER BY 
    conrelid::regclass::text, conname
"""

with ENGINE.connect() as conn:
    constraints = pd.DataFrame(conn.execute(text(query)))

constraints

Unnamed: 0,constraint_name,constraint_type,table_name,referenced_table,constraint_columns,referenced_columns
0,cardinal_number_domain_check,c,-,-,,
1,year_check,c,-,-,,
2,yes_or_no_check,c,-,-,,
3,actor_pkey,p,actor,,[1],
4,address_pkey,p,address,,[1],
5,fk_address_city,f,address,city,[5],[1]
6,category_pkey,p,category,,[1],
7,city_pkey,p,city,,[1],
8,fk_city,f,city,country,[3],[1]
9,country_pkey,p,country,,[1],


# Fetching column list from each table

In [5]:
query = """
SELECT
    c.table_schema,
    c.table_name,
    string_agg(c.column_name || ' (' || c.data_type || ')', ', ' ORDER BY c.ordinal_position) AS column_list
FROM
    information_schema.columns AS c
JOIN
    information_schema.tables AS t
    ON c.table_name = t.table_name
    AND c.table_schema = t.table_schema
WHERE
    t.table_type = 'BASE TABLE'
    AND t.table_schema NOT IN ('information_schema', 'pg_catalog')
GROUP BY
    c.table_schema,
    c.table_name
ORDER BY
    c.table_schema,
    c.table_name;
"""

with ENGINE.connect() as conn:
    columns = pd.DataFrame(conn.execute(text(query)))

columns[['table_name', 'column_list']].to_csv("../metadata/tables.txt", index=False, sep="\t", header=False)
columns

Unnamed: 0,table_schema,table_name,column_list
0,public,actor,"actor_id (integer), first_name (character vary..."
1,public,address,"address_id (integer), address (character varyi..."
2,public,category,"category_id (integer), name (character varying..."
3,public,city,"city_id (integer), city (character varying), c..."
4,public,country,"country_id (integer), country (character varyi..."
5,public,customer,"customer_id (integer), store_id (smallint), fi..."
6,public,film,"film_id (integer), title (character varying), ..."
7,public,film_actor,"actor_id (smallint), film_id (smallint), last_..."
8,public,film_category,"film_id (smallint), category_id (smallint), la..."
9,public,inventory,"inventory_id (integer), film_id (smallint), st..."


# Saving constraint of database

In [6]:
with open("../metadata/constraints.txt", "a") as f:
    for col in columns['table_name']:
        consts = constraints[constraints['table_name'] == col]
        for c in consts.iterrows():
            if c[1]['constraint_type'] == 'p':
                table_name = c[1]['table_name']
                column_list = columns[columns['table_name'] == table_name]['column_list'].values[0].split(', ')
                if len(c[1]['constraint_columns']) == 1:
                    constrained_index = int(c[1]['constraint_columns'][0]-1)
                else:
                    continue
                column_name = column_list[constrained_index]
                f.write(f"Column {column_name} is a primary key of the table {table_name}\n")
            elif c[1]['constraint_type'] == 'f':

                table_name = c[1]['table_name']
                column_list = columns[columns['table_name'] == table_name]['column_list'].values[0].split(', ')
                if len(c[1]['constraint_columns']) == 1:
                    constrained_index = int(c[1]['constraint_columns'][0]-1)
                else:
                    continue
                column_name = column_list[constrained_index]
                referenced_table = c[1]['referenced_table']
                referenced_columns = c[1]['referenced_columns']
                referenced_column_list = columns[columns['table_name'] == referenced_table]['column_list'].values[0].split(', ')
                referenced_column_name = referenced_column_list[referenced_columns[0]-1]
                f.write(f"Column {column_name} is a foreign key of the table {table_name} and references column {referenced_column_name} of the table {referenced_table}\n")


# Executing PowerShell script

In [7]:
import subprocess

script_path = '../scripts/postgres.ps1'

result = subprocess.run(['powershell.exe', '-ExecutionPolicy', 'Unrestricted', '-File', script_path], capture_output=True, text=True)

print(result.stderr)

The argument '../scripts/postgres.ps1' to the -File parameter does not exist. Provide the path to an existing '.ps1' file as an argument to the -File parameter.



# Clearing `metadata.sql`

In [8]:
with open("../metadata/metadata.sql", "r") as f:
    metadata = f.read()
    print(metadata.strip())

CREATE TABLE public.customer (
    customer_id integer DEFAULT nextval('public.customer_customer_id_seq'::regclass) NOT NULL,
    store_id smallint NOT NULL,
    first_name character varying(45) NOT NULL,
    last_name character varying(45) NOT NULL,
    email character varying(50),
    address_id smallint NOT NULL,
    activebool boolean DEFAULT true NOT NULL,
    create_date date DEFAULT ('now'::text)::date NOT NULL,
    last_update timestamp without time zone DEFAULT now(),
    active integer
);
CREATE TABLE public.actor (
    actor_id integer DEFAULT nextval('public.actor_actor_id_seq'::regclass) NOT NULL,
    first_name character varying(45) NOT NULL,
    last_name character varying(45) NOT NULL,
    last_update timestamp without time zone DEFAULT now() NOT NULL
);
CREATE TABLE public.category (
    category_id integer DEFAULT nextval('public.category_category_id_seq'::regclass) NOT NULL,
    name character varying(25) NOT NULL,
    last_update timestamp without time zone DEFAULT 

In [9]:
import re

def extract_create_table_statements(file_path, output_path):
    with open(file_path, 'r') as file:
        content = file.read()
    
    # Wyrażenie regularne do znalezienia poleceń CREATE TABLE
    pattern = re.compile(r'CREATE TABLE[\s\S]*?;', re.MULTILINE)
    matches = pattern.findall(content)

    with open(output_path, 'w') as file:
        for match in matches:
            file.write(match + '\n')

# Wykonaj funkcję
extract_create_table_statements('../metadata/metadata.sql', '../metadata/metadata.sql')
