# Connect and Import data from the postgres server

In [2]:
import os
from getpass import getpass

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Build the connection URL from the environment so that no password is stored
# in the notebook. Set FILM_DB_PASSWORD for unattended runs; otherwise the
# password is prompted for interactively. URL.create() takes the password as a
# plain value, so special characters in it need no manual escaping.
db_url = URL.create(
    "postgresql",
    username=os.environ.get("FILM_DB_USER", "postgres"),
    password=os.environ.get("FILM_DB_PASSWORD") or getpass("PostgreSQL password: "),
    host=os.environ.get("FILM_DB_HOST", "localhost"),
    port=int(os.environ.get("FILM_DB_PORT", "5433")),
    database=os.environ.get("FILM_DB_NAME", "film"),
)
engine = create_engine(db_url)

# List every base table (views excluded) in the public schema.
table_query = '''
SELECT table_name FROM information_schema.tables
WHERE table_schema='public' AND table_type='BASE TABLE'
'''

table_names = pd.read_sql(table_query, engine)['table_name'].to_list()

# Load each table into its own DataFrame, keyed by table name. The identifier
# is quoted by the dialect so mixed-case or reserved table names still work.
quote_identifier = engine.dialect.identifier_preparer.quote
dfs = {
    table: pd.read_sql(f"SELECT * FROM public.{quote_identifier(table)}", engine)
    for table in table_names
}

# Short aliases used by the cells below; a missing table raises KeyError here.
payment = dfs["payment"]
film = dfs["film"]
actor = dfs["actor"]
address = dfs["address"]
category = dfs["category"]
city = dfs["city"]
country = dfs["country"]
customer = dfs["customer"]
film_actor = dfs["film_actor"]
film_category = dfs["film_category"]
inventory = dfs["inventory"]
language = dfs["language"]
rental = dfs["rental"]
staff = dfs["staff"]
store = dfs["store"]

# Filtering (like where clause in postgres)

In [3]:
# Count the payment rows that belong to customer 100
# (the pandas counterpart of a WHERE filter followed by COUNT(*)).
is_customer_100 = payment['customer_id'].eq(100)
len(payment.loc[is_customer_100])

24

### In Power BI DAX, `=` compares text case-insensitively, so use `EXACT()` for a case-sensitive match; in pandas and PostgreSQL, string comparison is case-sensitive by default

In [4]:
# String equality in pandas is case-sensitive, so the two literals below select
# different rows. Only the last expression of a cell is rendered automatically,
# hence the explicit display() to make the first result visible as well.
display(customer.loc[customer['first_name'] == 'erica'])
customer.loc[customer['first_name'] == 'ERICA']

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,activebool,create_date,last_update,active
168,169,2,ERICA,MATTHEWS,ERICA.MATTHEWS@sakilacustomer.org,173,True,2020-02-14,2020-02-15 09:57:20+00:00,0


# logical operators

| **Operator** | **PostgreSQL** | **Pandas** | **DAX**                            |
| ------------ | -------------- | ---------- | ---------------------------------- |
| **AND**      | `AND`          | `&`        | `AND()` or `&&`                    |
| **OR**       | `OR`           | `pipe`     | `OR()` or double pipe              |
| **NOT**      | `NOT`          | `~`        | `NOT()` or NOT                     |

pipe -> |
double pipe -> ||


# Between

| Parameter       | Description                                                                                                                                                                                 |
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **`left`**      | The lower bound of the range. **Required**.                                                                                                                                                 |
| **`right`**     | The upper bound of the range. **Required**.                                                                                                                                                 |
| **`inclusive`** | Defines which endpoints are included:<br>• `'both'` (default): `left ≤ x ≤ right`<br>• `'neither'`: `left < x < right`<br>• `'left'`: `left ≤ x < right`<br>• `'right'`: `left < x ≤ right` |


In [5]:
# Keep the payments whose amount lies in the closed interval [1, 2].
amount_between_1_and_2 = payment['amount'].between(left=1, right=2, inclusive='both')
payment.loc[amount_between_1_and_2]

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date
0,16050,269,2,7,1.99,2020-01-24 21:40:19.996577+00:00
6,16056,270,1,193,1.99,2020-01-26 05:10:14.996577+00:00
31,16081,282,2,48,1.99,2020-01-25 04:49:12.996577+00:00
53,16103,294,1,595,1.99,2020-01-28 12:28:20.996577+00:00
83,16133,307,1,614,1.99,2020-01-28 14:01:54.996577+00:00
...,...,...,...,...,...,...
15823,31873,263,1,6808,1.99,2020-04-12 15:05:08.996577+00:00
15827,31877,263,2,7833,1.99,2020-04-28 06:14:40.996577+00:00
15850,31900,265,2,7414,1.99,2020-04-27 14:14:33.996577+00:00
15866,31916,267,1,8572,1.99,2020-04-29 09:19:50.996577+00:00


### Time Zone conversion

In [6]:
# Express the tz-aware payment timestamps in Berlin local time. The column is
# stored on `payment` because later cells filter on it.
payment['Europe_payment_date'] = payment['payment_date'].dt.tz_convert('Europe/Berlin')

# Payments made on 26 and 27 January 2020 (Berlin time). A half-open window
# [26 Jan 00:00, 28 Jan 00:00) is used instead of an upper bound of 23:59:59,
# which would drop timestamps in the last fractional second of the day.
window_start = pd.Timestamp('2020-01-26', tz='Europe/Berlin')
window_end = pd.Timestamp('2020-01-28', tz='Europe/Berlin')
in_window = payment['Europe_payment_date'].between(window_start, window_end, inclusive='left')
in_amount_range = payment['amount'].between(1.99, 3.99, inclusive='both')

payment.loc[in_window & in_amount_range].shape[0]

104

In DAX, a date-time value is built by adding a date and a time, e.g. `DATE(2020, 1, 27) + TIME(23, 59, 0)` for 2020-01-27 23:59:00.

# isin()

| Parameter    | Description                                                                 |
| ------------ | --------------------------------------------------------------------------- |
| **`values`** | The list, set, Series, or iterable of values to check against. **Required** |


In [7]:
# Count January 2020 payments (Berlin time) for a fixed set of customers and
# amounts. Relies on the 'Europe_payment_date' column added to `payment` in the
# time-zone conversion cell.
# The month is a half-open window [1 Jan 00:00, 1 Feb 00:00) so timestamps in
# the final fractional second of 31 January are not lost.
january_start = pd.Timestamp('2020-01-01', tz='Europe/Berlin')
february_start = pd.Timestamp('2020-02-01', tz='Europe/Berlin')

is_selected_customer = payment['customer_id'].isin([12, 25, 67, 93, 124, 234])
is_selected_amount = payment['amount'].isin([4.99, 7.99, 9.99])
in_january = payment['Europe_payment_date'].between(january_start, february_start, inclusive='left')

payment.loc[is_selected_customer & is_selected_amount & in_january].shape[0]

7

# LIKE and Regex

| Pattern | Regex   | Pandas                               | DAX                                      |
| ------- | ------- | ------------------------------------ | ---------------------------------------- |
| `'A%'`  | `^A.*`  | `df[df['col'].str.match(r'^A')]`     | `LEFT([col],1) = "A"`                    |
| `'%A'`  | `.*A$`  | `df[df['col'].str.match(r'.*A$')]`   | `RIGHT([col],1) = "A"`                   |
| `'%A%'` | `.*A.*` | `df[df['col'].str.contains(r'A')]`   | `SEARCH("A", [col], 1, 0) > 0`           |
| `'_A%'` | `^.A.*` | `df[df['col'].str.match(r'^.A')]`    | `MID([col],2,1) = "A"`                   |
| `'%_A'` | `.*.A$` | `df[df['col'].str.match(r'.*.A$')]`  | `LEN([col]) >= 2 && RIGHT([col],1) = "A"` |
| `'_A_'` | `^.A.$` | `df[df['col'].str.match(r'^.A.$')]`  | `LEN([col]) = 3 && MID([col],2,1) = "A"` |


### [XY]-> X or Y 
### [^XY] -> not (X or Y)
### (er|ty) -> er or ty


In [12]:
# Number of films whose description matches the pattern `.*Documentary.*`
# (the regex form of LIKE '%Documentary%').
mentions_documentary = film['description'].str.match(r'.*Documentary.*')
len(film.loc[mentions_documentary, 'film_id'])

101

In [17]:
# Count customers whose first name has exactly three characters and whose
# last name ends in X or Y.
three_letter_first_name = customer['first_name'].str.match(r'^.{3}$')
last_name_ends_in_x_or_y = customer['last_name'].str.match(r'.*[XY]$')
len(customer.loc[three_letter_first_name & last_name_ends_in_x_or_y])

3

# startswith()

| Parameter | Type             | Default    | Description                              |
| --------- | ---------------- | ---------- | ---------------------------------------- |
| `pat`     | `str` or `tuple` | *Required* | The prefix string(s) to check for        |
| `na`      | `bool`, `np.nan` | `np.nan`   | Value to return for missing (NaN) values |



By default, `startswith` returns `NaN` for missing values, but this can be changed to `True` or `False` with the `na` parameter. With `na=True`, a missing value is reported as `True`, i.e. it is treated as a match.


In [26]:
# Flag first names that begin with "A"; missing names are reported as True
# because of na=True. Only the first four flags are shown.
starts_with_a = customer['first_name'].str.startswith('A', na=True)
starts_with_a.head(4)

0    False
1    False
2    False
3    False
Name: first_name, dtype: bool

# endswith()

| Parameter | Type             | Default    | Description                              |
| --------- | ---------------- | ---------- | ---------------------------------------- |
| `pat`     | `str` or `tuple` | *Required* | The suffix string(s) to check for        |
| `na`      | `bool`, `np.nan` | `np.nan`   | Value to return for missing (NaN) values |

# contains()

| Parameter | Type            | Default    | Description                                        |
| --------- | --------------- | ---------- | -------------------------------------------------- |
| `pat`     | `str`           | *Required* | Substring or regex pattern to search for           |
| `case`    | `bool`          | `True`     | Whether match should be case-sensitive             |
| `flags`   | `int`           | `0`        | Regex flags (like `re.IGNORECASE`)                 |
| `na`      | `bool`/`np.nan` | `np.nan`   | Value to return if element is `NaN`                |
| `regex`   | `bool`          | `True`     | Whether to interpret `pat` as a regular expression |

df['col'].str.contains('apple', case=False, regex=True, na=False)


# match()

| Parameter | Type            | Default    | Description                                                     |
| --------- | --------------- | ---------- | --------------------------------------------------------------- |
| `pat`     | `str`           | *Required* | Regex pattern, matched from the start of each string (use `str.fullmatch` to require the entire string to match) |
| `case`    | `bool`          | `True`     | Whether match should be case-sensitive                          |
| `flags`   | `int`           | `0`        | Regex flags (e.g., `re.IGNORECASE`)                             |
| `na`      | `bool`/`np.nan` | `np.nan`   | Value to return if element is `NaN`                             |

df['col'].str.match(r'^A.*z$', case=True, flags=re.IGNORECASE, na=False)


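Note: passing `flags=re.IGNORECASE` makes the match case-insensitive even when `case=True`, so the flag effectively overrides `case`.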