In [None]:
import duckdb

import polars as pl
# Let polars print up to 30 rows of a table before it truncates the output.
pl.Config.set_tbl_rows(30)

In [None]:
# Name of the dataset; it is used both as folder name and as file name of the input JSON.
dataset_name = 'rentals'

In [None]:
# Relative path to the raw JSON file: ../data/<dataset_name>/<dataset_name>.json
# (resolved against the current working directory).
file_path = f'../data/{dataset_name}/{dataset_name}.json'

In [None]:
# Open a writable DuckDB connection backed by an in-memory database (nothing is persisted to disk).
con = duckdb.connect(database = ':memory:', read_only=False)

In [None]:
# Load the raw JSON file into the DuckDB table `rentals`.
# CREATE OR REPLACE makes the cell safe to re-run: the table is rebuilt each time.
create_rentals_sql = f"""
CREATE OR REPLACE TABLE rentals AS 
SELECT * 
FROM '{file_path}'
"""
con.execute(create_rentals_sql)

In [None]:
# Preview the first 10 rows. Even with a wider max_width DuckDB still
# truncates some columns in this text rendering.
rentals_preview = con.sql('FROM rentals LIMIT 10')
rentals_preview.show(max_width=250)

In [None]:
# Read the full `rentals` table and convert it to a polars DataFrame.
rentals_relation = con.sql('FROM rentals')
rentals = rentals_relation.pl()

In [None]:
#Assumption: rent is the only thing generating revenue.
#Not every column could be cleaned.

#Postal codes need a whitespace inserted in order to match them with the airbnb data.
display(rentals)

In [None]:
# Per-column summary of the `rentals` table, printed with a wider text layout.
rentals_summary = con.sql('SUMMARIZE rentals')
rentals_summary.show(max_width=250)

In [None]:
#Due to time constraints, only the rent string and the postal code are cleaned here.
#Rent: a regex removes every non-digit character, so only the digits of the rent string are kept;
#rows whose rent contains no digits at all are filtered out.
#Postal code: a whitespace is placed after the first 4 characters (e.g. '1234AB' -> '1234 AB'),
#then only rows matching 4 digits, a space and 2 uppercase letters are kept.
#NOTE(review): the table is called airbnb_valid_zipcodes but is built from `rentals` - confirm the name is intended.
#NOTE(review): rent_clean stays a string of digits (no cast), and any digits after a decimal
#separator would be glued onto the amount - verify against the actual rent formats.

query = """
CREATE OR REPLACE TABLE airbnb_valid_zipcodes AS

SELECT *
,substring(postalCode FROM 1 FOR 4) || ' ' || substring(postalCode FROM 5) AS postalCode_clean
,regexp_replace(rent, '[^0-9]', '', 'g') AS rent_clean
FROM rentals
WHERE 1=1
AND regexp_replace(rent, '[^0-9]', '', 'g') <> ''
AND postalCode_clean ~ '^[0-9]{4} [A-Z]{2}$'
"""

con.sql(query)


In [None]:
# Small hand-made fixture table for checking the rent / postal-code cleaning logic.
# CREATE OR REPLACE (instead of CREATE TABLE IF NOT EXISTS) keeps this cell idempotent:
# re-running it rebuilds the table rather than appending the same six rows again.
# The first four rows have a postal code of 4 digits followed by 2 letters; the last
# two do not, and 'Invalid entry' additionally has no digits in its rent string.
query = """
CREATE OR REPLACE TABLE rentals_test (
    rent VARCHAR,
    postalCode VARCHAR
);

INSERT INTO rentals_test (rent, postalCode) VALUES
('€ 1,-', '1234AB'),
('€ 999,-', '5678CD'),
('€ 800,- Utilities incl.', '2345EF'),
('€ 1234,- Extra fee', '3456GH'),
('€ 50,- incl.', 'ABCD12'),
('Invalid entry', '123ABCD');
"""

con.sql(query)

In [None]:
# Run the cleaning logic against the rentals_test fixture table:
#  - postalCode_clean: a space is inserted after the first 4 characters
#  - rent_clean: all non-digit characters are stripped and the remainder is cast to int
# Rows with no digits in the rent, or whose spaced postal code is not
# 4 digits + space + 2 uppercase letters, are filtered out by the WHERE clause.
query = """
SELECT  substring(postalCode FROM 1 FOR 4) || ' ' || substring(postalCode FROM 5) AS postalCode_clean
,CAST(regexp_replace(rent, '[^0-9]', '', 'g') as int) AS rent_clean
FROM rentals_test
WHERE 1=1
AND regexp_replace(rent, '[^0-9]', '', 'g') <> ''
AND postalCode_clean ~ '^[0-9]{4} [A-Z]{2}$'
"""
con.sql(query)