In [2]:
import time

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

from config import *
from utils import *

In [7]:
## Global variables
# Short aliases for settings read from the project's `config` object.
# NOTE(review): only `from config import *` is visible in this notebook, so
# these lines assume the star import exposes a name called `config` -- confirm.
months = config.MONTHS  # iterated by the load loop to build each CSV path
postgres = config.POSTGRS_CREDENTIALS  # mapping indexed with USER/PASSWORD/HOST/PORT/db
data_cols = config.DATA_COLS  # columns kept from every source CSV
dims = config.DWH_DIMS_FACTS  # indexed by table name (e.g. "dim_host") to get its column list

In [12]:
def select_dim(df, dims, dim_name):
    """Project ``df`` onto the columns registered for one warehouse table.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame holding at least the columns of the requested table.
    dims : mapping
        Maps a table name to the column selection used for that table.
    dim_name : str
        Key into ``dims``.

    Returns
    -------
    The result of indexing ``df`` with ``dims[dim_name]``.

    Raises
    ------
    KeyError
        If ``dim_name`` is not in ``dims`` or a listed column is missing.
    """
    wanted_columns = dims[dim_name]
    return df[wanted_columns]

def convert_to_neumeric(df,cols):
    """Coerce the given columns to numbers, replacing bad values with 0.

    Each column in ``cols`` is parsed with ``pd.to_numeric``; entries that
    cannot be parsed (or are missing) end up as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    cols : list of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for column in cols:
        # errors='coerce' turns unparseable entries into NaN, which is then zero-filled.
        parsed = pd.to_numeric(df[column], errors='coerce')
        df[column] = parsed.fillna(0)
    return df
    
def convert_to_date(df,cols):
    """Parse the given columns as datetimes, defaulting bad values to 1900-01-01.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    cols : list of str
        Names of the columns to parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # Sentinel written wherever a value is missing or cannot be parsed.
    fallback = pd.Timestamp('1900-01-01')
    for column in cols:
        parsed = pd.to_datetime(df[column], errors='coerce')
        df[column] = parsed.fillna(fallback)
    return df

def convert_to_string(df, cols):
    """Cast the given columns to strings, writing "N/A" for missing values.

    The missing-value mask is taken *before* the cast: ``astype(str)`` turns
    NaN/None into the literal strings "nan"/"None", after which a ``fillna``
    has nothing left to fill.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    cols : list of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    selected = df[cols]
    missing = selected.isna()
    # Stringify first, then overwrite the originally-missing cells.
    df[cols] = selected.astype(str).mask(missing, "N/A")
    return df

In [8]:
# Read each month's listings export, keep only the configured columns, and
# stack all months into a single frame.
#
# The per-month frames are collected in a list and concatenated once at the
# end. Concatenating onto an initially empty frame inside the loop re-copies
# all accumulated rows every iteration and triggers pandas' FutureWarning
# about concatenation with empty entries.
monthly_frames = []
rows_loaded = 0

for month in months:
    path = f"Data/{month}/listings (1).csv"
    df_source = pd.read_csv(path, low_memory=False)
    # Keep the configured columns and drop rows that are empty in all of them.
    df_source = df_source[data_cols].dropna(how='all')
    monthly_frames.append(df_source)
    rows_loaded += len(df_source)
    print(f">>>>>>> month {month} appended the dataframe")
    print(f"Now the dataframe shape is {(rows_loaded, df_source.shape[1])}")

if monthly_frames:
    df_total = pd.concat(monthly_frames, axis=0, ignore_index=True)
else:
    # No months configured: still expose an empty frame with the expected columns.
    df_total = pd.DataFrame(columns=data_cols)

  df_total = pd.concat([df_total, df_source], axis=0, ignore_index=True)


>>>>>>> month jan appended the dataframe
Now the dataframe shape is (51221, 80)
>>>>>>> month March appended the dataframe
Now the dataframe shape is (96193, 80)
>>>>>>> month November appended the dataframe
Now the dataframe shape is (144409, 80)


In [13]:
# Normalise column types on the combined listings frame.

# Strip the currency symbol and thousands separators so price can be parsed
# as a number. The pattern is a raw string: '\$' in a plain string literal is
# an invalid escape sequence.
df_total["price"] = df_total["price"].replace({r'\$': '', ',': ''}, regex=True)

numeric_cols = ["host_total_listings_count", "price", "accommodates",
                "bathrooms", "bedrooms", "beds", "latitude",
                "longitude", "maximum_nights", "minimum_nights",
                "number_of_reviews", "availability_30", "availability_60",
                "availability_90", "availability_365"]
string_cols = ["host_name", "host_url", "host_response_rate",
               "host_verifications", "host_location", "neighbourhood",
               "region_name", "region_parent_name",
               "region_parent_parent_name", "property_type",
               "room_type"]
date_cols = ["host_since"]

df_total = convert_to_neumeric(df_total, numeric_cols)
df_total = convert_to_string(df_total, string_cols)
df_total = convert_to_date(df_total, date_cols)

In [14]:
# Slice the cleaned listings into one frame per warehouse dimension table.
# Host rows without a host_id are discarded.
# NOTE(review): no de-duplication happens here, so each frame presumably has
# one row per listing rather than one per distinct host/location/property -- confirm.
dim_host_df = select_dim(df_total, dims, "dim_host").dropna(subset=["host_id"])
dim_location_df = select_dim(df_total, dims, "dim_location")
dim_property_df = select_dim(df_total, dims, "dim_property")

In [16]:
# Build the connection URL from its components instead of interpolating them
# into a string: a password containing characters such as '@', ':' or '/'
# would otherwise be mis-parsed as part of the URL structure.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=postgres["USER"],
        password=postgres["PASSWORD"],
        host=postgres["HOST"],
        port=postgres["PORT"],
        database=postgres["db"],
    )
)

In [40]:
# Append the host rows to the existing `dim_host` table (the frame index is not written).
# Re-running this cell appends the same rows again.
dim_host_df.to_sql(name='dim_host', con=engine, if_exists='append', index=False)

408

In [18]:
# Append the location rows to the existing `dim_location` table (the frame index is not written).
# Re-running this cell appends the same rows again.
dim_location_df.to_sql(name='dim_location', con=engine, if_exists='append', index=False)

409

In [19]:
# Append the property rows to the existing `dim_property` table (the frame index is not written).
# Re-running this cell appends the same rows again.
dim_property_df.to_sql(name='dim_property', con=engine, if_exists='append', index=False)

409

In [30]:
# Re-parse host_since as datetimes; values that cannot be parsed become NaT.
# NOTE(review): host_since is already passed through convert_to_date in the
# cleaning cell, so this looks like a leftover debugging step -- confirm.
df_total['host_since'] = pd.to_datetime(df_total['host_since'], errors='coerce')
# Show the column dtypes of the host dimension frame.
dim_host_df.dtypes

host_id                              object
host_name                            object
host_url                             object
host_since                   datetime64[ns]
host_about                           object
host_response_rate                   object
host_total_listings_count            object
host_verifications                   object
host_identity_verified               object
accommodates                         object
bathrooms                            object
bedrooms                             object
beds                                 object
dtype: object

In [36]:
# Debug check: show the column dtypes of the host dimension frame.
dim_host_df.dtypes

host_id                              object
host_name                            object
host_url                             object
host_since                   datetime64[ns]
host_about                           object
host_response_rate                   object
host_total_listings_count           float64
host_verifications                   object
host_identity_verified               object
accommodates                        float64
bathrooms                           float64
bedrooms                            float64
beds                                float64
dtype: object

In [None]:
# Debug check (repeats an earlier cell): column dtypes of the host dimension frame.
dim_host_df.dtypes

In [None]:
# Debug check: list host rows that still have no host_id.
dim_host_df.loc[dim_host_df["host_id"].isna()]

In [None]:
# Keep only rows whose host_total_listings_count is not the literal string "f".
# NOTE(review): presumably "f" comes from malformed source rows; once the
# column has been coerced to numbers this filter no longer matches anything -- confirm.
df_total = df_total.loc[df_total["host_total_listings_count"].ne("f")]

In [None]:
# Debug check: display the host dimension frame.
dim_host_df

In [None]:
# Debug check: display the price column of the combined frame.
df_total["price"]

In [None]:
# Debug check (repeats the previous cell): display the price column.
df_total["price"] 

In [None]:
# Build a comma-separated list of the column names that start with "availability".
", ".join(name for name in df_total.columns if name.startswith("availability"))