In [30]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import math
import pylab
import scipy.stats as stats
%matplotlib inline

In [31]:
# Loading calendar dataset
file_path = "Resources/calendar_with_zips.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,date,id,host_id,zip,price_y,adjusted_price
0,0,2022-09-07,2539,2787,11218,$299.00,$299.00
1,1,2022-09-08,2539,2787,11218,$299.00,$299.00
2,2,2022-09-09,2539,2787,11218,$299.00,$299.00
3,3,2022-09-10,2539,2787,11218,$299.00,$299.00
4,4,2022-09-11,2539,2787,11218,$299.00,$299.00


In [32]:
# Determine if there are any missing values in the data
df.count()

Unnamed: 0        14551462
date              14551462
id                14551462
host_id           14551462
zip               14551462
price_y           14551120
adjusted_price    14551120
dtype: int64

In [33]:
# Determine if there are any missing values in the data
df.isnull().sum()

Unnamed: 0          0
date                0
id                  0
host_id             0
zip                 0
price_y           342
adjusted_price    342
dtype: int64

In [34]:
# Drop the null rows
df = df.dropna()
df

Unnamed: 0.1,Unnamed: 0,date,id,host_id,zip,price_y,adjusted_price
0,0,2022-09-07,2539,2787,11218,$299.00,$299.00
1,1,2022-09-08,2539,2787,11218,$299.00,$299.00
2,2,2022-09-09,2539,2787,11218,$299.00,$299.00
3,3,2022-09-10,2539,2787,11218,$299.00,$299.00
4,4,2022-09-11,2539,2787,11218,$299.00,$299.00
...,...,...,...,...,...,...,...
14551457,14551457,2023-09-03,48971505,46201,10282,$40.00,$40.00
14551458,14551458,2023-09-04,48971505,46201,10282,$40.00,$40.00
14551459,14551459,2023-09-05,48971505,46201,10282,$40.00,$40.00
14551460,14551460,2023-09-06,48971505,46201,10282,$40.00,$40.00


In [35]:
# Determine if there are any missing values in the data
df.isnull().sum()

Unnamed: 0        0
date              0
id                0
host_id           0
zip               0
price_y           0
adjusted_price    0
dtype: int64

In [36]:
# Fill in the empty rows with "0"
df=df.fillna(0)
df

Unnamed: 0.1,Unnamed: 0,date,id,host_id,zip,price_y,adjusted_price
0,0,2022-09-07,2539,2787,11218,$299.00,$299.00
1,1,2022-09-08,2539,2787,11218,$299.00,$299.00
2,2,2022-09-09,2539,2787,11218,$299.00,$299.00
3,3,2022-09-10,2539,2787,11218,$299.00,$299.00
4,4,2022-09-11,2539,2787,11218,$299.00,$299.00
...,...,...,...,...,...,...,...
14551457,14551457,2023-09-03,48971505,46201,10282,$40.00,$40.00
14551458,14551458,2023-09-04,48971505,46201,10282,$40.00,$40.00
14551459,14551459,2023-09-05,48971505,46201,10282,$40.00,$40.00
14551460,14551460,2023-09-06,48971505,46201,10282,$40.00,$40.00


In [37]:
# Determine data types for the DataFrame.
df.dtypes

Unnamed: 0         int64
date              object
id                 int64
host_id            int64
zip                int64
price_y           object
adjusted_price    object
dtype: object

In [38]:
# Convert "price_y" from object to int
df["price"] = df["price_y"].str.replace('[\$\,]', '', regex=True)
df["price"]= pd.to_numeric(df["price"])
df["price"]

0           299.0
1           299.0
2           299.0
3           299.0
4           299.0
            ...  
14551457     40.0
14551458     40.0
14551459     40.0
14551460     40.0
14551461     40.0
Name: price, Length: 14551120, dtype: float64

In [39]:
# Convert "adjust_price" from object to int
df["AdjustedPrice"] = df["adjusted_price"].str.replace('[\$\,]', '', regex=True)
df["AdjustedPrice"]= pd.to_numeric(df["AdjustedPrice"])
df["AdjustedPrice"]

0           299.0
1           299.0
2           299.0
3           299.0
4           299.0
            ...  
14551457     40.0
14551458     40.0
14551459     40.0
14551460     40.0
14551461     40.0
Name: AdjustedPrice, Length: 14551120, dtype: float64

In [40]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,id,host_id,zip,price_y,adjusted_price,price,AdjustedPrice
0,0,2022-09-07,2539,2787,11218,$299.00,$299.00,299.0,299.0
1,1,2022-09-08,2539,2787,11218,$299.00,$299.00,299.0,299.0
2,2,2022-09-09,2539,2787,11218,$299.00,$299.00,299.0,299.0
3,3,2022-09-10,2539,2787,11218,$299.00,$299.00,299.0,299.0
4,4,2022-09-11,2539,2787,11218,$299.00,$299.00,299.0,299.0


In [41]:
# Remove the price_y Column
df.drop(columns=["price_y"], inplace=True)
#Remove the adjusted_price Column
df.drop(columns=["adjusted_price"], inplace=True)

df.head()

Unnamed: 0.1,Unnamed: 0,date,id,host_id,zip,price,AdjustedPrice
0,0,2022-09-07,2539,2787,11218,299.0,299.0
1,1,2022-09-08,2539,2787,11218,299.0,299.0
2,2,2022-09-09,2539,2787,11218,299.0,299.0
3,3,2022-09-10,2539,2787,11218,299.0,299.0
4,4,2022-09-11,2539,2787,11218,299.0,299.0


In [42]:
# Saving cleaned data
file_path = "Resources/calendar_data_cleaned.csv"
df.to_csv(file_path, index=False)
