# Data Types and Data Wrangling

Prof. Daniel de Abreu Pereira Uhr

### Conteúdo

* Data Types
* Data Wrangling



### Referências

* Géron, Aurélien. Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems. O'Reilly Media, 2019.
* Athey, S., & Imbens, G. (2017). "The State of Applied Econometrics: Causality and Policy Evaluation." *Journal of Economic Perspectives*.
* Mullainathan, S., & Spiess, J. (2017). "Machine Learning: An Applied Econometric Approach." *Journal of Economic Perspectives*.
* Varian, H. R. (2014). "Big Data: New Tricks for Econometrics." *Journal of Economic Perspectives*.

# Data Types

In [None]:
import numpy as np
import pandas as pd

In [24]:
# Listings file from the Inside Airbnb Bologna snapshot dated 2021-12-17.
url_listings = "http://data.insideairbnb.com/italy/emilia-romagna/bologna/2021-12-17/visualisations/listings.csv"
df_listings = pd.read_csv(url_listings)

# Calendar file from the same snapshot; it is a gzip-compressed CSV.
# NOTE(review): both reads need network access to data.insideairbnb.com.
url_prices = "http://data.insideairbnb.com/italy/emilia-romagna/bologna/2021-12-17/data/calendar.csv.gz"
df_prices = pd.read_csv(url_prices, compression="gzip")

In [25]:
# Numeric columns support element-wise arithmetic: the difference of two
# integer columns is a new integer Series aligned on the row index.
df_prices['maximum_nights'] - df_prices['minimum_nights']

0           148
1           357
2           357
3           357
4           357
           ... 
1260340    1124
1260341    1124
1260342    1124
1260343    1124
1260344    1124
Length: 1260345, dtype: int64

In [26]:
# NumPy ufuncs apply element-wise to a Series; the natural log of the integer
# price column comes back as float64.
# NOTE(review): a price of 0 would map to -inf — presumably none here; verify.
np.log(df_listings['price'])

0       4.219508
1       3.367296
2       3.912023
3       4.836282
4       3.912023
          ...   
3448    3.465736
3449    3.806662
3450    3.912023
3451    4.897840
3452    4.744932
Name: price, Length: 3453, dtype: float64

In [27]:
# Discretise a numeric column into ordered categories.
# Bins are right-closed by default: (0, 50] -> 'cheap', (50, 100] -> 'ok',
# (100, inf] -> 'expensive'; a price of exactly 50 is therefore 'cheap'.
pd.cut(df_listings['price'], 
       bins = [0, 50, 100, np.inf], 
       labels=['cheap', 'ok', 'expensive'])

0              ok
1           cheap
2           cheap
3       expensive
4           cheap
          ...    
3448        cheap
3449        cheap
3450        cheap
3451    expensive
3452    expensive
Name: price, Length: 3453, dtype: category
Categories (3, object): ['cheap' < 'ok' < 'expensive']

In [28]:
# `+` on two string (object) columns concatenates element-wise. No separator
# is inserted, so the two values run together; a missing value on either
# side makes the result missing.
df_listings['host_name'] + df_listings['neighbourhood']

0                     CarloSanto Stefano
1              EleonoraPorto - Saragozza
2                     PaoloSanto Stefano
3                Anna MariaSanto Stefano
4               ValerioPorto - Saragozza
                      ...               
3448                        IleanaNavile
3449           FernandaPorto - Saragozza
3450                        IleanaNavile
3451        Wonderful ItalySanto Stefano
3452    Wonderful ItalyPorto - Saragozza
Length: 3453, dtype: object

In [29]:
# Remove the currency symbol from the string prices. regex=False makes '$'
# a literal character instead of the regex end-of-string anchor.
# The result is still a column of strings (object dtype).
df_prices['price'].str.replace('$', '', regex=False)

0           70.00
1           68.00
2           68.00
3           68.00
4           68.00
            ...  
1260340    115.00
1260341    115.00
1260342    115.00
1260343    115.00
1260344    115.00
Name: price, Length: 1260345, dtype: object

In [30]:
# Pattern matching on strings: the pattern is a regex by default, so `|`
# means "either spelling". Matching is case-sensitive unless case=False
# is passed.
df_listings['name'].str.contains('centre|center')

0        True
1       False
2        True
3       False
4       False
        ...  
3448    False
3449    False
3450    False
3451    False
3452    False
Name: name, Length: 3453, dtype: bool

In [31]:
# Convert the string prices to numbers: strip the currency symbol and the
# thousands separator, then cast to float.
# Inside a character class `|` is a literal pipe, not alternation, so the
# class only needs to list `$` and `,`.
df_prices['price'].str.replace('[$,]', '', regex=True).astype(float)

0           70.0
1           68.0
2           68.0
3           68.0
4           68.0
           ...  
1260340    115.0
1260341    115.0
1260342    115.0
1260343    115.0
1260344    115.0
Name: price, Length: 1260345, dtype: float64

In [32]:
# Cast the integer identifier to strings; the resulting Series has object dtype.
df_listings['id'].astype(str)

0          42196
1          46352
2          59697
3          85368
4         145779
          ...   
3448    53810648
3449    53820830
3450    53837098
3451    53837654
3452    53854962
Name: id, Length: 3453, dtype: object

In [33]:
# One-hot encode a categorical column: one indicator column per distinct
# neighbourhood, with exactly one flag set in each row.
pd.get_dummies(df_listings['neighbourhood']).head()

Unnamed: 0,Borgo Panigale - Reno,Navile,Porto - Saragozza,San Donato - San Vitale,Santo Stefano,Savena
0,0,0,0,0,1,0
1,0,0,1,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,1,0,0,0


In [34]:
# Dates read from the CSV are plain strings, reported as object dtype ('O').
df_prices['date'].dtypes

dtype('O')

In [35]:
# Parse the date strings into timestamps. The result is stored in a new
# `datetime` column, so the original `date` strings remain available.
df_prices['datetime'] = pd.to_datetime(df_prices['date'])

In [36]:
# The parsed column has the datetime64[ns] dtype (displayed as '<M8[ns]').
df_prices['datetime'].dtypes

dtype('<M8[ns]')

In [37]:
# The `.dt` accessor exposes datetime components; here the calendar year.
df_prices['datetime'].dt.year

0          2021
1          2021
2          2021
3          2021
4          2021
           ... 
1260340    2022
1260341    2022
1260342    2022
1260343    2022
1260344    2022
Name: datetime, Length: 1260345, dtype: int64

In [38]:
# Convert each timestamp to the calendar month containing it (monthly Period).
df_prices['datetime'].dt.to_period('M')

0          2021-12
1          2021-12
2          2021-12
3          2021-12
4          2021-12
            ...   
1260340    2022-12
1260341    2022-12
1260342    2022-12
1260343    2022-12
1260344    2022-12
Name: datetime, Length: 1260345, dtype: period[M]

In [39]:
# Datetime arithmetic: subtracting a three-day Timedelta shifts every
# timestamp three days earlier.
df_prices['datetime'] -  pd.to_timedelta(3, unit='d')

0         2021-12-14
1         2021-12-14
2         2021-12-15
3         2021-12-16
4         2021-12-17
             ...    
1260340   2022-12-09
1260341   2022-12-10
1260342   2022-12-11
1260343   2022-12-12
1260344   2022-12-13
Name: datetime, Length: 1260345, dtype: datetime64[ns]

In [40]:
# Boolean frame of the same shape as the data: True wherever a value is missing.
df_listings.isna().head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True


In [41]:
# Summing the boolean mask column-wise counts the missing values per column.
df_listings.isna().sum()

id                                   0
name                                 0
host_id                              0
host_name                            9
neighbourhood_group               3453
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                        409
reviews_per_month                  409
calculated_host_listings_count       0
availability_365                     0
number_of_reviews_ltm                0
license                           3318
dtype: int64

In [42]:
# By default dropna() removes every row that has at least one missing value.
# A column that is missing in every row therefore empties the whole frame.
df_listings.dropna().shape

(0, 18)

In [43]:
# how='all' only drops rows in which *every* column is missing.
df_listings.dropna(how='all').shape

(3453, 18)

In [44]:
# subset= restricts the missing-value check to the listed columns: only rows
# without a reviews_per_month value are dropped.
df_listings.dropna(subset=['reviews_per_month']).shape

(3044, 18)

In [45]:
# Replace every missing value with a placeholder. A new frame is returned;
# df_listings itself is not modified.
# NOTE: filling numeric columns with a string turns them into object columns.
df_listings.fillna(' -- This was NA  -- ').head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,42196,50 sm Studio in the historic centre,184487,Carlo,-- This was NA --,Santo Stefano,44.48507,11.34786,Entire home/apt,68,3,180,2021-11-12,1.32,1,161,6,-- This was NA --
1,46352,A room in Pasolini's house,467810,Eleonora,-- This was NA --,Porto - Saragozza,44.49168,11.33514,Private room,29,1,300,2021-11-30,2.2,2,248,37,-- This was NA --
2,59697,COZY LARGE BEDROOM in the city center,286688,Paolo,-- This was NA --,Santo Stefano,44.48817,11.34124,Private room,50,1,240,2020-10-04,2.18,2,327,0,-- This was NA --
3,85368,Garden House Bologna,467675,Anna Maria,-- This was NA --,Santo Stefano,44.47834,11.35672,Entire home/apt,126,2,40,2019-11-03,0.34,1,332,0,-- This was NA --
4,145779,SINGLE ROOM,705535,Valerio,-- This was NA --,Porto - Saragozza,44.49306,11.33786,Private room,50,10,69,2021-12-05,0.55,9,365,5,-- This was NA --


In [46]:
# Show what a missing value does to an integer column: NaN is a float, so
# writing it into `host_id` (row 2, column 2) upcasts the column to float64.
# The assignment is made on a copy of the displayed slice so that the shared
# `df_listings` frame is not silently altered for any other cell.
# NOTE(review): recent pandas versions warn when an assignment changes a
# column's dtype implicitly — confirm against the pinned pandas version.
df_listings_na = df_listings.iloc[:3, :3].copy()
df_listings_na.iloc[2, 2] = np.nan
df_listings_na

Unnamed: 0,id,name,host_id
0,42196,50 sm Studio in the historic centre,184487.0
1,46352,A room in Pasolini's house,467810.0
2,59697,COZY LARGE BEDROOM in the city center,


# Data Wrangling

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Listings file from the Inside Airbnb Bologna snapshot dated 2021-12-17.
# NOTE(review): needs network access to data.insideairbnb.com.
url_listings = "http://data.insideairbnb.com/italy/emilia-romagna/bologna/2021-12-17/visualisations/listings.csv"
df_listings = pd.read_csv(url_listings)

In [3]:
# Calendar file from the same snapshot; it is a gzip-compressed CSV.
# NOTE(review): needs network access to data.insideairbnb.com.
url_prices = "http://data.insideairbnb.com/italy/emilia-romagna/bologna/2021-12-17/data/calendar.csv.gz"
df_prices = pd.read_csv(url_prices, compression="gzip")

In [4]:
# Compact overview of the frame: number of rows and columns, dtype counts and
# memory usage. verbose=False omits the per-column listing.
df_listings.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3453 entries, 0 to 3452
Columns: 18 entries, id to license
dtypes: float64(4), int64(8), object(6)
memory usage: 485.7+ KB


In [5]:
# Preview the first five rows.
df_listings.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,42196,50 sm Studio in the historic centre,184487,Carlo,,Santo Stefano,44.48507,11.34786,Entire home/apt,68,3,180,2021-11-12,1.32,1,161,6,
1,46352,A room in Pasolini's house,467810,Eleonora,,Porto - Saragozza,44.49168,11.33514,Private room,29,1,300,2021-11-30,2.2,2,248,37,
2,59697,COZY LARGE BEDROOM in the city center,286688,Paolo,,Santo Stefano,44.48817,11.34124,Private room,50,1,240,2020-10-04,2.18,2,327,0,
3,85368,Garden House Bologna,467675,Anna Maria,,Santo Stefano,44.47834,11.35672,Entire home/apt,126,2,40,2019-11-03,0.34,1,332,0,
4,145779,SINGLE ROOM,705535,Valerio,,Porto - Saragozza,44.49306,11.33786,Private room,50,10,69,2021-12-05,0.55,9,365,5,


In [6]:
# Summary statistics; by default only numeric columns are described.
# Transposed so each variable is a row, then limited to the first five.
df_listings.describe().T[:5]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,3453.0,29502180.0,15239880.0,42196.0,17485970.0,30787070.0,42200940.0,53854960.0
host_id,3453.0,123642400.0,116075600.0,38468.0,25500070.0,88454380.0,200592600.0,435431600.0
neighbourhood_group,0.0,,,,,,,
latitude,3453.0,44.49756,0.01173569,44.4236,44.49186,44.49699,44.50271,44.55093
longitude,3453.0,11.34509,0.01986071,11.232,11.33732,11.34519,11.35406,11.42027


In [7]:
# include='all' also summarises non-numeric columns (unique / top / freq);
# statistics that do not apply to a column's type are left as NaN.
df_listings.describe(include='all').T[:5]

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,3453.0,,,,29502177.118158,15239877.346777,42196.0,17485973.0,30787074.0,42200938.0,53854962.0
name,3453.0,3410.0,"Luxury Industrial Design LOFT, HEPA UV airpuri...",5.0,,,,,,,
host_id,3453.0,,,,123642405.854619,116075571.230048,38468.0,25500072.0,88454378.0,200592620.0,435431590.0
host_name,3444.0,747.0,Andrea,101.0,,,,,,,
neighbourhood_group,0.0,,,,,,,,,,


In [8]:
# Column labels of the frame.
df_listings.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license'],
      dtype='object')

In [9]:
# Row labels of the frame: the default integer RangeIndex created by read_csv.
df_listings.index

RangeIndex(start=0, stop=3453, step=1)

In [10]:
# Selecting a single column by label returns a Series.
df_listings['price']

0        68
1        29
2        50
3       126
4        50
       ... 
3448     32
3449     45
3450     50
3451    134
3452    115
Name: price, Length: 3453, dtype: int64

In [11]:
# Position-based selection: the first seven rows and the columns at positions
# 5 to 8 (the end of an iloc slice is exclusive).
df_listings.iloc[:7, 5:9]

Unnamed: 0,neighbourhood,latitude,longitude,room_type
0,Santo Stefano,44.48507,11.34786,Entire home/apt
1,Porto - Saragozza,44.49168,11.33514,Private room
2,Santo Stefano,44.48817,11.34124,Private room
3,Santo Stefano,44.47834,11.35672,Entire home/apt
4,Porto - Saragozza,44.49306,11.33786,Private room
5,Navile,44.51628,11.33074,Private room
6,Santo Stefano,44.48787,11.35392,Entire home/apt


In [12]:
# `:` on the row axis keeps every row; only the columns are restricted by position.
df_listings.iloc[:, 5:9].head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type
0,Santo Stefano,44.48507,11.34786,Entire home/apt
1,Porto - Saragozza,44.49168,11.33514,Private room
2,Santo Stefano,44.48817,11.34124,Private room
3,Santo Stefano,44.47834,11.35672,Entire home/apt
4,Porto - Saragozza,44.49306,11.33786,Private room


In [13]:
# Label-based selection: choose columns by name with an explicit list.
df_listings.loc[:, ['neighbourhood', 'latitude', 'longitude']].head()

Unnamed: 0,neighbourhood,latitude,longitude
0,Santo Stefano,44.48507,11.34786
1,Porto - Saragozza,44.49168,11.33514
2,Santo Stefano,44.48817,11.34124
3,Santo Stefano,44.47834,11.35672
4,Porto - Saragozza,44.49306,11.33786


In [14]:
# Label slices with .loc include BOTH endpoints, unlike positional slices.
df_listings.loc[:, 'neighbourhood':'room_type'].head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type
0,Santo Stefano,44.48507,11.34786,Entire home/apt
1,Porto - Saragozza,44.49168,11.33514,Private room
2,Santo Stefano,44.48817,11.34124,Private room
3,Santo Stefano,44.47834,11.35672,Entire home/apt
4,Porto - Saragozza,44.49306,11.33786,Private room


In [15]:
# Keep only the numeric (integer and float) columns.
# An entirely missing column still counts as numeric when it is stored as float.
df_listings.select_dtypes(include=['number']).head()

Unnamed: 0,id,host_id,neighbourhood_group,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
0,42196,184487,,44.48507,11.34786,68,3,180,1.32,1,161,6
1,46352,467810,,44.49168,11.33514,29,1,300,2.2,2,248,37
2,59697,286688,,44.48817,11.34124,50,1,240,2.18,2,327,0
3,85368,467675,,44.47834,11.35672,126,2,40,0.34,1,332,0
4,145779,705535,,44.49306,11.33786,50,10,69,0.55,9,365,5


In [16]:
# Row filtering with a boolean mask: keep listings with more than 500 reviews.
# The original row labels are preserved in the result.
df_listings.loc[df_listings['number_of_reviews']>500, :].head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
52,884148,APOSA FLAT / CITY CENTER - BO,4664996,Vie D'Acqua Di Sandra Maria,,Santo Stefano,44.49945,11.34566,Entire home/apt,46,1,668,2021-12-11,6.24,5,252,20,
92,1435627,heart of Bologna Piazza Maggiore,7714013,Carlotta,,Porto - Saragozza,44.49321,11.33569,Entire home/apt,56,2,508,2021-12-12,5.08,1,131,69,
98,1566003,"""i portici di via Piella """,8325248,Massimo,,Santo Stefano,44.49855,11.34411,Entire home/apt,51,2,764,2021-12-14,7.62,3,119,120,
131,2282623,"S.Orsola zone,parking for free and self check-in",11658074,Cecilia,,San Donato - San Vitale,44.49328,11.3665,Entire home/apt,38,1,689,2021-10-24,7.2,1,5,72,
175,3216486,Stanza Privata,16289536,Fabio,,Navile,44.50903,11.342,Private room,82,1,569,2021-12-05,6.93,1,7,5,


In [17]:
# Combine two row conditions with `&` (element-wise AND): listings that have
# more than 300 reviews AND more than 7 reviews per month.
df_listings[
    df_listings['number_of_reviews'].gt(300)
    & df_listings['reviews_per_month'].gt(7)
].head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
98,1566003,"""i portici di via Piella """,8325248,Massimo,,Santo Stefano,44.49855,11.34411,Entire home/apt,51,2,764,2021-12-14,7.62,3,119,120,
131,2282623,"S.Orsola zone,parking for free and self check-in",11658074,Cecilia,,San Donato - San Vitale,44.49328,11.3665,Entire home/apt,38,1,689,2021-10-24,7.2,1,5,72,
204,4166793,Centralissimo a Bologna,8325248,Massimo,,Santo Stefano,44.50092,11.34456,Entire home/apt,71,2,750,2021-12-10,9.21,3,233,84,
751,15508481,Monolocale in zona fiera /centro,99632788,Walid,,Navile,44.514462,11.353731,Entire home/apt,64,1,475,2021-12-01,7.56,1,4,48,
773,15886516,Monolocale nel cuore del ghetto ebraico di Bol...,103024123,Catia,,Santo Stefano,44.49508,11.34722,Entire home/apt,58,1,428,2021-12-15,7.88,1,285,17,


In [18]:
# Distinct values of a column, in order of first appearance.
df_listings['neighbourhood'].unique()

array(['Santo Stefano', 'Porto - Saragozza', 'Navile',
       'San Donato - San Vitale', 'Savena', 'Borgo Panigale - Reno'],
      dtype=object)

In [19]:
# Distinct (neighbourhood, room_type) combinations; the first occurrence of
# each pair is kept together with its original row label.
df_listings[['neighbourhood', 'room_type']].drop_duplicates()

Unnamed: 0,neighbourhood,room_type
0,Santo Stefano,Entire home/apt
1,Porto - Saragozza,Private room
2,Santo Stefano,Private room
5,Navile,Private room
7,Navile,Entire home/apt
8,Porto - Saragozza,Entire home/apt
19,San Donato - San Vitale,Private room
24,Savena,Private room
36,Borgo Panigale - Reno,Entire home/apt
41,San Donato - San Vitale,Entire home/apt


In [20]:
# Split-apply-combine: average price and reviews per month per neighbourhood.
# Missing values are skipped when computing each mean.
df_listings.groupby('neighbourhood')[['price', 'reviews_per_month']].mean()

Unnamed: 0_level_0,price,reviews_per_month
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Borgo Panigale - Reno,83.020548,0.983488
Navile,142.200993,1.156745
Porto - Saragozza,129.908312,1.340325
San Donato - San Vitale,91.618138,0.933011
Santo Stefano,119.441841,1.34481
Savena,69.626016,0.805888


In [21]:
# Different aggregations per column via a dict of lists; the result has
# two-level column labels (column name, aggregation name).
# Aggregations are given by their string names: passing NumPy callables such
# as np.max is deprecated in recent pandas, and the label they produce
# depends on the installed NumPy version.
df_listings.groupby('neighbourhood').agg({"reviews_per_month": ["mean"],
                                          "price": ["min", "max"]}).reset_index()

Unnamed: 0_level_0,neighbourhood,reviews_per_month,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,amax
0,Borgo Panigale - Reno,0.983488,9,1429
1,Navile,1.156745,14,5000
2,Porto - Saragozza,1.340325,7,9999
3,San Donato - San Vitale,0.933011,10,1600
4,Santo Stefano,1.34481,11,9999
5,Savena,0.805888,9,680


In [22]:
# Named aggregation: each keyword is an output column defined as
# (input column, aggregation). This yields flat, self-describing column names.
# "max" is given as a string because passing np.max is deprecated in recent
# pandas; the result is the same.
df_listings.groupby('neighbourhood').agg(mean_reviews=("reviews_per_month", "mean"),
                                         min_price=("price", "min"),
                                         max_price=("price", "max")).reset_index()

Unnamed: 0,neighbourhood,mean_reviews,min_price,max_price
0,Borgo Panigale - Reno,0.983488,9,1429
1,Navile,1.156745,14,5000
2,Porto - Saragozza,1.340325,7,9999
3,San Donato - San Vitale,0.933011,10,1600
4,Santo Stefano,1.34481,11,9999
5,Savena,0.805888,9,680


In [23]:
# Two-way summary: mean price for every neighbourhood (rows) by room type
# (columns). Combinations that have no listings show up as NaN.
df_listings.pivot_table(index='neighbourhood', columns='room_type', values='price', aggfunc='mean')

room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Borgo Panigale - Reno,96.700935,,45.487179,
Navile,172.14,1350.0,68.416107,28.0
Porto - Saragozza,148.410926,102.375,83.070234,16.5
San Donato - San Vitale,106.775,55.0,61.19403,59.0
Santo Stefano,129.99026,103.827586,80.734177,95.4
Savena,86.30137,,46.229167,22.5


## Data Wrangling: Sorting, Combining, Reshaping, and Window Operations

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Listings file from the Inside Airbnb Bologna snapshot dated 2021-12-17.
url_listings = "http://data.insideairbnb.com/italy/emilia-romagna/bologna/2021-12-17/visualisations/listings.csv"
df_listings = pd.read_csv(url_listings)

# Calendar file from the same snapshot; it is a gzip-compressed CSV.
# NOTE(review): both reads need network access to data.insideairbnb.com.
url_prices = "http://data.insideairbnb.com/italy/emilia-romagna/bologna/2021-12-17/data/calendar.csv.gz"
df_prices = pd.read_csv(url_prices, compression="gzip")

In [3]:
# Sort on several keys with a direction per key: name descending, then price
# ascending to break ties. Rows with a missing sort key are placed last.
# A sorted copy is returned; df_listings keeps its original order.
df_listings.sort_values(by=['name', 'price'], 
                        ascending=[False, True], 
                        na_position='last').head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
2280,38601411,🏡Giardino di Annabella-relax in città-casa intera,240803020,Annabella,,Porto - Saragozza,44.49303,11.31986,Entire home/apt,90,2,53,2021-12-13,1.96,1,76,27,392901.0
2988,48177313,❤ Romantic Suite with SPA Bath ❤ 4starbologna.com,239491712,4 Star Bologna,,Santo Stefano,44.50271,11.34998,Entire home/apt,309,1,1,2021-03-14,0.11,14,344,1,
3302,52367336,✨House of Alchemy✨,140013413,Greta,,Porto - Saragozza,44.49072,11.3089,Entire home/apt,96,2,7,2021-11-28,3.18,1,88,7,
2039,34495335,♥ Romantic for Couple in Love ♥ | 4 Star Boutique,239491712,4 Star Bologna,,Santo Stefano,44.50368,11.34972,Entire home/apt,143,1,25,2021-08-20,0.79,14,262,6,
2964,47866124,♡Amazing Suite with Private SPA ♡ 4starbologna...,239491712,4 Star Bologna,,Santo Stefano,44.50381,11.34951,Entire home/apt,347,1,2,2021-10-17,0.72,14,337,2,


In [4]:
# Rename columns with an {old: new} mapping. The renamed frame is only
# displayed, not assigned, so df_listings keeps its original column names.
df_listings.rename(columns={'name': 'listing_name', 
                            'id': 'listing_id'}).head()

Unnamed: 0,listing_id,listing_name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,42196,50 sm Studio in the historic centre,184487,Carlo,,Santo Stefano,44.48507,11.34786,Entire home/apt,68,3,180,2021-11-12,1.32,1,161,6,
1,46352,A room in Pasolini's house,467810,Eleonora,,Porto - Saragozza,44.49168,11.33514,Private room,29,1,300,2021-11-30,2.2,2,248,37,
2,59697,COZY LARGE BEDROOM in the city center,286688,Paolo,,Santo Stefano,44.48817,11.34124,Private room,50,1,240,2020-10-04,2.18,2,327,0,
3,85368,Garden House Bologna,467675,Anna Maria,,Santo Stefano,44.47834,11.35672,Entire home/apt,126,2,40,2019-11-03,0.34,1,332,0,
4,145779,SINGLE ROOM,705535,Valerio,,Porto - Saragozza,44.49306,11.33786,Private room,50,10,69,2021-12-05,0.55,9,365,5,


In [5]:
# Contingency table: number of listings for each neighbourhood / room type
# pair. Combinations that never occur are reported as 0.
pd.crosstab(df_listings['neighbourhood'], df_listings['room_type'])

room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Borgo Panigale - Reno,107,0,39,0
Navile,250,3,149,1
Porto - Saragozza,842,16,299,10
San Donato - San Vitale,280,1,134,4
Santo Stefano,924,29,237,5
Savena,73,0,48,2


In [6]:
# Aggregation collapses each group to a single row: one mean per
# neighbourhood for each selected column.
df_listings.groupby('neighbourhood')[['price', 'reviews_per_month']].mean()

Unnamed: 0_level_0,price,reviews_per_month
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Borgo Panigale - Reno,83.020548,0.983488
Navile,142.200993,1.156745
Porto - Saragozza,129.908312,1.340325
San Donato - San Vitale,91.618138,0.933011
Santo Stefano,119.441841,1.34481
Savena,69.626016,0.805888


In [7]:
# Named aggregation: each keyword is an output column defined as
# (input column, aggregation).
# "max" is given as a string because passing np.max is deprecated in recent
# pandas; the result is the same.
df_listings.groupby('neighbourhood').agg(mean_reviews=("reviews_per_month", "mean"),
                                         min_price=("price", "min"),
                                         max_price=("price", "max")).reset_index()

Unnamed: 0,neighbourhood,mean_reviews,min_price,max_price
0,Borgo Panigale - Reno,0.983488,9,1429
1,Navile,1.156745,14,5000
2,Porto - Saragozza,1.340325,7,9999
3,San Donato - San Vitale,0.933011,10,1600
4,Santo Stefano,1.34481,11,9999
5,Savena,0.805888,9,680


In [8]:
# transform() computes the group statistic but keeps the original shape:
# every row receives the mean of its own neighbourhood, aligned on the
# original row index (handy for adding group-level columns).
df_listings.groupby('neighbourhood')[['price', 'reviews_per_month']].transform('mean').head()

Unnamed: 0,price,reviews_per_month
0,119.441841,1.34481
1,129.908312,1.340325
2,119.441841,1.34481
3,119.441841,1.34481
4,129.908312,1.340325


In [9]:
# First piece for the concatenation example: the first 2000 rows.
df_listings1 = df_listings.iloc[:2000]
df_listings1.shape

(2000, 18)

In [10]:
# Second piece: everything from row 1000 onwards. It overlaps the first
# piece on rows 1000-1999.
df_listings2 = df_listings.iloc[1000:]
df_listings2.shape

(2453, 18)

In [11]:
# concat() stacks the frames row-wise without de-duplicating: rows present
# in both pieces appear twice, so the row counts simply add up.
pd.concat([df_listings1, df_listings2]).shape

(4453, 18)

In [12]:
# Inner join of listings (left) and calendar rows (right) on the listing
# identifier, which is called `id` on the left and `listing_id` on the right.
# Columns that exist in both frames get the suffixes _x (left) and _y (right).
df_merged = df_listings.merge(
    df_prices,
    how='inner',
    left_on='id',
    right_on='listing_id',
)
df_merged.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price_x,...,availability_365,number_of_reviews_ltm,license,listing_id,date,available,price_y,adjusted_price,minimum_nights_y,maximum_nights
0,42196,50 sm Studio in the historic centre,184487,Carlo,,Santo Stefano,44.48507,11.34786,Entire home/apt,68,...,161,6,,42196,2021-12-17,f,$68.00,$68.00,3,360
1,42196,50 sm Studio in the historic centre,184487,Carlo,,Santo Stefano,44.48507,11.34786,Entire home/apt,68,...,161,6,,42196,2021-12-18,f,$68.00,$68.00,3,360
2,42196,50 sm Studio in the historic centre,184487,Carlo,,Santo Stefano,44.48507,11.34786,Entire home/apt,68,...,161,6,,42196,2021-12-19,f,$68.00,$68.00,3,360
3,42196,50 sm Studio in the historic centre,184487,Carlo,,Santo Stefano,44.48507,11.34786,Entire home/apt,68,...,161,6,,42196,2021-12-20,f,$68.00,$68.00,3,360
4,42196,50 sm Studio in the historic centre,184487,Carlo,,Santo Stefano,44.48507,11.34786,Entire home/apt,68,...,161,6,,42196,2021-12-21,f,$68.00,$68.00,3,360


In [13]:
# Long format: one row per (neighbourhood, date) with the average price.
# `price_x` is the price column that came from the left-hand (listings)
# frame of the merge.
df_long = (
    df_merged
    .groupby(['neighbourhood', 'date'])['price_x']
    .mean()
    .reset_index()
)
df_long.head()

Unnamed: 0,neighbourhood,date,price_x
0,Borgo Panigale - Reno,2021-12-17,83.020548
1,Borgo Panigale - Reno,2021-12-18,83.020548
2,Borgo Panigale - Reno,2021-12-19,83.020548
3,Borgo Panigale - Reno,2021-12-20,83.020548
4,Borgo Panigale - Reno,2021-12-21,83.020548


In [14]:
# Long -> wide: one row per date and one column per neighbourhood.
# No `values=` is given, so the value column name stays as an outer column
# level and the result has two-level column labels.
df_wide = (
    df_long
    .pivot(index='date', columns='neighbourhood')
    .reset_index()
)
df_wide.head()

Unnamed: 0_level_0,date,price_x,price_x,price_x,price_x,price_x,price_x
neighbourhood,Unnamed: 1_level_1,Borgo Panigale - Reno,Navile,Porto - Saragozza,San Donato - San Vitale,Santo Stefano,Savena
0,2021-12-17,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016
1,2021-12-18,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016
2,2021-12-19,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016
3,2021-12-20,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016
4,2021-12-21,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016


In [15]:
# Wide -> long again: `date` is kept as the identifier and every other column
# becomes a row. Each level of the two-level column labels turns into its own
# variable column; the unnamed outer level is shown as `None`.
df_wide.melt(id_vars='date', value_name='price').head()

Unnamed: 0,date,None,neighbourhood,price
0,2021-12-17,price_x,Borgo Panigale - Reno,83.020548
1,2021-12-18,price_x,Borgo Panigale - Reno,83.020548
2,2021-12-19,price_x,Borgo Panigale - Reno,83.020548
3,2021-12-20,price_x,Borgo Panigale - Reno,83.020548
4,2021-12-21,price_x,Borgo Panigale - Reno,83.020548


In [16]:
# Flatten the two-level column labels into plain strings by gluing the levels
# together (e.g. 'price_x' + 'Navile' -> 'price_xNavile'). A copy is used so
# that df_wide keeps its original labels.
df_wide2 = df_wide.copy()
df_wide2.columns = df_wide2.columns.map(''.join)
df_wide2.head()

Unnamed: 0,date,price_xBorgo Panigale - Reno,price_xNavile,price_xPorto - Saragozza,price_xSan Donato - San Vitale,price_xSanto Stefano,price_xSavena
0,2021-12-17,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016
1,2021-12-18,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016
2,2021-12-19,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016
3,2021-12-20,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016
4,2021-12-21,83.020548,142.200993,129.908312,91.618138,119.441841,69.626016


In [17]:
# Wide -> long using the column-name structure: every column that starts
# with the stub 'price_x' is stacked, and the remainder of its name becomes
# the value of the new index level `j`.
# `suffix` is a regex for that remainder; \D+ (one or more non-digits) is
# needed because the default only matches numeric suffixes. The pattern is a
# raw string so that the backslash reaches the regex engine unchanged instead
# of being an invalid string escape.
pd.wide_to_long(df_wide2, stubnames='price_x', i='date', j='neighborhood', suffix=r'\D+').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,price_x
date,neighborhood,Unnamed: 2_level_1
2021-12-17,Borgo Panigale - Reno,83.020548
2021-12-18,Borgo Panigale - Reno,83.020548
2021-12-19,Borgo Panigale - Reno,83.020548
2021-12-20,Borgo Panigale - Reno,83.020548
2021-12-21,Borgo Panigale - Reno,83.020548


In [18]:
# Build a listing-by-month panel of average prices for the window examples.
# A copy is used so the raw calendar frame is left untouched.
temp = df_prices.copy()

# Prices are strings: strip the currency symbol and thousands separator
# before casting. Inside a character class `|` is a literal pipe rather than
# alternation, so only `$` and `,` are listed.
temp['price'] = temp['price'].str.replace('[$,]', '', regex=True).astype(float)

# Collapse daily dates to calendar months.
temp['date'] = pd.to_datetime(temp['date']).dt.to_period('M')

# Mean price per listing and month, ordered by listing (descending) and then
# chronologically within each listing.
temp = (
    temp.groupby(['listing_id', 'date'])['price']
    .mean()
    .reset_index()
    .sort_values(by=['listing_id', 'date'], ascending=[False, True])
)
temp.head()

Unnamed: 0,listing_id,date,price
44876,53854962,2021-12,147.4
44877,53854962,2022-01,137.645161
44878,53854962,2022-02,124.642857
44879,53854962,2022-03,285.096774
44880,53854962,2022-04,115.0


In [19]:
# Lag: shift(1) moves the values down one row, so each row holds the price of
# the row above it and the very first row is NaN.
# Applied to the whole column, the shift ignores listing boundaries: the
# first row of a listing receives the last price of the listing sorted just
# before it. Group by listing_id before shifting to avoid that.
temp['price1'] = temp['price'].shift(1)
temp.head(15)

Unnamed: 0,listing_id,date,price,price1
44876,53854962,2021-12,147.4,
44877,53854962,2022-01,137.645161,147.4
44878,53854962,2022-02,124.642857,137.645161
44879,53854962,2022-03,285.096774,124.642857
44880,53854962,2022-04,115.0,285.096774
44881,53854962,2022-05,115.0,115.0
44882,53854962,2022-06,115.0,115.0
44883,53854962,2022-07,115.0,115.0
44884,53854962,2022-08,115.0,115.0
44885,53854962,2022-09,115.0,115.0


In [20]:
# Lag within each listing: the shift is applied group by group, so the first
# month of every listing is NaN and no value leaks from one listing to another.
temp['price1'] = temp.groupby('listing_id')['price'].shift(1)
temp.head(15)

Unnamed: 0,listing_id,date,price,price1
44876,53854962,2021-12,147.4,
44877,53854962,2022-01,137.645161,147.4
44878,53854962,2022-02,124.642857,137.645161
44879,53854962,2022-03,285.096774,124.642857
44880,53854962,2022-04,115.0,285.096774
44881,53854962,2022-05,115.0,115.0
44882,53854962,2022-06,115.0,115.0
44883,53854962,2022-07,115.0,115.0
44884,53854962,2022-08,115.0,115.0
44885,53854962,2022-09,115.0,115.0


In [21]:
# Expanding window: the running mean of all prices from the first row up to
# and including the current row.
# Computed on the whole column, the running mean is NOT reset at the start of
# a new listing; use groupby('listing_id') for a per-listing running mean.
temp['avg_cum_price'] = temp['price'].expanding().mean()
temp.head(15)

Unnamed: 0,listing_id,date,price,price1,avg_cum_price
44876,53854962,2021-12,147.4,,147.4
44877,53854962,2022-01,137.645161,147.4,142.522581
44878,53854962,2022-02,124.642857,137.645161,136.562673
44879,53854962,2022-03,285.096774,124.642857,173.696198
44880,53854962,2022-04,115.0,285.096774,161.956959
44881,53854962,2022-05,115.0,115.0,154.130799
44882,53854962,2022-06,115.0,115.0,148.540685
44883,53854962,2022-07,115.0,115.0,144.348099
44884,53854962,2022-08,115.0,115.0,141.087199
44885,53854962,2022-09,115.0,115.0,138.478479


In [22]:
# Running mean within each listing. The grouped result is indexed by
# (listing_id, original row label); reset_index(level=0) turns listing_id back
# into a column. Groups are returned sorted by key, so the smallest
# listing_id comes first.
temp.groupby('listing_id')['price'].expanding().mean().reset_index(level=0).head(15)

Unnamed: 0,listing_id,price
0,42196,68.0
1,42196,68.0
2,42196,68.0
3,42196,68.0
4,42196,68.0
5,42196,68.0
6,42196,68.0
7,42196,68.0
8,42196,68.0
9,42196,68.0


In [23]:
# Rolling window: mean of the current and the two preceding monthly prices.
# The window is applied within each listing so that it never mixes prices of
# different listings; the first two months of every listing are NaN because
# the window is not yet full.
# The grouped result is indexed by (listing_id, original row label); dropping
# the listing_id level restores the original labels so the assignment aligns
# row by row.
temp['avg3_price'] = (
    temp.groupby('listing_id')['price']
    .rolling(3)
    .mean()
    .reset_index(level=0, drop=True)
)
temp.head(15)

Unnamed: 0,listing_id,date,price,price1,avg_cum_price,avg3_price
44876,53854962,2021-12,147.4,,147.4,
44877,53854962,2022-01,137.645161,147.4,142.522581,
44878,53854962,2022-02,124.642857,137.645161,136.562673,136.562673
44879,53854962,2022-03,285.096774,124.642857,173.696198,182.461598
44880,53854962,2022-04,115.0,285.096774,161.956959,174.91321
44881,53854962,2022-05,115.0,115.0,154.130799,171.698925
44882,53854962,2022-06,115.0,115.0,148.540685,115.0
44883,53854962,2022-07,115.0,115.0,144.348099,115.0
44884,53854962,2022-08,115.0,115.0,141.087199,115.0
44885,53854962,2022-09,115.0,115.0,138.478479,115.0
