In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
df = pd.read_csv('../../datasets/csv_datasets/listings.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39160 entries, 0 to 39159
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              39160 non-null  int64  
 1   name                            39160 non-null  object 
 2   host_id                         39160 non-null  int64  
 3   host_name                       39155 non-null  object 
 4   neighbourhood_group             39160 non-null  object 
 5   neighbourhood                   39160 non-null  object 
 6   latitude                        39160 non-null  float64
 7   longitude                       39160 non-null  float64
 8   room_type                       39160 non-null  object 
 9   price                           39160 non-null  int64  
 10  minimum_nights                  39160 non-null  int64  
 11  number_of_reviews               39160 non-null  int64  
 12  last_review                     

In [4]:
df.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,240,30,49,2022-06-21,0.29,3,365,0,
1,5121,Rental unit in Brooklyn · ★4.52 · 1 bedroom · ...,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68535,-73.95512,Private room,66,30,50,2019-12-02,0.28,2,120,0,
2,9357,Rental unit in New York · ★4.52 · Studio · 1 b...,30193,Tommi,Manhattan,Hell's Kitchen,40.76724,-73.98664,Entire home/apt,175,45,58,2017-08-13,0.34,1,340,0,


### 1 Rename and modify:
- 'room_type' column as 'bnb_type';
- 'availability_365' column as 'availability_rate_365', calculate avoilability rate;

In [5]:
df = df.rename({'room_type': 'bnb_type', 'availability_365': 'availability_rate_365'}, axis="columns")

In [6]:
df['availability_rate_365'] = round(df['availability_rate_365'] / 365, 2)

In [7]:
df.head(1)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,bnb_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_rate_365,number_of_reviews_ltm,license
0,2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,240,30,49,2022-06-21,0.29,3,1.0,0,


## 2. Drop column: 
- last_review;
- reviews_per_month;
- calculated_host_listings_count;
- number_of_reviews_ltm;

In [8]:
df = df.drop(labels=["last_review", "reviews_per_month", "calculated_host_listings_count", "number_of_reviews_ltm"], axis="columns")
df.head(1)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,bnb_type,price,minimum_nights,number_of_reviews,availability_rate_365,license
0,2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,240,30,49,1.0,


## X. GROUP BnB  
After an analysis i've checked that more bnb with same host_id, lon, lat, name have different price.

In [9]:
df.groupby(['host_id', 'latitude', 'longitude', 'name']).size().reset_index().iloc[:, :].sort_values(by=0).tail(30)

Unnamed: 0,host_id,latitude,longitude,name,0
21194,107434423,40.730657,-73.988651,Rental unit in New York · 1 bedroom · 1 bed · ...,6
21394,107434423,40.780085,-73.947499,Rental unit in New York · 1 bedroom · 1 bed · ...,6
21264,107434423,40.744891,-73.95684,Rental unit in Queens · 1 bedroom · 1 bed · 1 ...,6
21300,107434423,40.753011,-73.995934,Rental unit in New York · 1 bedroom · 1 bed · ...,6
21329,107434423,40.760436,-73.968523,Rental unit in New York · 2 bedrooms · 2 beds ...,6
24183,162280872,40.772427,-73.954552,Serviced apartment in New York · 1 bedroom · 1...,6
21322,107434423,40.759461,-73.97122,Rental unit in New York · ★New · 1 bedroom · 1...,6
20446,95459395,40.761532,-73.998779,Serviced apartment in New York · 1 bedroom · 1...,7
21108,107434423,40.7056,-74.00878,Rental unit in New York · 1 bedroom · 1 bed · ...,7
24176,162280872,40.770791,-73.957501,Serviced apartment in New York · 3 bedrooms · ...,7


## 3. Retrieve the columns:
- 'name'
- 'rate';
- 'room_type'
- 'n_beds'
- 'n_baths'
- 'is_shared_bath'

In [10]:
name = df["name"].str.extract(r'([\w\s]+) ·')
name.head()

Unnamed: 0,0
0,Rental unit in New York
1,Rental unit in Brooklyn
2,Rental unit in New York
3,Rental unit in Brooklyn
4,Rental unit in Brooklyn


In [11]:
rate = df["name"].str.extract(r'★([\d\w\.]{0,4}) ·')
rate.head()

Unnamed: 0,0
0,4.68
1,4.52
2,4.52
3,4.58
4,4.65


Some rows in 'name' column have a non-numeric rate or don't have a rate.

In [12]:
df.loc[rate[0] == "New",'name'] = df.loc[rate[0] == "New",'name'].str.replace('★[\w]{0,4}', "★0.00", regex=True)

In [13]:
df.loc[rate[0] == "New", 'name'].head(10)

3316     Home in Queens · ★0.00 · 1 bedroom · 1 bed · 1...
3412     Rental unit in New York · ★0.00 · 1 bedroom · ...
3763     Home in Brooklyn · ★0.00 · 5 bedrooms · 1 bed ...
8217     Rental unit in Queens · ★0.00 · 1 bedroom · 1 ...
8906     Condo in New York · ★0.00 · 1 bedroom · 2 beds...
11685    Rental unit in Brooklyn · ★0.00 · Studio · 1 b...
11821    Rental unit in New York · ★0.00 · 1 bedroom · ...
13294    Guesthouse in Brooklyn · ★0.00 · 1 bedroom · 2...
14479    Rental unit in Brooklyn · ★0.00 · Studio · 1 bath
14556    Rental unit in New York · ★0.00 · 2 bedrooms ·...
Name: name, dtype: object

Add '★0.00' in 'name' column of the dataframe where rate value is missing

In [14]:
name_list = [s.split("·", 2) for s in df['name']]
temp_list = name_list.copy()

for i,l in enumerate(name_list):
    s = np.where("★" in l[1], "·".join(l), "·".join( [l[0], " ★0.00 "] + l[1:] ) )
    temp_list[i] = str(s)

In [15]:
df['name'] = temp_list

Rate recalculation from clean dataframe

In [16]:
rate = df["name"].str.extract(r'★([\d\w\.]{0,4}) ·')
rate[0].value_counts()

0
0.00    17232
5.0      3971
4.67      804
4.75      681
4.83      611
        ...  
3.77        1
4.02        1
3.30        1
3.36        1
3.70        1
Name: count, Length: 163, dtype: int64

In [17]:
room_type = df["name"].str.extract(r'★[\d\.]{1,4} · ([\d\s\w]+)')
room_type[0] = room_type[0].str.rstrip()
room_type.head()

Unnamed: 0,0
0,Studio
1,1 bedroom
2,Studio
3,2 bedrooms
4,1 bedroom


In [18]:
room_type[0].value_counts()

0
1 bedroom      26013
2 bedrooms      6300
Studio          3281
3 bedrooms      2365
4 bedrooms       800
5 bedrooms       247
6 bedrooms        62
1 bed             34
7 bedrooms        19
9 bedrooms        11
2 beds            10
8 bedrooms         6
3 beds             3
10 bedrooms        2
11 bedrooms        2
14 bedrooms        1
21 bedrooms        1
26 bedrooms        1
2 baths            1
15 bedrooms        1
Name: count, dtype: int64

For all the bnb without a room type replace the type with '1 bedroom'

In [19]:
room_type[0] = room_type[0].replace({"2 baths": "1 bedroom", "1 bed":"1 bedroom", "2 beds":"1 bedroom", "3 beds":"1 bedroom"})

In [20]:
room_type[0].value_counts()

0
1 bedroom      26061
2 bedrooms      6300
Studio          3281
3 bedrooms      2365
4 bedrooms       800
5 bedrooms       247
6 bedrooms        62
7 bedrooms        19
9 bedrooms        11
8 bedrooms         6
10 bedrooms        2
11 bedrooms        2
14 bedrooms        1
21 bedrooms        1
26 bedrooms        1
15 bedrooms        1
Name: count, dtype: int64

In [21]:
n_beds = df['name'].str.extract(r'([\d\.]+) bed[s]?\s*')

For the rows without beds the default value is 0

In [22]:
n_beds.loc[n_beds[0].isna(), 0] = n_beds.loc[n_beds[0].isna(), 0].replace({np.nan: 0})

In [23]:
n_beds[0].value_counts()

0
1     28491
2      6908
3      2485
4       818
5       249
0       102
6        63
7        19
9        11
8         6
10        2
11        2
14        1
21        1
26        1
15        1
Name: count, dtype: int64

In [24]:
n_baths = df['name'].str.extract(r'([\d\.]*)[ shared| private]? [Half-]?bath[s]?[\s]*')

In [25]:
# df.loc[n_baths[0].isna(), 'name']
n_baths[0].value_counts()

0
1       20317
        13631
2        2907
1.5      1104
3         387
2.5       361
4         138
3.5        78
0          55
4.5        37
5          15
6           6
6.5         4
5.5         4
15.5        1
7           1
8.5         1
7.5         1
8           1
Name: count, dtype: int64

There are rows with only "bath", "shared Half-bath" or other stuff like this. In this case I consider 0.5 when there is "Half|half", 0 otherwise.

In [26]:
half_bath = df.loc[n_baths[0].isna(), 'name'].str.contains(r'[Shared |Private ]?[Half-|half-]?bath')
n_baths.loc[n_baths[0].isna(), 0] = np.where(half_bath, '0.5', '0')

In [27]:
n_baths[0].value_counts()

0
1       20317
        13631
2        2907
1.5      1104
3         387
2.5       361
4         138
0         100
3.5        78
0.5        66
4.5        37
5          15
6           6
6.5         4
5.5         4
15.5        1
7           1
8.5         1
7.5         1
8           1
Name: count, dtype: int64

In [28]:
is_bath_shared = np.where( df['name'].str.contains('shared|Shared', regex=True), True , False)
len(is_bath_shared[is_bath_shared == True])

10820

## 6. Update dataframe with new columns

In [29]:
df['name'] = name

In [30]:
df['rate'] = rate

In [31]:
df['room_type'] = room_type

In [32]:
df['n_beds'] = n_beds

In [33]:
df['n_baths'] = n_baths

In [34]:
df['is_bath_shared'] = is_bath_shared

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39160 entries, 0 to 39159
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     39160 non-null  int64  
 1   name                   39160 non-null  object 
 2   host_id                39160 non-null  int64  
 3   host_name              39155 non-null  object 
 4   neighbourhood_group    39160 non-null  object 
 5   neighbourhood          39160 non-null  object 
 6   latitude               39160 non-null  float64
 7   longitude              39160 non-null  float64
 8   bnb_type               39160 non-null  object 
 9   price                  39160 non-null  int64  
 10  minimum_nights         39160 non-null  int64  
 11  number_of_reviews      39160 non-null  int64  
 12  availability_rate_365  39160 non-null  float64
 13  license                3879 non-null   object 
 14  rate                   39160 non-null  object 
 15  ro

In [36]:
# df['license'] = np.where(df['license'].isna(), False, True)

In [37]:
# plt.pie(df['license'].value_counts(),labels=['Unlicensed', 'Licensed']);

## 5. Create dataframe for match sql table:
- Bnb_house
- Host
- Rent
- Rent_fare


In [38]:
Bnb_house = df[['id', 'name', 'availability_rate_365', 'rate', 'number_of_reviews', 'latitude', 'longitude', 'host_id']]
Bnb_house = Bnb_house.rename({'host_id': 'host'}, axis='columns')
Bnb_house.head(2)

Unnamed: 0,id,name,availability_rate_365,rate,number_of_reviews,latitude,longitude,host
0,2595,Rental unit in New York,1.0,4.68,49,40.75356,-73.98559,2845
1,5121,Rental unit in Brooklyn,0.33,4.52,50,40.68535,-73.95512,7356


Group host_id, host_name.

In [39]:
Host = df[['host_id', 'host_name']]
Host = Host.groupby('host_id').first().reset_index()
Host

Unnamed: 0,host_id,host_name
0,1678,Adam
1,2234,Mary
2,2438,Tasos
3,2571,Teedo
4,2787,John
...,...,...
23883,543670145,Anna Rachel
23884,543839038,Blair
23885,543856797,Antoinette
23886,544091750,Blanca


Rename host columns

In [40]:
Host = Host.rename(columns={
    'host_id': 'id',
    'host_name': 'name'
})

Drop unnamed hosts

In [41]:
Host = Host.drop(index=Host[Host['name'].isna()].index)

How to join the tables

In [42]:
Rent = df[['room_type', 'n_beds', 'n_baths', 'is_bath_shared', 'id']].copy()
Rent = Rent.rename(columns={'id': 'bnb_house'})
Rent['id'] = np.arange(Rent.shape[0])
Rent.head(2)

Unnamed: 0,room_type,n_beds,n_baths,is_bath_shared,bnb_house,id
0,Studio,1,1,False,2595,0
1,1 bedroom,1,0,False,5121,1


In [43]:
temp = pd.merge(Rent, df[['price', 'minimum_nights', 'id']], left_on='bnb_house', right_on='id').copy()
Rent_fare = temp[['price', 'minimum_nights', 'id_x']].copy()
Rent_fare = Rent_fare.rename(columns={'id_x': 'rent'})
Rent_fare['id'] = np.arange(Rent_fare.shape[0])
Rent_fare

Unnamed: 0,price,minimum_nights,rent,id
0,240,30,0,0
1,66,30,1,1
2,175,45,2,2
3,81,30,3,3
4,90,30,4,4
...,...,...,...,...
39155,64,1,39155,39155
39156,119,30,39156,39156
39157,266,31,39157,39157
39158,322,31,39158,39158


Reindex columns

In [44]:
Rent = Rent.reindex(columns=['id', 'room_type', 'n_beds', 'n_baths', 'is_bath_shared', 'bnb_house'])
Rent_fare = Rent_fare.reindex(columns=['id', 'price', 'minimum_nights', 'rent'])

In [45]:
Bnb_house.to_csv('../out/bnb_house.csv', index=False)
Host.to_csv('../out/host.csv', index=False)
Rent.to_csv('../out/rent.csv', index=False)
Rent_fare.to_csv('../out/rent_fare.csv', index=False)

In [47]:
Bnb_house

Unnamed: 0,id,name,availability_rate_365,rate,number_of_reviews,latitude,longitude,host
0,2595,Rental unit in New York,1.00,4.68,49,40.753560,-73.985590,2845
1,5121,Rental unit in Brooklyn,0.33,4.52,50,40.685350,-73.955120,7356
2,9357,Rental unit in New York,0.93,4.52,58,40.767240,-73.986640,30193
3,6848,Rental unit in Brooklyn,0.30,4.58,191,40.709350,-73.953420,15991
4,10452,Rental unit in Brooklyn,0.79,4.65,80,40.682940,-73.956820,35935
...,...,...,...,...,...,...,...,...
39155,1013632380355599044,Rental unit in New York,0.17,0.00,0,40.747829,-73.986907,401202937
39156,1014579690057145481,Rental unit in Brooklyn,0.71,0.00,0,40.689207,-73.936728,441448134
39157,1015027293549591335,Rental unit in New York,0.91,0.00,0,40.744972,-73.977042,107434423
39158,1015027813525407338,Rental unit in New York,1.00,0.00,0,40.740508,-73.978347,107434423


In [46]:
Rent

Unnamed: 0,id,room_type,n_beds,n_baths,is_bath_shared,bnb_house
0,0,Studio,1,1,False,2595
1,1,1 bedroom,1,0,False,5121
2,2,Studio,1,1,False,9357
3,3,2 bedrooms,2,1,False,6848
4,4,1 bedroom,1,,True,10452
...,...,...,...,...,...,...
39155,39155,1 bedroom,1,2,False,1013632380355599044
39156,39156,1 bedroom,1,1,False,1014579690057145481
39157,39157,1 bedroom,1,1,False,1015027293549591335
39158,39158,2 bedrooms,2,1,False,1015027813525407338
