In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [3]:
df = pd.read_csv('../../datasets/csv_datasets/listings.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39160 entries, 0 to 39159
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              39160 non-null  int64  
 1   name                            39160 non-null  object 
 2   host_id                         39160 non-null  int64  
 3   host_name                       39155 non-null  object 
 4   neighbourhood_group             39160 non-null  object 
 5   neighbourhood                   39160 non-null  object 
 6   latitude                        39160 non-null  float64
 7   longitude                       39160 non-null  float64
 8   room_type                       39160 non-null  object 
 9   price                           39160 non-null  int64  
 10  minimum_nights                  39160 non-null  int64  
 11  number_of_reviews               39160 non-null  int64  
 12  last_review                     

In [5]:
df.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,240,30,49,2022-06-21,0.29,3,365,0,
1,5121,Rental unit in Brooklyn · ★4.52 · 1 bedroom · ...,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68535,-73.95512,Private room,66,30,50,2019-12-02,0.28,2,120,0,
2,9357,Rental unit in New York · ★4.52 · Studio · 1 b...,30193,Tommi,Manhattan,Hell's Kitchen,40.76724,-73.98664,Entire home/apt,175,45,58,2017-08-13,0.34,1,340,0,


### 1 Rename and modify:
- 'room_type' column as 'bnb_type';
- 'availability_365' column as 'availability_rate_365', calculate avoilability rate;

In [6]:
df = df.rename({'room_type': 'bnb_type'}, axis="columns")

In [8]:
df.head(1)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,bnb_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,240,30,49,2022-06-21,0.29,3,365,0,


## 3. Retrieve the columns:
- 'name'
- 'rate';
- 'room_type'
- 'n_beds'
- 'n_baths'
- 'is_shared_bath'

In [9]:
name = df["name"].str.extract(r'([\w\s]+) ·')
name.head()

Unnamed: 0,0
0,Rental unit in New York
1,Rental unit in Brooklyn
2,Rental unit in New York
3,Rental unit in Brooklyn
4,Rental unit in Brooklyn


In [10]:
rate = df["name"].str.extract(r'★([\d\w\.]{0,4}) ·')
rate.head()

Unnamed: 0,0
0,4.68
1,4.52
2,4.52
3,4.58
4,4.65


Some rows in 'name' column have a non-numeric rate or don't have a rate.

In [11]:
df.loc[rate[0] == "New",'name'] = df.loc[rate[0] == "New",'name'].str.replace('★[\w]{0,4}', "★0.00", regex=True)

In [12]:
df.loc[rate[0] == "New", 'name'].head(10)

3316     Home in Queens · ★0.00 · 1 bedroom · 1 bed · 1...
3412     Rental unit in New York · ★0.00 · 1 bedroom · ...
3763     Home in Brooklyn · ★0.00 · 5 bedrooms · 1 bed ...
8217     Rental unit in Queens · ★0.00 · 1 bedroom · 1 ...
8906     Condo in New York · ★0.00 · 1 bedroom · 2 beds...
11685    Rental unit in Brooklyn · ★0.00 · Studio · 1 b...
11821    Rental unit in New York · ★0.00 · 1 bedroom · ...
13294    Guesthouse in Brooklyn · ★0.00 · 1 bedroom · 2...
14479    Rental unit in Brooklyn · ★0.00 · Studio · 1 bath
14556    Rental unit in New York · ★0.00 · 2 bedrooms ·...
Name: name, dtype: object

Add '★0.00' in 'name' column of the dataframe where rate value is missing

In [13]:
name_list = [s.split("·", 2) for s in df['name']]
temp_list = name_list.copy()

for i,l in enumerate(name_list):
    s = np.where("★" in l[1], "·".join(l), "·".join( [l[0], " ★0.00 "] + l[1:] ) )
    temp_list[i] = str(s)

In [14]:
df['name'] = temp_list

Rate recalculation from clean dataframe

In [15]:
rate = df["name"].str.extract(r'★([\d\w\.]{0,4}) ·')
rate[0].value_counts()

0
0.00    17232
5.0      3971
4.67      804
4.75      681
4.83      611
        ...  
3.77        1
4.02        1
3.30        1
3.36        1
3.70        1
Name: count, Length: 163, dtype: int64

In [16]:
room_type = df["name"].str.extract(r'★[\d\.]{1,4} · ([\d\s\w]+)')
room_type[0] = room_type[0].str.rstrip()
room_type.head()

Unnamed: 0,0
0,Studio
1,1 bedroom
2,Studio
3,2 bedrooms
4,1 bedroom


In [17]:
room_type[0].value_counts()

0
1 bedroom      26013
2 bedrooms      6300
Studio          3281
3 bedrooms      2365
4 bedrooms       800
5 bedrooms       247
6 bedrooms        62
1 bed             34
7 bedrooms        19
9 bedrooms        11
2 beds            10
8 bedrooms         6
3 beds             3
10 bedrooms        2
11 bedrooms        2
14 bedrooms        1
21 bedrooms        1
26 bedrooms        1
2 baths            1
15 bedrooms        1
Name: count, dtype: int64

For all the bnb without a room type replace the type with '1 bedroom'

In [18]:
room_type[0] = room_type[0].replace({"2 baths": "1 bedroom", "1 bed":"1 bedroom", "2 beds":"1 bedroom", "3 beds":"1 bedroom"})

In [19]:
room_type[0].value_counts()

0
1 bedroom      26061
2 bedrooms      6300
Studio          3281
3 bedrooms      2365
4 bedrooms       800
5 bedrooms       247
6 bedrooms        62
7 bedrooms        19
9 bedrooms        11
8 bedrooms         6
10 bedrooms        2
11 bedrooms        2
14 bedrooms        1
21 bedrooms        1
26 bedrooms        1
15 bedrooms        1
Name: count, dtype: int64

In [20]:
n_beds = df['name'].str.extract(r'([\d\.]+) bed[s]?\s*')

For the rows without beds the default value is 0

In [21]:
n_beds.loc[n_beds[0].isna(), 0] = n_beds.loc[n_beds[0].isna(), 0].replace({np.nan: 0})

In [22]:
n_beds[0].value_counts()

0
1     28491
2      6908
3      2485
4       818
5       249
0       102
6        63
7        19
9        11
8         6
10        2
11        2
14        1
21        1
26        1
15        1
Name: count, dtype: int64

In [23]:
n_baths = df['name'].str.extract(r'([\d\.]*)[ shared| private]? [Half-]?bath[s]?[\s]*')

In [24]:
# df.loc[n_baths[0].isna(), 'name']
n_baths[0].value_counts()

0
1       20317
        13631
2        2907
1.5      1104
3         387
2.5       361
4         138
3.5        78
0          55
4.5        37
5          15
6           6
6.5         4
5.5         4
15.5        1
7           1
8.5         1
7.5         1
8           1
Name: count, dtype: int64

There are rows with only "bath", "shared Half-bath" or other stuff like this. In this case I consider 0.5 when there is "Half|half", 0 otherwise.

In [25]:
half_bath = df.loc[n_baths[0].isna(), 'name'].str.contains(r'[Shared |Private ]?[Half-|half-]?bath')
n_baths.loc[n_baths[0].isna(), 0] = np.where(half_bath, '0.5', '0')

In [26]:
n_baths[0].value_counts()

0
1       20317
        13631
2        2907
1.5      1104
3         387
2.5       361
4         138
0         100
3.5        78
0.5        66
4.5        37
5          15
6           6
6.5         4
5.5         4
15.5        1
7           1
8.5         1
7.5         1
8           1
Name: count, dtype: int64

In [27]:
is_bath_shared = np.where( df['name'].str.contains('shared|Shared', regex=True), True , False)
len(is_bath_shared[is_bath_shared == True])

10820

## 6. Update dataframe with new columns

In [28]:
df['name'] = name

In [29]:
df['rate'] = rate

In [30]:
df['room_type'] = room_type

In [31]:
df['n_beds'] = n_beds

In [32]:
df['n_baths'] = n_baths

In [33]:
df['is_bath_shared'] = is_bath_shared

In [34]:
# df['license'] = np.where(df['license'].isna(), False, True)

In [35]:
# plt.pie(df['license'].value_counts(),labels=['Unlicensed', 'Licensed']);

In [36]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,bnb_type,price,...,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,rate,room_type,n_beds,n_baths,is_bath_shared
0,2595,Rental unit in New York,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,240,...,0.29,3,365,0,,4.68,Studio,1,1.0,False
1,5121,Rental unit in Brooklyn,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68535,-73.95512,Private room,66,...,0.28,2,120,0,,4.52,1 bedroom,1,0.0,False
2,9357,Rental unit in New York,30193,Tommi,Manhattan,Hell's Kitchen,40.76724,-73.98664,Entire home/apt,175,...,0.34,1,340,0,,4.52,Studio,1,1.0,False
3,6848,Rental unit in Brooklyn,15991,Allen & Irina,Brooklyn,Williamsburg,40.70935,-73.95342,Entire home/apt,81,...,1.09,1,110,5,,4.58,2 bedrooms,2,1.0,False
4,10452,Rental unit in Brooklyn,35935,Angela,Brooklyn,Bedford-Stuyvesant,40.68294,-73.95682,Private room,90,...,0.49,4,290,2,,4.65,1 bedroom,1,,True


In [37]:
df.to_csv('../out/listings.csv', index=False)