### Necessary libraries

In [2]:
import pandas as pd
import geopy.distance
import numpy as np
import matplotlib.pyplot as plt

### Listings - Dataset

In [4]:
listings = pd.read_csv("data/listings.csv")

listings_cleaned = listings[["id", "name", "neighbourhood_cleansed", "latitude", "longitude", "property_type",
                             "room_type", "accommodates", "bedrooms", "price","number_of_reviews",
                             "review_scores_rating", "review_scores_location", "reviews_per_month"]]
listings_cleaned.head()



Unnamed: 0,id,name,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,price,number_of_reviews,review_scores_rating,review_scores_location,reviews_per_month
0,15883,b&b near Old Danube river,Donaustadt,48.24262,16.42767,Room in bed and breakfast,Hotel room,3,1.0,$110.00,14,4.71,4.71,0.15
1,38768,central cityapartement- wifi- nice neighbourhood,Leopoldstadt,48.21924,16.37831,Entire rental unit,Entire home/apt,5,1.0,$69.00,350,4.75,4.75,2.5
2,40625,"Near Palace Schönbrunn, Apt. 1",Rudolfsheim-Fnfhaus,48.18434,16.32701,Entire rental unit,Entire home/apt,6,2.0,$145.00,181,4.83,4.59,1.23
3,392757,VCA3 Palais Brambilla - studio with city views,Innere Stadt,48.21496,16.37161,Entire rental unit,Entire home/apt,2,1.0,$100.00,100,4.64,4.89,0.79
4,51287,little studio- next to citycenter- wifi- nice ...,Leopoldstadt,48.21778,16.37847,Entire rental unit,Entire home/apt,3,,$68.00,347,4.65,4.86,2.45


In [6]:
listings_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11797 entries, 0 to 11796
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      11797 non-null  int64  
 1   name                    11785 non-null  object 
 2   neighbourhood_cleansed  11797 non-null  object 
 3   latitude                11797 non-null  float64
 4   longitude               11797 non-null  float64
 5   property_type           11797 non-null  object 
 6   room_type               11797 non-null  object 
 7   accommodates            11797 non-null  int64  
 8   bedrooms                10648 non-null  float64
 9   price                   11797 non-null  object 
 10  number_of_reviews       11797 non-null  int64  
 11  review_scores_rating    9869 non-null   float64
 12  review_scores_location  9773 non-null   float64
 13  reviews_per_month       9869 non-null   float64
dtypes: float64(6), int64(3), object(5)
mem

Price should be numeric, therefore we will convert it to numeric form. First, lets have a look into it:


In [8]:
listings_cleaned[['price']].head(5)

Unnamed: 0,price
0,$110.00
1,$69.00
2,$145.00
3,$100.00
4,$68.00


Before converting to numeric, we will remove the currency and then do the conversion.

In [10]:
listings_cleaned['price'] = listings_cleaned['price'].str.extract('(\d+)', expand=False)
listings_cleaned[['price']] = listings_cleaned[['price']].apply(pd.to_numeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_cleaned['price'] = listings_cleaned['price'].str.extract('(\d+)', expand=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_cleaned[['price']] = listings_cleaned[['price']].apply(pd.to_numeric)


#### Handling missing values

As seen below there are some missing values in our dataset:

In [12]:
listings_cleaned.isna().sum()

id                           0
name                        12
neighbourhood_cleansed       0
latitude                     0
longitude                    0
property_type                0
room_type                    0
accommodates                 0
bedrooms                  1149
price                        0
number_of_reviews            0
review_scores_rating      1928
review_scores_location    2024
reviews_per_month         1928
dtype: int64

In [14]:
# looking into the missing values
listings_cleaned[listings_cleaned.isna().any(axis = 1)]

Unnamed: 0,id,name,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,price,number_of_reviews,review_scores_rating,review_scores_location,reviews_per_month
4,51287,little studio- next to citycenter- wifi- nice ...,Leopoldstadt,48.217780,16.378470,Entire rental unit,Entire home/apt,3,,68,347,4.65,4.86,2.45
18,109679,"Near Palace Schönbrunn, Apt. 4",Rudolfsheim-Fnfhaus,48.184670,16.327950,Entire rental unit,Entire home/apt,5,,85,125,4.86,4.61,0.98
23,114505,"Near Palace Schönbrunn, Apt. 5",Rudolfsheim-Fnfhaus,48.184450,16.327220,Entire rental unit,Entire home/apt,5,,85,120,4.87,4.71,0.87
24,431055,Enjoy the quietness of the modern & central st...,Mariahilf,48.191580,16.349990,Entire rental unit,Entire home/apt,2,,97,44,4.98,4.86,0.35
29,131628,"Holiday Apartment ""Modern Vienna""",Ottakring,48.215430,16.309390,Entire rental unit,Entire home/apt,4,1.0,61,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11755,46787401,Tinyhouse Rosenrot - Waldnähe in Grünruhelage ...,Penzing,48.244175,16.189232,Tiny home,Entire home/apt,4,1.0,110,0,,,
11766,39187511,Apartment Wien Schwechat Nähe Flughafen u. Wien,Simmering,48.139300,16.483380,Private room in rental unit,Private room,8,2.0,120,0,,,
11779,14194630,3 Monate Single-Wohnung zu vermieten,Liesing,48.139580,16.198310,Entire condo,Entire home/apt,2,1.0,430,0,,,
11782,49314011,Villa Marie for 3 persons,Penzing,48.210460,16.188020,Entire rental unit,Entire home/apt,3,1.0,15,0,,,


In [19]:
# trying sth out

listings_cleaned[(listings_cleaned['number_of_reviews']==0) & (listings_cleaned['review_scores_rating'].isna())]

Unnamed: 0,id,name,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,price,number_of_reviews,review_scores_rating,review_scores_location,reviews_per_month
29,131628,"Holiday Apartment ""Modern Vienna""",Ottakring,48.215430,16.309390,Entire rental unit,Entire home/apt,4,1.0,61,0,,,
41,482090,U3 Meiselmarkt - Schönbrunnnähe,Rudolfsheim-Fnfhaus,48.196900,16.317930,Private room in rental unit,Private room,2,1.0,40,0,,,
49,197822,Comfortable Apartement for 2 (#5),Hernals,48.223560,16.317810,Entire condo,Entire home/apt,2,1.0,78,0,,,
50,197827,Comfortable Apartment for 2 (#6),Hernals,48.223560,16.317810,Entire condo,Entire home/apt,2,1.0,86,0,,,
51,197831,Roomy Apartment for 4 persons (#7),Hernals,48.223560,16.317810,Entire condo,Entire home/apt,4,2.0,139,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11755,46787401,Tinyhouse Rosenrot - Waldnähe in Grünruhelage ...,Penzing,48.244175,16.189232,Tiny home,Entire home/apt,4,1.0,110,0,,,
11766,39187511,Apartment Wien Schwechat Nähe Flughafen u. Wien,Simmering,48.139300,16.483380,Private room in rental unit,Private room,8,2.0,120,0,,,
11779,14194630,3 Monate Single-Wohnung zu vermieten,Liesing,48.139580,16.198310,Entire condo,Entire home/apt,2,1.0,430,0,,,
11782,49314011,Villa Marie for 3 persons,Penzing,48.210460,16.188020,Entire rental unit,Entire home/apt,3,1.0,15,0,,,


In [38]:
withreviews = listings_cleaned[listings_cleaned['number_of_reviews']!=0]
withreviews[withreviews['review_scores_location'].isna()].head(5)

Unnamed: 0,id,name,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,price,number_of_reviews,review_scores_rating,review_scores_location,reviews_per_month
488,2174605,98m² Loft directly at Naschmarkt,Mariahilf,48.197962,16.361513,Entire rental unit,Entire home/apt,2,1.0,350,1,0.0,,0.01
754,4713352,APARTMENT NEAR HUNDERTWASSERHAUS,Landstra§e,48.20907,16.39401,Entire condo,Entire home/apt,4,2.0,109,1,0.0,,0.02
871,5102698,Wohnung im Herzen der Stadt!,Leopoldstadt,48.2222,16.38228,Entire rental unit,Entire home/apt,4,2.0,99,1,0.0,,0.01
954,5369769,Nice Flat next to Westbahnhof,Rudolfsheim-Fnfhaus,48.19973,16.33691,Entire rental unit,Entire home/apt,4,1.0,50,1,0.0,,0.01
978,5777398,Lovely apartment in the heart of Vi,Wieden,48.18969,16.36723,Entire rental unit,Entire home/apt,2,1.0,49,1,0.0,,0.01


### First glimpse into NAN values

1. All the missing values of column `review_scores_rating`, `reviews_per_month` are due to the `number_of_reviews` being 0. So I believe these values are MAR (Missing at Random).
2. Moreover, also 1928 out of 2024 missing value rows of `review_scores_location` are connected to the fact that the `number_of_reviews` is 0. For remaining 96,they are missing even though there are few reviews for the listing, maybe due to the fact that the guest did not rate location explictly.
3. For column `bedrooms`, I think there is no connection between this variable missing and other columns- MCAR.

### POI - Dataset

In [39]:
poi = pd.read_csv("data/SEHENSWUERDIGOGD.csv")

# separating shape column into two columns longitude and latitude

poi['SHAPE'] = poi['SHAPE'].apply(lambda x: str(x).split('(')[1]
                                                          .split(')')[0]
                                                         .split(' '))

poi['longitude'] = poi['SHAPE'].apply(lambda x: x[0])
poi['latitude'] = poi['SHAPE'].apply(lambda x: x[1])

#keeping only the variables we are interested in

poi_cleaned = poi[['OBJECTID', 'NAME', 'latitude', 'longitude']].copy()
poi_cleaned.rename(columns={'OBJECTID':'objectid', 'NAME':'name'}, inplace=True)
poi_cleaned.head()

Unnamed: 0,objectid,name,latitude,longitude
0,436728,Ankeruhr,48.21076046788063,16.373675865880358
1,436729,Friedhof St. Marx,48.18297374058932,16.40167022910915
2,436730,Kuffner-Sternwarte,48.21287160702207,16.291260246419725
3,436731,Künstlerhaus,48.20096490237274,16.371294199571505
4,436732,Parlament,48.20811457164843,16.35849358713904


In [40]:
poi_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   objectid   63 non-null     int64 
 1   name       63 non-null     object
 2   latitude   63 non-null     object
 3   longitude  63 non-null     object
dtypes: int64(1), object(3)
memory usage: 2.1+ KB


In [45]:
# converting latitude and longitude to numeric
poi_cleaned[['latitude']] = poi_cleaned[['latitude']] .apply(pd.to_numeric)
poi_cleaned[['longitude']] = poi_cleaned[['longitude']] .apply(pd.to_numeric)

In [46]:
poi_cleaned.isna().sum()

objectid     0
name         0
latitude     0
longitude    0
dtype: int64

### Joining Datasets

In [48]:
joined_df = poi_cleaned.join(listings_cleaned, how="cross", lsuffix="_poi", rsuffix="_listing")
joined_df.head(5)

Unnamed: 0,objectid,name_poi,latitude_poi,longitude_poi,id,name_listing,neighbourhood_cleansed,latitude_listing,longitude_listing,property_type,room_type,accommodates,bedrooms,price,number_of_reviews,review_scores_rating,review_scores_location,reviews_per_month
0,436728,Ankeruhr,48.21076,16.373676,15883,b&b near Old Danube river,Donaustadt,48.24262,16.42767,Room in bed and breakfast,Hotel room,3,1.0,110,14,4.71,4.71,0.15
1,436728,Ankeruhr,48.21076,16.373676,38768,central cityapartement- wifi- nice neighbourhood,Leopoldstadt,48.21924,16.37831,Entire rental unit,Entire home/apt,5,1.0,69,350,4.75,4.75,2.5
2,436728,Ankeruhr,48.21076,16.373676,40625,"Near Palace Schönbrunn, Apt. 1",Rudolfsheim-Fnfhaus,48.18434,16.32701,Entire rental unit,Entire home/apt,6,2.0,145,181,4.83,4.59,1.23
3,436728,Ankeruhr,48.21076,16.373676,392757,VCA3 Palais Brambilla - studio with city views,Innere Stadt,48.21496,16.37161,Entire rental unit,Entire home/apt,2,1.0,100,100,4.64,4.89,0.79
4,436728,Ankeruhr,48.21076,16.373676,51287,little studio- next to citycenter- wifi- nice ...,Leopoldstadt,48.21778,16.37847,Entire rental unit,Entire home/apt,3,,68,347,4.65,4.86,2.45


#### Computing distance  between each listing and each POI as euclidean distance

In [49]:
joined_df['distance'] = joined_df.apply(lambda x: geopy.distance.geodesic((x['longitude_poi'], x['latitude_poi']),
                                                (x['longitude_listing'], x['latitude_listing'])).km, axis=1)
joined_df.head()

Unnamed: 0,objectid,name_poi,latitude_poi,longitude_poi,id,name_listing,neighbourhood_cleansed,latitude_listing,longitude_listing,property_type,room_type,accommodates,bedrooms,price,number_of_reviews,review_scores_rating,review_scores_location,reviews_per_month,distance
0,436728,Ankeruhr,48.21076,16.373676,15883,b&b near Old Danube river,Donaustadt,48.24262,16.42767,Room in bed and breakfast,Hotel room,3,1.0,110,14,4.71,4.71,0.15,6.876339
1,436728,Ankeruhr,48.21076,16.373676,38768,central cityapartement- wifi- nice neighbourhood,Leopoldstadt,48.21924,16.37831,Entire rental unit,Entire home/apt,5,1.0,69,350,4.75,4.75,2.5,1.040969
2,436728,Ankeruhr,48.21076,16.373676,40625,"Near Palace Schönbrunn, Apt. 1",Rudolfsheim-Fnfhaus,48.18434,16.32701,Entire rental unit,Entire home/apt,6,2.0,145,181,4.83,4.59,1.23,5.88535
3,436728,Ankeruhr,48.21076,16.373676,392757,VCA3 Palais Brambilla - studio with city views,Innere Stadt,48.21496,16.37161,Entire rental unit,Entire home/apt,2,1.0,100,100,4.64,4.89,0.79,0.50354
4,436728,Ankeruhr,48.21076,16.373676,51287,little studio- next to citycenter- wifi- nice ...,Leopoldstadt,48.21778,16.37847,Entire rental unit,Entire home/apt,3,,68,347,4.65,4.86,2.45,0.918601


#### Rents

Source: https://www.immopreise.at/Wien/Wohnung/Miete
Handscrapped on 16/12/2022