## Importing modules

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
    
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from textblob import TextBlob

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# show all columns
pd.set_option('display.max_columns', None)

In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

## Importing data


In [7]:
#listings = pd.read_csv('./Airbnb/listings.csv')
#listings_clean = pd.read_csv('./Airbnb/listings_clean.csv')
#listings_new_2 = pd.read_csv('./Airbnb/listings_new_2.csv')
listings_new_2_distr = pd.read_csv('./Airbnb/listings_new_2_distr.csv')
listings_new_2_distr = listings_new_2_distr.drop(columns=['Unnamed: 0'])

In [None]:
#listings_details = pd.read_csv('./Airbnb/listings_details.csv')
listings_details_clean = pd.read_csv('./Airbnb/listings_details_clean.csv')

In [3]:
#reviews_details = pd.read_csv('./Airbnb/reviews_details.csv')
#reviews_details = pd.read_csv('./Airbnb/reviews_details_clean.csv')
reviews_details = pd.read_csv('./Airbnb/reviews_details_pol.csv')

In [5]:
#calendar = pd.read_csv('./Airbnb/calendar.csv')
calendar_clean = pd.read_csv('./Airbnb/calendar_clean.csv')

In [14]:
districts = pd.read_csv('./Airbnb/neighbourhoods.csv')

In [4]:
reviews_details.shape

(431830, 8)

# Data cleaning

#### Calendar data

In [39]:
# Remove rows where apartments are not available

calendar_clean = calendar.dropna()
calendar_clean.dtypes

listing_id     int64
date          object
available     object
price         object
dtype: object

In [40]:
# Remove the dollar sign from 'price' column and conver to float
calendar_clean['price'] = calendar_clean['price'].replace({'\$': '', ',': ''}, regex=True)
calendar_clean['price'] = calendar_clean['price'].astype(float)

# Change the 'date' column to datetime
calendar_clean['date'] = pd.to_datetime(calendar_clean['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [49]:
#calendar_clean.to_csv('calendar_clean.csv')

#### Listings

In [29]:
listings.isnull().sum()

id                                    0
name                                 38
host_id                               0
host_name                             4
neighbourhood_group               20030
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                        2406
reviews_per_month                  2406
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [27]:
# drop ['neighbourhood_group', 'last_review', 'host_name', 'calculated_host_listings_count']
listings_clean = listings.drop(columns=['neighbourhood_group', 'last_review', 'host_name', 
                                        'calculated_host_listings_count', 'reviews_per_month', 'room_type',
                                       'minimum_nights', 'number_of_reviews'])

In [28]:
#listings_clean.to_csv('listings_clean.csv')

#### Listings (details)

In [42]:
listings_details_clean = listings_details.drop(columns=['space','summary','listing_url', 'scrape_id', 'last_scraped', 'name', 
                                               'experiences_offered', 'neighborhood_overview', 'notes',
                                               'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 
                                               'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 
                                               'host_acceptance_rate', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
                                               'street', 'neighbourhood', 'neighbourhood_group_cleansed', 'city', 'state', 'market', 
                                               'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'square_feet', 
                                               'price', 'calendar_last_scraped', 'has_availability', 'jurisdiction_names', 'requires_license', 
                                               'license', 'is_business_travel_ready', 'zipcode', 'weekly_price', 'monthly_price',
                                                'host_about', 'host_location'])
listings_details_clean

Unnamed: 0,id,description,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2818,Quiet Garden View Room & Super Fast WiFi I'm r...,2008-09-24,within an hour,100%,t,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,Oostelijk Havengebied - Indische Buurt,f,Apartment,Private room,2,1.5,1.0,2.0,Real Bed,"{Internet,Wifi,""Paid parking off premises"",""Bu...",$100.00,$50.00,1,$20.00,3,15,today,17,44,44,44,248,2009-03-30,2018-11-28,97.0,10.0,10.0,10.0,10.0,9.0,10.0,t,strict_14_with_grace_period,f,f,1,2.10
1,3209,You will love our spacious (90 m2) bright apar...,2008-10-24,within an hour,100%,f,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,Westerpark,t,Apartment,Entire home/apt,5,1.0,2.0,2.0,Real Bed,"{Internet,Wifi,Kitchen,""Paid parking off premi...",$300.00,$40.00,2,$15.00,4,20,7 weeks ago,0,0,0,47,42,2015-07-31,2018-08-29,96.0,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,f,f,1,1.03
2,20168,"Cozy studio on your own private floor, 100% in...",2009-12-02,within a few hours,100%,f,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,Centrum-Oost,t,Townhouse,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,""Paid parking off premises"",...",,,2,$0.00,1,1000,today,0,7,24,198,233,2010-03-02,2018-11-30,87.0,9.0,10.0,9.0,9.0,10.0,9.0,f,strict_14_with_grace_period,f,f,2,2.18
3,25428,"This nicely furnished, newly renovated apt is...",2009-11-20,within a few hours,100%,f,2.0,2.0,"['email', 'phone', 'reviews']",t,f,Centrum-West,f,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",$300.00,$40.00,2,$10.00,14,60,2 days ago,2,32,44,141,1,2018-01-21,2018-01-21,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,f,f,2,0.09
4,27886,Stylish and romantic houseboat on fantastic hi...,2010-03-23,within an hour,100%,t,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,Centrum-West,t,Houseboat,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",$0.00,$0.00,1,$0.00,2,730,today,16,37,54,199,171,2012-01-09,2018-11-25,99.0,10.0,10.0,10.0,10.0,10.0,10.0,t,strict_14_with_grace_period,f,f,1,2.03
5,28658,2 beds guest room in Amsterdam West near Erasm...,2010-05-12,within an hour,100%,f,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",t,t,Bos en Lommer,t,Apartment,Private room,2,1.0,1.0,2.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Paid parking off...",$150.00,$15.00,2,$0.00,3,15,2 months ago,11,29,52,295,434,2010-05-16,2018-11-19,93.0,9.0,10.0,10.0,9.0,9.0,9.0,f,moderate,t,t,2,4.16
6,28871,In a monumental house right in the center of A...,2010-05-13,within an hour,100%,t,3.0,3.0,"['email', 'phone', 'reviews', 'jumio']",t,t,Centrum-West,t,Apartment,Private room,2,,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Pets live on thi...",,,1,$0.00,2,1825,yesterday,3,10,17,137,215,2010-08-22,2018-12-03,97.0,10.0,10.0,10.0,10.0,10.0,10.0,f,moderate,f,f,3,2.13
7,29051,because of the city imposing a 4 paying guest ...,2010-05-13,within an hour,100%,t,3.0,3.0,"['email', 'phone', 'reviews', 'jumio']",t,t,Centrum-West,t,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Smoking allowed""...",,,1,$0.00,2,730,today,1,10,18,188,383,2011-03-16,2018-12-05,95.0,10.0,10.0,10.0,10.0,10.0,10.0,f,moderate,f,f,3,4.07
8,31080,My apartment is light and cosy. The three bed...,2010-05-27,a few days or more,0%,f,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",t,t,Zuid,f,Apartment,Entire home/apt,4,1.0,3.0,3.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",,$60.00,4,$30.00,3,365,16 months ago,1,31,61,336,32,2011-08-06,2017-10-16,95.0,9.0,10.0,10.0,10.0,9.0,9.0,f,moderate,f,f,1,0.36
9,41125,"A nice, sunny and spacious apartment in the ce...",2010-07-23,within a few hours,100%,f,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",t,t,Centrum-West,f,Apartment,Entire home/apt,2,0.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Paid par...",$150.00,$40.00,2,$75.00,3,21,today,8,8,11,11,76,2010-11-25,2018-10-07,95.0,10.0,9.0,10.0,10.0,10.0,9.0,f,moderate,f,f,1,0.78


In [45]:
listings_details_clean['cancellation_policy'].isnull().sum()

0

In [47]:
listings_details_clean.host_response_rate.unique()

array(['100%', '0%', '91%', nan, '80%', '67%', '70%', '78%', '90%', '92%',
       '50%', '95%', '71%', '40%', '75%', '33%', '93%', '96%', '98%',
       '83%', '82%', '43%', '94%', '86%', '88%', '10%', '60%', '32%',
       '25%', '46%', '76%', '57%', '45%', '73%', '63%', '97%', '89%',
       '20%', '56%', '44%', '30%', '29%', '99%', '69%', '87%', '65%',
       '17%', '77%', '27%', '38%', '62%', '58%', '74%', '47%', '55%',
       '85%', '53%', '79%', '66%'], dtype=object)

In [48]:
listings_details_clean.isnull().sum()

id                                     0
description                          124
host_since                             4
host_response_time                  9483
host_response_rate                  9483
host_is_superhost                      4
host_listings_count                    4
host_total_listings_count              4
host_verifications                     0
host_has_profile_pic                   4
host_identity_verified                 4
neighbourhood_cleansed                 0
is_location_exact                      0
property_type                          0
room_type                              0
accommodates                           0
bathrooms                             10
bedrooms                               8
beds                                   7
bed_type                               0
amenities                              0
security_deposit                    6166
cleaning_fee                        3629
guests_included                        0
extra_people    

In [43]:
#listings_details_clean.to_csv('listings_details_clean.csv')

#### Merge 'listings_clean' with 'listings_details_clean'

In [61]:
listings_clean = listings_clean.drop(columns=['Unnamed: 0', 'availability_365'])
listings_details_clean = listings_details_clean.drop(columns=['Unnamed: 0'])

In [62]:
# Merge 'listings_clean' with 'listings_details_clean' (on='id')

listings_new = pd.merge(listings_clean, listings_details_clean, on='id')

In [63]:
listings_new = pd.merge(listings_clean, listings_details_clean, on='id')

In [65]:
#listings_new.to_csv('listings_new.csv')

#### Clean 'listings_new'

In [253]:
listings_new.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,price,description,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2818,Quiet Garden View Room & Super Fast WiFi,3159,Oostelijk Havengebied - Indische Buurt,52.365755,4.941419,59,Quiet Garden View Room & Super Fast WiFi I'm r...,2008-09-24,within an hour,100%,t,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,Oostelijk Havengebied - Indische Buurt,f,Apartment,Private room,2,1.5,1.0,2.0,Real Bed,"{Internet,Wifi,""Paid parking off premises"",""Bu...",$100.00,$50.00,1,$20.00,3,15,today,17,44,44,44,248,2009-03-30,2018-11-28,97.0,10.0,10.0,10.0,10.0,9.0,10.0,t,strict_14_with_grace_period,f,f,1,2.1
1,3209,"Quiet apt near center, great view",3806,Westerpark,52.390225,4.873924,160,You will love our spacious (90 m2) bright apar...,2008-10-24,within an hour,100%,f,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,Westerpark,t,Apartment,Entire home/apt,5,1.0,2.0,2.0,Real Bed,"{Internet,Wifi,Kitchen,""Paid parking off premi...",$300.00,$40.00,2,$15.00,4,20,7 weeks ago,0,0,0,47,42,2015-07-31,2018-08-29,96.0,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,f,f,1,1.03
2,20168,100%Centre-Studio 1 Private Floor/Bathroom,59484,Centrum-Oost,52.365087,4.893541,80,"Cozy studio on your own private floor, 100% in...",2009-12-02,within a few hours,100%,f,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,Centrum-Oost,t,Townhouse,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,""Paid parking off premises"",...",,,2,$0.00,1,1000,today,0,7,24,198,233,2010-03-02,2018-11-30,87.0,9.0,10.0,9.0,9.0,10.0,9.0,f,strict_14_with_grace_period,f,f,2,2.18
3,25428,Lovely apt in City Centre (Jordaan),56142,Centrum-West,52.373114,4.883668,125,"This nicely furnished, newly renovated apt is...",2009-11-20,within a few hours,100%,f,2.0,2.0,"['email', 'phone', 'reviews']",t,f,Centrum-West,f,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",$300.00,$40.00,2,$10.00,14,60,2 days ago,2,32,44,141,1,2018-01-21,2018-01-21,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,f,f,2,0.09
4,27886,"Romantic, stylish B&B houseboat in canal district",97647,Centrum-West,52.386727,4.892078,150,Stylish and romantic houseboat on fantastic hi...,2010-03-23,within an hour,100%,t,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,Centrum-West,t,Houseboat,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",$0.00,$0.00,1,$0.00,2,730,today,16,37,54,199,171,2012-01-09,2018-11-25,99.0,10.0,10.0,10.0,10.0,10.0,10.0,t,strict_14_with_grace_period,f,f,1,2.03


In [254]:
# ['first_review', 'last_review'] - convert to datetime

listings_new[['first_review', 'last_review']] = listings_new[['first_review', 'last_review']].apply(pd.to_datetime) 

In [255]:
# Remove percentige sign from: 'host_response_rate'

listings_new['host_response_rate'] = listings_new['host_response_rate'].replace({'%': ''}, regex=True)
listings_new['host_response_rate'] = listings_new['host_response_rate'].astype(float)

In [256]:
# ['security_deposit', 'cleaning_fee', 'extra_people'] - remove dollar sign and convert to floats

listings_new[['security_deposit', 'cleaning_fee', 'extra_people']] = listings_new[[ 'security_deposit', 'cleaning_fee', 'extra_people']].replace({'\$': '', ',': ''}, regex=True)
listings_new[['security_deposit', 'cleaning_fee', 'extra_people']] = listings_new[['security_deposit', 'cleaning_fee', 'extra_people']].astype(float)

In [258]:
listings_new = listings_new.drop(columns = ['name', 'neighbourhood_cleansed', 'description'])

In [248]:
listings_new.host_response_time.unique()

array(['within an hour', 'within a few hours', 'a few days or more',
       'within a day', nan], dtype=object)

In [260]:
#listings_new.to_csv('listings_new_2.csv')


###### Add district

In [13]:
listings_new_2.head()

Unnamed: 0,id,host_id,neighbourhood,latitude,longitude,price,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2818,3159,Oostelijk Havengebied - Indische Buurt,52.365755,4.941419,59,2008-09-24,within an hour,100.0,t,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,f,Apartment,Private room,2,1.5,1.0,2.0,Real Bed,"{Internet,Wifi,""Paid parking off premises"",""Bu...",100.0,50.0,1,20.0,3,15,today,17,44,44,44,248,2009-03-30,2018-11-28,97.0,10.0,10.0,10.0,10.0,9.0,10.0,t,strict_14_with_grace_period,f,f,1,2.1
1,3209,3806,Westerpark,52.390225,4.873924,160,2008-10-24,within an hour,100.0,f,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,t,Apartment,Entire home/apt,5,1.0,2.0,2.0,Real Bed,"{Internet,Wifi,Kitchen,""Paid parking off premi...",300.0,40.0,2,15.0,4,20,7 weeks ago,0,0,0,47,42,2015-07-31,2018-08-29,96.0,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,f,f,1,1.03
2,20168,59484,Centrum-Oost,52.365087,4.893541,80,2009-12-02,within a few hours,100.0,f,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,t,Townhouse,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,""Paid parking off premises"",...",,,2,0.0,1,1000,today,0,7,24,198,233,2010-03-02,2018-11-30,87.0,9.0,10.0,9.0,9.0,10.0,9.0,f,strict_14_with_grace_period,f,f,2,2.18
3,25428,56142,Centrum-West,52.373114,4.883668,125,2009-11-20,within a few hours,100.0,f,2.0,2.0,"['email', 'phone', 'reviews']",t,f,f,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",300.0,40.0,2,10.0,14,60,2 days ago,2,32,44,141,1,2018-01-21,2018-01-21,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,f,f,2,0.09
4,27886,97647,Centrum-West,52.386727,4.892078,150,2010-03-23,within an hour,100.0,t,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,t,Houseboat,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",0.0,0.0,1,0.0,2,730,today,16,37,54,199,171,2012-01-09,2018-11-25,99.0,10.0,10.0,10.0,10.0,10.0,10.0,t,strict_14_with_grace_period,f,f,1,2.03


In [15]:
districts.head()

Unnamed: 0,district,neighbourhood
0,Amsterdam-Zuidoost,Bijlmer-Centrum
1,Amsterdam-Zuidoost,Bijlmer-Oost
2,Amsterdam-West,Bos en Lommer
3,Amsterdam-Zuid,Buitenveldert - Zuidas
4,Amsterdam-Centrum,Centrum-Oost


In [20]:
col         = 'neighbourhood'
conditions  = [listings_new_2[col] == 'Bijlmer-Centrum', listings_new_2[col] == 'Bijlmer-Oost', 
               listings_new_2[col] == 'Bos en Lommer', listings_new_2[col] == 'Buitenveldert - Zuidas',
              listings_new_2[col] == 'Centrum-Oost', listings_new_2[col] == 'Centrum-West',
              listings_new_2[col] == 'De Aker - Nieuw Sloten', listings_new_2[col] == 'De Baarsjes - Oud-West',
              listings_new_2[col] == 'De Pijp - Rivierenbuurt', listings_new_2[col] == 'Gaasperdam - Driemond',
              listings_new_2[col] == 'Geuzenveld - Slotermeer', listings_new_2[col] == 'IJburg - Zeeburgereiland',
              listings_new_2[col] == 'Noord-Oost',  listings_new_2[col] == 'Noord-West', 
              listings_new_2[col] == 'Oostelijk Havengebied - Indische Buurt',listings_new_2[col] == 'Osdorp',
              listings_new_2[col] == 'Oud-Noord', listings_new_2[col] == 'Oud-Oost', listings_new_2[col] == 'Slotervaart',
              listings_new_2[col] == 'Watergraafsmeer', listings_new_2[col] == 'Westerpark', listings_new_2[col] == 'Zuid']

choices     = ['Amsterdam-Zuidoost', 'Amsterdam-Zuidoost', 'Amsterdam-West', 'Amsterdam-Zuid', 'Amsterdam-Centrum',
              'Amsterdam-Centrum', 'Amsterdam Nieuw-West', 'Amsterdam-West', 'Amsterdam-Zuid', 'Amsterdam-Zuidoost',
              'Amsterdam Nieuw-West', 'Amsterdam-Oost', 'Amsterdam-Noord', 'Amsterdam-Noord', 'Amsterdam-Oost',
              'Amsterdam Nieuw-West', 'Amsterdam-Noord', 'Amsterdam-Oost', 'Amsterdam Nieuw-West', 'Amsterdam-Oost',
              'Amsterdam-West', 'Amsterdam-Zuid']

listings_new_2['district'] = np.select(conditions, choices, default=np.nan)

In [82]:
#listings_new_2.to_csv('listings_new_2_distr.csv')

#### Reviews

In [106]:
reviews_details.head()

Unnamed: 0,id,date,reviewer_id,comments,district
0,2818,2009-03-30,10952,Daniel is really cool. The place was nice and ...,Amsterdam-Oost
1,2818,2009-04-24,12798,Daniel is the most amazing host! His place is ...,Amsterdam-Oost
2,2818,2009-05-03,11869,We had such a great time in Amsterdam. Daniel ...,Amsterdam-Oost
3,2818,2009-05-18,14064,Very professional operation. Room is very clea...,Amsterdam-Oost
4,2818,2009-05-25,17977,Daniel is highly recommended. He provided all...,Amsterdam-Oost


In [107]:
reviews_details.dtypes

id                      int64
date           datetime64[ns]
reviewer_id             int64
comments               object
district               object
dtype: object

In [108]:
reviews_details.shape

(431830, 5)

In [109]:
x = reviews_details.groupby('district')['comments'].count()
x.sort_values(ascending=False)

district
Amsterdam-Centrum       126381
Amsterdam-West          117376
Amsterdam-Zuid           76453
Amsterdam-Oost           57216
Amsterdam-Noord          25648
Amsterdam Nieuw-West     20347
Amsterdam-Zuidoost        7879
Name: comments, dtype: int64

In [96]:
reviews_details = reviews_details.drop(columns='reviewer_name')
reviews_details['date'] = pd.to_datetime(reviews_details['date'])
reviews_details = reviews_details.drop(columns='id')
reviews_details = reviews_details.rename(columns={'listing_id':'id'})

In [103]:
#reviews_details.to_csv('reviews_details_clean.csv')

###### Polarity 

In [110]:
test = reviews_details.iloc[:5,]
test

Unnamed: 0,id,date,reviewer_id,comments,district
0,2818,2009-03-30,10952,Daniel is really cool. The place was nice and ...,Amsterdam-Oost
1,2818,2009-04-24,12798,Daniel is the most amazing host! His place is ...,Amsterdam-Oost
2,2818,2009-05-03,11869,We had such a great time in Amsterdam. Daniel ...,Amsterdam-Oost
3,2818,2009-05-18,14064,Very professional operation. Room is very clea...,Amsterdam-Oost
4,2818,2009-05-25,17977,Daniel is highly recommended. He provided all...,Amsterdam-Oost


In [111]:
test['polarity_tb'] = test['comments'].apply(lambda x: TextBlob(x).sentiment.polarity)
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,id,date,reviewer_id,comments,district,polarity_tb
0,2818,2009-03-30,10952,Daniel is really cool. The place was nice and ...,Amsterdam-Oost,0.140741
1,2818,2009-04-24,12798,Daniel is the most amazing host! His place is ...,Amsterdam-Oost,0.365278
2,2818,2009-05-03,11869,We had such a great time in Amsterdam. Daniel ...,Amsterdam-Oost,0.313681
3,2818,2009-05-18,14064,Very professional operation. Room is very clea...,Amsterdam-Oost,0.451111
4,2818,2009-05-25,17977,Daniel is highly recommended. He provided all...,Amsterdam-Oost,0.222778


### listings_new_2_distr - update

In [9]:
listings_new_2_distr.head()

Unnamed: 0,id,host_id,neighbourhood,latitude,longitude,price,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,district
0,2818,3159,Oostelijk Havengebied - Indische Buurt,52.365755,4.941419,59,2008-09-24,within an hour,100.0,t,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,f,Apartment,Private room,2,1.5,1.0,2.0,Real Bed,"{Internet,Wifi,""Paid parking off premises"",""Bu...",100.0,50.0,1,20.0,3,15,today,17,44,44,44,248,2009-03-30,2018-11-28,97.0,10.0,10.0,10.0,10.0,9.0,10.0,t,strict_14_with_grace_period,f,f,1,2.1,Amsterdam-Oost
1,3209,3806,Westerpark,52.390225,4.873924,160,2008-10-24,within an hour,100.0,f,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,t,Apartment,Entire home/apt,5,1.0,2.0,2.0,Real Bed,"{Internet,Wifi,Kitchen,""Paid parking off premi...",300.0,40.0,2,15.0,4,20,7 weeks ago,0,0,0,47,42,2015-07-31,2018-08-29,96.0,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,f,f,1,1.03,Amsterdam-West
2,20168,59484,Centrum-Oost,52.365087,4.893541,80,2009-12-02,within a few hours,100.0,f,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,t,Townhouse,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,""Paid parking off premises"",...",,,2,0.0,1,1000,today,0,7,24,198,233,2010-03-02,2018-11-30,87.0,9.0,10.0,9.0,9.0,10.0,9.0,f,strict_14_with_grace_period,f,f,2,2.18,Amsterdam-Centrum
3,25428,56142,Centrum-West,52.373114,4.883668,125,2009-11-20,within a few hours,100.0,f,2.0,2.0,"['email', 'phone', 'reviews']",t,f,f,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",300.0,40.0,2,10.0,14,60,2 days ago,2,32,44,141,1,2018-01-21,2018-01-21,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,f,f,2,0.09,Amsterdam-Centrum
4,27886,97647,Centrum-West,52.386727,4.892078,150,2010-03-23,within an hour,100.0,t,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,t,Houseboat,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",0.0,0.0,1,0.0,2,730,today,16,37,54,199,171,2012-01-09,2018-11-25,99.0,10.0,10.0,10.0,10.0,10.0,10.0,t,strict_14_with_grace_period,f,f,1,2.03,Amsterdam-Centrum


In [12]:
# property_type - keep first 11, the rest assign to 'other' 
listings_new_2_distr['property_type'].value_counts()[:12]

Apartment             15582
House                  1523
Townhouse               649
Bed and breakfast       455
Loft                    384
Boat                    372
Condominium             323
Houseboat               225
Guest suite             152
Aparthotel               73
Serviced apartment       63
Other                    51
Name: property_type, dtype: int64

In [13]:
listings_new_2_distr.loc[(listings_new_2_distr['property_type']!= 'Apartment') & (listings_new_2_distr['property_type']!='House')
             & (listings_new_2_distr['property_type']!='Townhouse') & (listings_new_2_distr['property_type']!='Bed and breakfast')
             & (listings_new_2_distr['property_type']!='Loft') & (listings_new_2_distr['property_type']!='Boat')
             & (listings_new_2_distr['property_type']!='Condominium') & (listings_new_2_distr['property_type']!='Houseboat')
             & (listings_new_2_distr['property_type']!='Guest suite') & (listings_new_2_distr['property_type']!='Aparthotel')
             & (listings_new_2_distr['property_type']!='Serviced apartment'),
             'property_type'] = 'Other'

listings_new_2_distr['property_type'].value_counts()

Apartment             15582
House                  1523
Townhouse               649
Bed and breakfast       455
Loft                    384
Boat                    372
Condominium             323
Other                   229
Houseboat               225
Guest suite             152
Aparthotel               73
Serviced apartment       63
Name: property_type, dtype: int64

In [25]:
#listings_new_2_distr.to_csv('listings_new_2_distr.csv')

#### Description

In [160]:
details = pd.read_csv('./Airbnb/listings_details.csv')

In [161]:
details_needed = details[['id','description']]

In [162]:
details_needed['description_lenght'] = details_needed.description.str.len()
details_needed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,id,description,description_lenght
0,2818,Quiet Garden View Room & Super Fast WiFi I'm r...,1000.0
1,3209,You will love our spacious (90 m2) bright apar...,1000.0
2,20168,"Cozy studio on your own private floor, 100% in...",1000.0
3,25428,"This nicely furnished, newly renovated apt is...",1000.0
4,27886,Stylish and romantic houseboat on fantastic hi...,1000.0


In [163]:
details_needed.to_csv('details_needed.csv')

#### Count the canceled bookings

In [152]:
reviews_canc = pd.read_csv('./Airbnb/reviews_details_phrases.csv')
reviews_canc = reviews_canc.drop(columns='Unnamed: 0')

In [153]:
reviews_canc.shape

(293034, 8)

In [155]:
reviews_canc['canceled'] = np.where(reviews_canc['comments'].str.contains('This is an automated posting'), 'yes', 'no')

In [156]:
reviews_canc['canceled'].value_counts()

no     292504
yes       530
Name: canceled, dtype: int64

In [157]:
reviews_canc.head()

Unnamed: 0,id,date,reviewer_id,comments,district,polarity,eng,phrases,canceled
0,2818,2009-03-30,10952,Daniel is really cool. The place was nice and ...,Amsterdam-Oost,0.140741,True,"['daniel', 'quiet neighborhood', 'central stat...",no
1,2818,2009-04-24,12798,Daniel is the most amazing host! His place is ...,Amsterdam-Oost,0.365278,True,"['daniel', 'amazing host', 'comfy bed', 'highly']",no
2,2818,2009-05-18,14064,Very professional operation. Room is very clea...,Amsterdam-Oost,0.451111,True,"['professional operation', 'location', 'helpfu...",no
3,2818,2009-09-06,26343,You can´t have a nicer start in Amsterdam. Dan...,Amsterdam-Oost,0.464773,True,"['amsterdam', 'daniel', 'daniel', 'great sense...",no
4,2818,2009-10-01,40999,Daniel was a fantastic host. His place is calm...,Amsterdam-Oost,0.368452,True,"['daniel', 'fantastic host', 'amazing city', '...",no


In [159]:
total = pd.DataFrame(reviews_canc.groupby('id')['eng'].count().reset_index())
col = ['id', 'total_comments']
total.columns=col
total.head()

Unnamed: 0,id,total_comments
0,2818,114
1,3209,27
2,20168,162
3,25428,1
4,27886,121


In [158]:
yes = reviews_canc[reviews_canc['canceled']=='yes']
yes_2 = pd.DataFrame(yes.groupby('id')['canceled'].count().reset_index())
yes_2.shape

(504, 2)

In [124]:
details_needed_2 = details_needed.merge(total, on='id')

In [129]:
details_needed_3 = details_needed_2.merge(yes_2, on='id')

In [134]:
details_needed_3.head()

Unnamed: 0,id,description,description_lenght,total_comments,canceled
0,3209,You will love our spacious (90 m2) bright apar...,1000.0,27,1
1,28871,In a monumental house right in the center of A...,570.0,155,2
2,29051,because of the city imposing a 4 paying guest ...,979.0,304,1
3,53671,Our room with private bathroom (including show...,1000.0,178,1
4,55256,Room in the old city center near New Market. A...,1000.0,93,1


In [137]:
# calculate the percentage of canceled bookings

details_needed_3['perc_canceled'] = details_needed_3['canceled'] * 100 / details_needed_3['total_comments']
details_needed_3.head()

Unnamed: 0,id,description,description_lenght,total_comments,canceled,perc_canceled
0,3209,You will love our spacious (90 m2) bright apar...,1000.0,27,1,3.703704
1,28871,In a monumental house right in the center of A...,570.0,155,2,1.290323
2,29051,because of the city imposing a 4 paying guest ...,979.0,304,1,0.328947
3,53671,Our room with private bathroom (including show...,1000.0,178,1,0.561798
4,55256,Room in the old city center near New Market. A...,1000.0,93,1,1.075269


In [138]:
details_needed_3.shape

(504, 6)

In [None]:
#details_needed_3.to_csv('details_needed.csv')