## Libraries

In [1]:
import re
import warnings
import datetime
import numpy as np
import pandas as pd
from langdetect import detect, detect_langs
from sklearn.model_selection import train_test_split

pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

## Load data

In [2]:
VITAL_COLS = ['name', 'description', 'price']

In [3]:
df_pra = pd.read_csv('./input/prague_06.csv')
df_ber = pd.read_csv('./input/berlin_06.csv')
df_ath = pd.read_csv('./input/athens_06.csv')
df_mad = pd.read_csv('./input/madrid_06.csv')
df_rom = pd.read_csv('./input/rome_06.csv')
df_lon = pd.read_csv('./input/london_06.csv')
df_par = pd.read_csv('./input/paris_06.csv')

df_pra['city'] = 'Prague'
df_ber['city'] = 'Berlin'
df_ath['city'] = 'Athens'
df_mad['city'] = 'Madrid'
df_rom['city'] = 'Rome'
df_lon['city'] = 'London'
df_par['city'] = 'Paris'

print(f'Prague:\t\t{df_pra.shape}')
print(f'Berlin:\t\t{df_ber.shape}')
print(f'Athens:\t\t{df_ath.shape}')
print(f'Madrid:\t\t{df_mad.shape}')
print(f'Rome:\t\t{df_rom.shape}')
print(f'London:\t\t{df_lon.shape}')
print(f'Paris:\t\t{df_par.shape}')

Prague:		(13768, 106)
Berlin:		(23808, 106)
Athens:		(10414, 106)
Madrid:		(20630, 106)
Rome:		(30836, 106)
London:		(82029, 106)
Paris:		(60822, 106)


In [4]:
df_pra = df_pra.dropna(subset = VITAL_COLS, axis = 0)
df_ber = df_ber.dropna(subset = VITAL_COLS, axis = 0)
df_ath = df_ath.dropna(subset = VITAL_COLS, axis = 0)
df_mad = df_mad.dropna(subset = VITAL_COLS, axis = 0)
df_rom = df_rom.dropna(subset = VITAL_COLS, axis = 0)
df_lon = df_lon.dropna(subset = VITAL_COLS, axis = 0)
df_par = df_par.dropna(subset = VITAL_COLS, axis = 0)

print(f'Prague:\t\t{df_pra.shape}')
print(f'Berlin:\t\t{df_ber.shape}')
print(f'Athens:\t\t{df_ath.shape}')
print(f'Madrid:\t\t{df_mad.shape}')
print(f'Rome:\t\t{df_rom.shape}')
print(f'London:\t\t{df_lon.shape}')
print(f'Paris:\t\t{df_par.shape}')

Prague:		(13633, 106)
Berlin:		(23449, 106)
Athens:		(10296, 106)
Madrid:		(20007, 106)
Rome:		(30578, 106)
London:		(80324, 106)
Paris:		(60087, 106)


## Detect language

In [5]:
def detect_lang(df):
    '''
    Takes as input a DataFrame and returns it with a 
    new column based on the language of the description
    '''
    df['lang'] = ''
    for i, row in df.iterrows():
        try:
            detection_result = detect_langs(str(row['description']))
            top_detection = detection_result[0]
            lang = top_detection.lang
            prob = top_detection.prob
            if lang == 'en' and prob > 0.90:
                df.loc[i, 'lang'] = 'en'
            else:
                df.loc[i, 'lang'] = 'other'
        except:
            df.loc[i, 'lang'] = 'error'
    return df

In [6]:
df_pra = detect_lang(df_pra)
df_ber = detect_lang(df_ber)
df_ath = detect_lang(df_ath)
df_mad = detect_lang(df_mad)
df_rom = detect_lang(df_rom)
df_lon = detect_lang(df_lon)
df_par = detect_lang(df_par)

In [7]:
df_pra = df_pra[df_pra['lang'] == 'en']
df_ber = df_ber[df_ber['lang'] == 'en']
df_ath = df_ath[df_ath['lang'] == 'en']
df_mad = df_mad[df_mad['lang'] == 'en']
df_rom = df_rom[df_rom['lang'] == 'en']
df_lon = df_lon[df_lon['lang'] == 'en']
df_par = df_par[df_par['lang'] == 'en']

print(f'Prague:\t\t{df_pra.shape}')
print(f'Berlin:\t\t{df_ber.shape}')
print(f'Athens:\t\t{df_ath.shape}')
print(f'Madrid:\t\t{df_mad.shape}')
print(f'Rome:\t\t{df_rom.shape}')
print(f'London:\t\t{df_lon.shape}')
print(f'Paris:\t\t{df_par.shape}')

Prague:		(12341, 107)
Berlin:		(15894, 107)
Athens:		(9350, 107)
Madrid:		(9167, 107)
Rome:		(18693, 107)
London:		(79574, 107)
Paris:		(31882, 107)


## Sample data

In [8]:
# to get uniform sample
subsample_size = 9_000

df_pra = df_pra.sample(n = subsample_size)
df_ber = df_ber.sample(n = subsample_size)
df_ath = df_ath.sample(n = subsample_size)
df_mad = df_mad.sample(n = subsample_size)
df_rom = df_rom.sample(n = subsample_size)
df_lon = df_lon.sample(n = subsample_size)
df_par = df_par.sample(n = subsample_size)

## Concatenate dataframes

In [9]:
df = pd.concat([df_pra, df_ber, df_ath, df_mad, df_rom, df_lon, df_par], axis = 0)
df = df.reset_index(drop = True)

df.to_csv('./input/df.csv', index = False)

print(df.shape)

df.head(3)

(63000, 107)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,lang
0,21945200,https://www.airbnb.com/rooms/21945200,20190629051214,2019-06-29,Art Inspired Apartment with Plaza View Balcony,Uncover an oasis of calm and comfort in this m...,Inside Merteuil’s place you’ll find 3 separate...,Uncover an oasis of calm and comfort in this m...,none,Zizkov is a quiet neighborhood close to the ci...,,The apartment is located 2 minutes by foot fro...,"If you book a stay at Mertuil’s place, you’ll ...",I’ll be there to welcome you and walk you thro...,- No excessive noise that will disturb the nei...,,,https://a0.muscache.com/4ea/air/v2//pictures/4...,,52266255,https://www.airbnb.com/users/show/52266255,Martin,2015-12-25,"Prague, Hlavní město Praha, Czech Republic",,within an hour,100%,,t,https://a0.muscache.com/im/pictures/88ad8ce6-c...,https://a0.muscache.com/im/pictures/88ad8ce6-c...,,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,"Praha 3, Hlavní město Praha, Czech Republic",Žižkov,Praha 3,,Prague,Hlavní město Praha,130 00,Prague,"Praha 3, Czech Republic",CZ,Czech Republic,50.08663,14.45482,f,Apartment,Entire home/apt,4,1.5,1.0,1.0,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",,"$1,208.00",,,$0.00,$800.00,2,$500.00,1,1125,1,2,1125,1125,1.4,1125.0,today,t,8,32,58,316,2019-06-29,97,65,2017-12-11,2019-06-27,99.0,10.0,10.0,10.0,10.0,10.0,9.0,f,,,t,f,moderate,f,f,1,1,0,0,5.14,en
1,27961954,https://www.airbnb.com/rooms/27961954,20190629051214,2019-06-29,"Praha 2 - VINOHRADY - COSY, BRIGHT FLAT w/PARKING","Our apartment is bright & spacious, just 12 mi...",,"Our apartment is bright & spacious, just 12 mi...",none,"The apartment is located in the Prague 2, Vino...",If you have any special needs (e.g. vegetarian...,The closest metro/tram station is Náměstí míru...,You will have the whole apartment to yourselves.,,House rules We would like to ask our guests t...,,,https://a0.muscache.com/im/pictures/15946a2d-d...,,2616806,https://www.airbnb.com/users/show/2616806,Štěpán,2012-06-12,"Prague 2, Prague, Czechia",,within an hour,100%,,t,https://a0.muscache.com/im/pictures/user/a1ed9...,https://a0.muscache.com/im/pictures/user/a1ed9...,Prague 2,2.0,2.0,"['email', 'phone', 'facebook', 'google', 'revi...",t,t,"Praha, Czech Republic",Prague 2,Praha 2,,Prague,,120 00,Prague,"Praha, Czech Republic",CZ,Czech Republic,50.07515,14.44263,f,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{Kitchen,""Paid parking off premises"",Heating,W...",,$984.00,,,$0.00,$560.00,1,$0.00,3,1125,3,3,1125,1125,3.0,1125.0,2 weeks ago,t,0,0,0,0,2019-06-29,36,36,2018-09-03,2019-06-16,96.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,,t,f,moderate,f,f,2,2,0,0,3.6,en
2,17568378,https://www.airbnb.com/rooms/17568378,20190629051214,2019-06-29,Flashback to Louis 1896,"Our place is close to Wenceslaw square, Main T...",,"Our place is close to Wenceslaw square, Main T...",none,,We offer coffee (espresso or instant) and tea....,The closest subway and tram station I.P.Pavlov...,,Me and my boyfriend are very flexible.,- Quiet house from 10:00 pm (residential house),,,https://a0.muscache.com/im/pictures/b47932f0-a...,,50309582,https://www.airbnb.com/users/show/50309582,Gabriela,2015-12-01,"Prague, Hlavní město Praha, Czech Republic","Jsem Gabriela, je mi 25 let a pracuji v bance....",within an hour,100%,,t,https://a0.muscache.com/im/pictures/user/e12a2...,https://a0.muscache.com/im/pictures/user/e12a2...,Prague 2,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,"Prague, Czech Republic",Nové Město,Praha 2,,Prague,,120 00,Prague,"Prague, Czech Republic",CZ,Czech Republic,50.07214,14.4305,t,Apartment,Private room,2,1.5,1.0,1.0,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",,"$1,297.00",,,,,1,$0.00,2,1125,2,2,1125,1125,2.0,1125.0,6 weeks ago,t,0,0,0,0,2019-06-29,136,71,2017-04-16,2019-06-27,98.0,10.0,10.0,10.0,10.0,9.0,10.0,f,,,t,f,moderate,f,f,2,0,2,0,5.07,en


## Preprocess data

In [5]:
df = pd.read_csv('./input/df.csv')

NUM_COLS = [
    'id', 'price', 'experiences_offered', 'host_since', 'host_location', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost', 'host_neighbourhood', 'lang', 'reviews_per_month',
    'host_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
    'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'zipcode', 'latitude', 'longitude',
    'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities',
    'square_feet', 'guests_included', 'minimum_nights', 'maximum_nights', 'has_availability', 'availability_30',
    'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm',
    'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value',
    'requires_license', 'instant_bookable', 'is_business_travel_ready', 'cancellation_policy', 'require_guest_profile_picture',
    'require_guest_phone_verification', 'calculated_host_listings_count'  
]

TEXT_COLS = ['name', 'description', 'space', 'house_rules', 'access', 'interaction', 'neighborhood_overview', 'notes', 'transit']

df = df[NUM_COLS + TEXT_COLS]

In [6]:
def na_report(df):
    '''Prints info about missing data for all columns of a dataframe'''
    counts = df.isna().sum().sort_values(ascending = False)
    percents = (df.isna().sum() / df.isna().count()*100).sort_values(ascending = False)
    res = pd.concat([counts, percents], axis = 1, keys = ['Total', 'Percent'])
    return res

def clean_string(string):
    '''Removes unnecessary characters from string'''
    if pd.isna(string):
        return ''
    string = str(string)
    string = re.sub("[^\dA-Za-z\s]", "", string)
    return string

def one_hot_encode(df, cols, na_as_category = False):
    '''One-hot-encode specific columns of a dataframe'''
    df = pd.get_dummies(df, columns = [cols], dummy_na = na_as_category)
    return df

def clean_price(price):
    '''Remove currency signs from string and return a float'''
    price = str(price)
    price = re.sub("[^.\d]", "", price)
    try:
        price = float(price)
    except ValueError:
        return np.nan
    if (price <= 0.0):
        return np.nan
    return price

def get_month_delta(val):
    '''Compute months between now and input time'''
    if pd.isna(val):
        return np.nan
    now = datetime.datetime.now()
    date_object = pd.to_datetime(val)
    delta = int((now - date_object).days // 30)
    return delta

def get_day_delta(val):
    '''Compute days between now and input time'''
    if pd.isna(val):
        return np.nan
    now = datetime.datetime.now()
    date_object = pd.to_datetime(val)
    delta = int((now - date_object).days)
    return delta

def manh_dist(apartment_lat,apartment_long, centre_lat, centre_long): 
    '''Compute manhattan distance between two pairs of coordinates'''
    dist = np.abs(centre_lat - apartment_lat) + np.abs(centre_long - apartment_long)  
    dist = float(dist)
    return dist

def dist_to_centre(df):
    '''Compute distances to city centre for a DataFrame'''
    df['dist_to_centre'] = float(0.0)
    for i, row in df.iterrows():
        city = row['city']
        df.loc[i, 'dist_to_centre'] = manh_dist(
            row['latitude'],
            row['longitude'],
            centres_dict[city][0],
            centres_dict[city][1]
        )
    return df

centres_dict = {
    'Prague': (50.073658, 14.418540),
    'Berlin': (52.520008, 13.404954),
    'London': (51.509865, -0.118092),
    'Paris': (48.864716, 2.349014),
    'Athens': (37.983810, 23.727539),
    'Madrid': (40.416775, -3.703790),
    'Rome': (41.902782, 12.496366),
}

def get_text_length(text):
    '''Return text length'''
    if pd.isna(text):
        return np.nan
    text = clean_string(str(text))
    length = len(text.split(' '))
    length = int(length)
    return length   

def clean_percent(val):
    '''Cleans value from % signs and scales to 0...1'''
    if pd.isna(val):
        return np.nan
    val = str(val)
    val = val.replace("%", "")
    val = float(val) / 100.0
    return val

def encode_property_type(val):
    if pd.isna(val):
        return np.nan
    val = str(val)
    if val in top_property_type:
        return val
    return 'other'

top_property_type = df['property_type'].value_counts().index[0]

cols_to_encode = {
    'host_is_superhost': {'f': int(0), 't': int(1)},
    'host_has_profile_pic': {'f': int(0), 't': int(1)},
    'host_identity_verified': {'f': int(0), 't': int(1)},
    'is_location_exact': {'f': int(0), 't': int(1)},
    'requires_license': {'f': int(0), 't': int(1)},
    'instant_bookable': {'f': int(0), 't': int(1)},
    'has_availability': {'f': int(0), 't': int(1)},
    'is_business_travel_ready': {'f': int(0), 't': int(1)},
    'require_guest_profile_picture': {'f': int(0), 't': int(1)},
    'require_guest_phone_verification': {'f': int(0), 't': int(1)},
    'room_type': {
        'Shared room': int(0), 'Private room': int(1), 'Entire home/apt': int(2)
    },
    'host_response_time': {
        'within an hour': int(0), 'within a few hours': int(1),
        'within a day': int(2), 'a few days or more': int(3)
    },
    'cancellation_policy': {
        'flexible': int(0), 'flexible_new': int(0),
        'moderate': int(1), 'moderate_new': int(1), 'luxury_moderate': int(1),
        'strict': int(2), 'strict_new': int(2), 'strict_14_with_grace_period': int(2),
        'super_strict_30': int(3), 'super_strict_30_new': int(3), 'super_strict_60': int(3), 'super_strict_60_new': int(3)
    }   
}

In [6]:
na_report(df[NUM_COLS]).head()

Unnamed: 0,Total,Percent
host_acceptance_rate,63000,100.0
square_feet,61409,97.474603
neighbourhood_group_cleansed,45000,71.428571
host_response_time,14532,23.066667
host_response_rate,14532,23.066667


In [7]:
df = df.drop(['host_acceptance_rate', 'square_feet', 'neighbourhood_group_cleansed'], axis = 1)

In [8]:
df['num_host_verifications'] = df['host_verifications'].apply(lambda x: int(len(x.split(' '))))

In [9]:
verifs_dict = {}

# collect
for i, row in df.iterrows():
    verifs_found = [re.sub(r"[^\w]", "", str(elem)) for elem in row['host_verifications'].split(', ')]
    for elem in verifs_found:
        if elem in verifs_dict:
            verifs_dict[elem] += 1
        else:
            verifs_dict[elem] = 1

# sort
verifs_dict = {key: val for key, val in sorted(verifs_dict.items(), key = lambda item: item[1], reverse = True)}
verifs_dict

{'phone': 62293,
 'email': 59739,
 'reviews': 46702,
 'government_id': 44176,
 'jumio': 39509,
 'offline_government_id': 26752,
 'selfie': 17214,
 'identity_manual': 16582,
 'facebook': 10300,
 'work_email': 9009,
 'google': 4247,
 'manual_offline': 1671,
 'manual_online': 634,
 'kba': 548,
 'sent_id': 140,
 '': 108,
 'None': 106,
 'zhima_selfie': 47,
 'weibo': 23,
 'sesame': 7,
 'sesame_offline': 7,
 'photographer': 6}

In [10]:
top_verif_types = list(verifs_dict.keys())[2:10]

# prepare new columns
for verif_type in top_verif_types:
    df[f'host_verif_{verif_type}'] = int(0)

# populate new columns
for i, row in df.iterrows():
    verifs_found = str(row['host_verifications'])
    for verif_type in top_verif_types:
        df.loc[i, f'host_verif_{verif_type}'] = int(1) if verif_type in verifs_found else int(0)
        
# drop original
df = df.drop(['host_verifications'], axis = 1)

In [11]:
df['num_amenities'] = df['amenities'].apply(lambda x: int(len(x.split(' '))))

In [8]:
amens_dict = {}

# collect
for i, row in df.iterrows():
    amens_found = [re.sub(r"[^\w\s]", "", str(elem)).replace(" ", "_") for elem in row['amenities'].split(',')]
    for elem in amens_found:
        if elem in amens_dict:
            amens_dict[elem] += 1
        else:
            amens_dict[elem] = 1

# sort
amens_dict = {key: val for key, val in sorted(amens_dict.items(), key = lambda item: item[1], reverse = True)}
amens_dict

# print top 50
i = 0
for amen, freq in amens_dict.items(): 
    if i < 50:
        print(f"{amen}\t\t\t{freq}")
        i += 1

Wifi		60922
Essentials		59295
Heating		58542
Kitchen		57837
Hangers		50117
Hair_dryer		49294
Washer		48357
Iron		45554
Shampoo		44501
TV		44489
Laptop_friendly_workspace		43285
Hot_water		36324
Elevator		29642
Smoke_detector		28232
Familykid_friendly		27275
Refrigerator		27026
Dishes_and_silverware		24987
Bed_linens		24695
Air_conditioning		23962
Cooking_basics		23427
Stove		21094
Internet		20546
Coffee_maker		20449
Oven		20366
Host_greets_you		20346
First_aid_kit		20282
Fire_extinguisher		19494
Microwave		18880
Long_term_stays_allowed		17899
No_stairs_or_steps_to_enter		17307
Carbon_monoxide_detector		16595
Dryer		14549
Lock_on_bedroom_door		14545
Extra_pillows_and_blankets		13892
Paid_parking_off_premises		13236
Dishwasher		12574
Luggage_dropoff_allowed		12505
Cable_TV		11301
Private_entrance		11266
Buzzerwireless_intercom		11141
Free_street_parking		10575
Patio_or_balcony		9771
Self_checkin		9154
Pets_allowed		8921
Smoking_allowed		8714
translation_missing_enhosting_amenity_50		7687

In [13]:
top_amen_types = [
    'Washer', 'Hot_water', 'TV', 'Elevator', 'Smoke_detector', 'Breakfast',
    'Familykid_friendly', 'Refrigerator', 'Air_conditioning', 'Dishwasher',
    'Free_street_parking', 'Pets_allowed', 'Smoking_allowed', 'Self_checkin'    
]

# prepare new columns
for amen_type in top_amen_types:
    df[f'amenity_{amen_type}'] = int(0)

# populate new columns
for i, row in df.iterrows():
    amens_found = str(row['amenities']).replace(" ", "_").replace("/", "").replace("-", "")
    for amen_type in top_amen_types:
        df.loc[i, f'amenity_{amen_type}'] = int(1) if amen_type in amens_found else int(0)
        
# drop original
df = df.drop(['amenities'], axis = 1)

In [14]:
df['price'] = df['price'].apply(clean_price)

# fix czech currency to dollars
df.loc[df['city'] == 'Prague', 'price'] = df.loc[df['city'] == 'Prague', 'price'].apply(lambda x: 0.04389*x)

df['log_price'] = df['price'].apply(np.log)

In [15]:
na_report(df[['price', 'log_price']])

Unnamed: 0,Total,Percent
log_price,7,0.011111
price,7,0.011111


In [16]:
df = df.dropna(subset = ['price', 'log_price'], axis = 0)

In [17]:
df = df.replace(cols_to_encode)

In [18]:
df['host_multihost'] = df['calculated_host_listings_count'].apply(lambda num: int(1) if num > 1 else int(0))

df = df.drop(['calculated_host_listings_count'], axis = 1)

In [19]:
df['as_host_months'] = df['host_since'].apply(get_month_delta)
df['days_since_last_review'] = df['last_review'].apply(get_day_delta)
df['months_since_first_review'] = df['first_review'].apply(get_month_delta)

df = df.drop(['host_since', 'last_review', 'first_review'], axis = 1)

In [20]:
# df['bathrooms'] = df['bathrooms'].astype(int)
# df['bedrooms'] = df['bedrooms'].astype(int)
# df['beds'] = df['beds'].astype(int)
# df['cancellation_policy'] = df['cancellation_policy'].astype(int)

In [21]:
df['host_response_rate'] = df['host_response_rate'].apply(clean_percent)

In [22]:
df = dist_to_centre(df)

In [23]:
df = one_hot_encode(df, 'city')

In [24]:
df['property_type'] = df['property_type'].apply(encode_property_type)

df = one_hot_encode(df, 'property_type')

df = df.drop(['property_type_other'], axis = 1)

In [25]:
df = df.drop([
    'id',
    'bed_type',
    'host_neighbourhood',
    'neighbourhood',
    'neighbourhood_cleansed',
    'zipcode',
    'host_location',
    'lang',
    'experiences_offered',
    'host_has_profile_pic',
    'has_availability',
    'is_business_travel_ready',
    'require_guest_profile_picture',
    'require_guest_phone_verification'
], axis = 1)

In [26]:
print(df.shape)

df.head(3)

(62993, 81)


Unnamed: 0,price,host_response_time,host_response_rate,host_is_superhost,reviews_per_month,host_listings_count,host_identity_verified,latitude,longitude,is_location_exact,room_type,accommodates,bathrooms,bedrooms,beds,guests_included,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,instant_bookable,cancellation_policy,name,description,space,house_rules,access,interaction,neighborhood_overview,notes,transit,num_host_verifications,host_verif_reviews,host_verif_government_id,host_verif_jumio,host_verif_offline_government_id,host_verif_selfie,host_verif_identity_manual,host_verif_facebook,host_verif_work_email,num_amenities,amenity_Washer,amenity_Hot_water,amenity_TV,amenity_Elevator,amenity_Smoke_detector,amenity_Breakfast,amenity_Familykid_friendly,amenity_Refrigerator,amenity_Air_conditioning,amenity_Dishwasher,amenity_Free_street_parking,amenity_Pets_allowed,amenity_Smoking_allowed,amenity_Self_checkin,log_price,host_multihost,as_host_months,days_since_last_review,months_since_first_review,dist_to_centre,city_Athens,city_Berlin,city_London,city_Madrid,city_Paris,city_Prague,city_Rome,property_type_Apartment
0,53.01912,0.0,1.0,1.0,5.14,1.0,0.0,50.08663,14.45482,0,2,4,1.5,1.0,1.0,2,1,1125,8,32,58,316,97,65,99.0,10.0,10.0,10.0,10.0,10.0,9.0,0,1,1.0,Art Inspired Apartment with Plaza View Balcony,Uncover an oasis of calm and comfort in this m...,Inside Merteuil’s place you’ll find 3 separate...,- No excessive noise that will disturb the nei...,"If you book a stay at Mertuil’s place, you’ll ...",I’ll be there to welcome you and walk you thro...,Zizkov is a quiet neighborhood close to the ci...,,The apartment is located 2 minutes by foot fro...,8,1,1,1,1,1,1,0,0,56,1,1,1,1,1,0,0,1,1,1,0,0,0,1,3.970653,0,54.0,343.0,30.0,0.049252,0,0,0,0,0,1,0,1
1,43.18776,0.0,1.0,1.0,3.6,2.0,1.0,50.07515,14.44263,0,2,2,1.0,1.0,1.0,1,3,1125,0,0,0,0,36,36,96.0,10.0,10.0,10.0,10.0,10.0,10.0,0,1,1.0,"Praha 2 - VINOHRADY - COSY, BRIGHT FLAT w/PARKING","Our apartment is bright & spacious, just 12 mi...",,House rules We would like to ask our guests t...,You will have the whole apartment to yourselves.,,"The apartment is located in the Prague 2, Vino...",If you have any special needs (e.g. vegetarian...,The closest metro/tram station is Náměstí míru...,10,1,1,1,0,1,1,1,1,25,1,1,0,0,0,0,0,1,0,0,0,0,0,0,3.765557,1,97.0,354.0,21.0,0.025582,0,0,0,0,0,1,0,1
2,56.92533,0.0,1.0,1.0,5.07,2.0,1.0,50.07214,14.4305,1,1,2,1.5,1.0,1.0,1,2,1125,0,0,0,0,136,71,98.0,10.0,10.0,10.0,10.0,9.0,10.0,0,1,1.0,Flashback to Louis 1896,"Our place is close to Wenceslaw square, Main T...",,- Quiet house from 10:00 pm (residential house),,Me and my boyfriend are very flexible.,,We offer coffee (espresso or instant) and tea....,The closest subway and tram station I.P.Pavlov...,9,1,1,1,1,1,1,0,1,21,1,0,1,0,0,1,0,0,1,0,0,0,0,0,4.04174,1,54.0,343.0,38.0,0.013478,0,0,0,0,0,1,0,1


In [27]:
na_report(df).head(15)

Unnamed: 0,Total,Percent
notes,34002,53.977426
access,26560,42.163415
house_rules,24857,39.45994
interaction,24127,38.301081
neighborhood_overview,19976,31.71146
transit,19629,31.160605
host_response_time,14531,23.067642
host_response_rate,14531,23.067642
space,13962,22.164367
review_scores_value,12055,19.137047


## Split data

In [28]:
df_train, df_test = train_test_split(
    df,
    test_size = 0.2
)

print(df_train.shape)
print(df_test.shape)

df_train.to_csv('./input/df_pre_train.csv', index = False)
df_test.to_csv('./input/df_pre_test.csv', index = False)

(50394, 81)
(12599, 81)


### Sub-sample train set for deep learning models

In [11]:
df_train = df_train.sample(n = 0.25*len(d_train))

df_train, df_test = train_test_split(
    df_train,
    test_size = 0.2
)

print(df_train.shape)
print(df_test.shape)

df_train.to_csv('./input/df_pre_train_deep.csv', index = False)
df_test.to_csv('./input/df_pre_test_deep.csv', index = False)

(10400, 81)
(2600, 81)
