# Airbnb Regression Test

In [1]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso

# convert scientific notation to decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style('whitegrid')

__________________
## Load Data

In [2]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [3]:
df.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,13662370,3.81,House,Private room,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.5,Real Bed,strict,True,...,41.85,-87.68,Pilsen Arts Community Custom Home,Pilsen,17,97.0,https://a0.muscache.com/im/pictures/81318153/a...,60608,1.0,1.0
1,4765892,4.94,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,2.0,Real Bed,strict,True,...,34.07,-118.25,Apartment 5 minutes from DTLA & Dodger Stadium,Echo Park,2,100.0,https://a0.muscache.com/im/pictures/aa00250e-0...,90012,1.0,1.0
2,21169968,4.94,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.7,-73.92,"Brand New Huge 2bdr apartment(L,M train 2 min)",Bushwick,25,88.0,https://a0.muscache.com/im/pictures/d9220535-c...,11237,2.0,3.0
3,7939196,4.87,Apartment,Entire home/apt,"{""Cable TV"",Internet,""Wireless Internet"",""Air ...",6,1.0,Real Bed,strict,True,...,40.74,-73.99,Grande Super Large APT !!!,Flatiron District,12,82.0,,10010,1.0,3.0
4,18161036,3.66,House,Private room,"{Internet,""Wireless Internet"",""Air conditionin...",2,1.0,Real Bed,flexible,True,...,34.05,-117.73,Private Cozy and Clean Rooms in Pomona,,2,100.0,https://a0.muscache.com/im/pictures/e0c9b2f9-a...,91766,1.0,1.0


In [4]:
df_test.head()

Unnamed: 0,id,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,986942,Apartment,Private room,"{Internet,""Wireless Internet"",""Air conditionin...",2,1.0,Real Bed,flexible,False,NYC,...,40.83,-73.95,En suite room Upper West Side,Hamilton Heights,1,100.0,https://a0.muscache.com/im/pictures/90125799/6...,10031.0,1.0,1.0
1,16436737,House,Private room,"{Internet,""Wireless Internet"",""Air conditionin...",4,1.5,Real Bed,moderate,True,DC,...,38.92,-77.03,Cozy or King BR in a grand victorian,Columbia Heights,146,96.0,https://a0.muscache.com/im/pictures/64128167/5...,20009.0,1.0,2.0
2,18209634,Apartment,Private room,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",1,1.0,Real Bed,flexible,True,NYC,...,40.67,-73.95,COMFORTABLE & COZY-2 STOPS BARCLAY,Crown Heights,10,92.0,https://a0.muscache.com/im/pictures/56585377/e...,11225.0,1.0,1.0
3,15027024,Apartment,Private room,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",2,1.0,Real Bed,flexible,False,NYC,...,40.67,-73.95,Sunny & Colorful Private BR in Crown Heights,Crown Heights,0,,https://a0.muscache.com/im/pictures/d76d0e9a-e...,11225.0,1.0,1.0
4,18074243,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",6,1.0,Real Bed,strict,True,NYC,...,40.74,-73.91,Lovely & Cozy Apartment in Queens!,Woodside,45,93.0,https://a0.muscache.com/im/pictures/435d8af0-f...,11377.0,2.0,3.0


_________________________
# Data Cleaning

## Change Format

### 1. True & False

In [5]:
# Handel True and False values

def change_tf_format(df_airbnb):
    
    df_airbnb=df_airbnb.replace(to_replace='t', value=True)
    df_airbnb=df_airbnb.replace(to_replace='f', value=False)
    

### 2. Dates

In [6]:
# Date Columns

import time
import datetime

def format_date(date_string):

    if pd.isnull(date_string):
        return np.nan
    
    if type(date_string) != str:
        return date_string
    
    date_timestamp = time.mktime(datetime.datetime.strptime(date_string, "%m/%d/%Y").timetuple())    
    return date_timestamp

### 3. Clean & Split Amenities

In [7]:
# amenities
# Reference: https://www.kaggle.com/naamaavi/airbnb-price-prediction-regression-project

def clean_split_string(s):
    
    for c in ['"','{','}','[',']']:
        if (c in s):
            s = s.replace(c,'')
        
    for c in ['/',':',' ','-','.','&',')','(','\'','/']:
        if (c in s):
            s = s.replace(c,'_')
     
    if (',' not in s):
        return [s] #single amenity as a list
    
    return s.split(',') #return list of amenities

In [8]:
def clean_data(df_airbnb):
    
    # True & False
    change_tf_format(df_airbnb)
    
    # Dates
    df_airbnb['first_review'] = df_airbnb['first_review'].apply(format_date)
    df_airbnb['last_review'] = df_airbnb['last_review'].apply(format_date)
    df_airbnb['host_since'] = df_airbnb['host_since'].apply(format_date)
        

## Missing Values

In [9]:
print("Train Data:\n" , df.isnull().sum())
print('')
print("Test Data:\n" , df_test.isnull().sum())

Train Data:
 id                            0
log_price                     0
property_type                 0
room_type                     0
amenities                     0
accommodates                  0
bathrooms                   133
bed_type                      0
cancellation_policy           0
cleaning_fee                  0
city                          0
description                   0
first_review              10856
host_has_profile_pic        127
host_identity_verified      127
host_response_rate        12551
host_since                  127
instant_bookable              0
last_review               10826
latitude                      0
longitude                     0
name                          0
neighbourhood              4675
number_of_reviews             0
review_scores_rating      11450
thumbnail_url              5629
zipcode                     645
bedrooms                     63
beds                         85
dtype: int64

Test Data:
 id                           0
pr

### 1. Review scores rating

In [10]:
def mv_review_scores(df_airbnb):
    
    # Group by Highest Correlation Column
    reviews_mean_rsr = df_airbnb.groupby('number_of_reviews')['review_scores_rating'].mean()
    
    # Fill the missing values
    for idx in df_airbnb.index[df_airbnb['review_scores_rating'].isnull()]:
        n_reviews = df_airbnb.loc[idx,'review_scores_rating']

        if n_reviews == 0: # most of the missing values are because there aren't any reviews to calculate the rate from.
            df_airbnb.at[idx,'review_scores_rating'] = 0

        elif n_reviews > 0:
            df_airbnb.at[idx,'review_scores_rating'] = reviews_mean_rsr.iloc[int(n_reviews)]

        else: #If it's null
            df_airbnb.at[idx,'review_scores_rating'] = df_airbnb['review_scores_rating'].mean()

### 2. Bedrooms

In [11]:
#Bedrooms

def mv_bedrooms(df_airbnb):
    
    acc_med_bedrooms = df_airbnb.groupby('accommodates', sort=False)['bedrooms'].median()
    
    for idx in df_airbnb.index[df_airbnb['bedrooms'].isnull()]:
        n_acc = df_airbnb.loc[idx,'accommodates']

        if n_acc > 0:
            df_airbnb.at[idx,'bedrooms'] = acc_med_bedrooms.iloc[int(n_acc)]

        else: # If 'accommodates' is null
            df_airbnb.at[idx,'bedrooms'] = df_airbnb['bedrooms'].median()


### 3. Bathrooms

In [12]:
#Bathrooms

import math

def mv_bathrooms(df_airbnb):
    
    bedrooms_med_baths = df_airbnb.groupby('bedrooms', sort=False)['bathrooms'].median()
    
    for idx in df_airbnb.index[df_airbnb['bathrooms'].isnull()]:
        n_bedrms = df_airbnb.loc[idx,'bedrooms']

        #if(n_bedrms > 0): #replace by the median number of bathrooms
        df_airbnb.at[idx,'bathrooms'] = bedrooms_med_baths.iloc[int(n_bedrms)]

        #else: #replace by the median bathroom
            #df_airbnb.at[idx,'bathrooms'] = df_airbnb['bathrooms'].median()


In [13]:
def fill_missing_values(df_airbnb):
    
    mv_review_scores(df_airbnb)
    mv_bedrooms(df_airbnb)
    mv_bathrooms(df_airbnb)
    

_________________________________
## Feature Engineering

In [14]:
# Amenities:

def get_amenities(df_airbnb):
    
    unique_amenities = []

    for idx, a_list in enumerate(df_airbnb['amenities']):        
        for a in a_list:
            if (a not in df_airbnb.columns):
                unique_amenities += [a]
                df_airbnb[a] = 0
            
            df_airbnb.at[idx, a] = 1
            
    return unique_amenities

In [15]:
# Dummies for categorical columns

def get_dum(df_airbnb):
    categorical=['room_type','bed_type','cancellation_policy', 'city']
    df_dummies=pd.get_dummies(df_airbnb[categorical])
    new_df = pd.concat([df_airbnb, df_dummies], axis=1)
    
    return new_df, df_dummies

________________________
## Apply Changes

In [16]:
# For training data

clean_data(df)
df['amenities'] = df['amenities'].apply(clean_split_string)
fill_missing_values(df)
unique_am = get_amenities(df)
df, df_dum = get_dum(df)


In [18]:
df.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee',
       ...
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60', 'city_Boston', 'city_Chicago',
       'city_DC', 'city_LA', 'city_NYC', 'city_SF'],
      dtype='object', length=174)

In [19]:
# For testing data

#clean_data(df_test)
fill_missing_values(df_test)
df_test['amenities'] = df_test['amenities'].apply(clean_split_string)
test_unique_am = get_amenities(df_test)
df_test, df_dummies_test = get_dum(df_test)


In [20]:
df_test.columns

Index(['id', 'property_type', 'room_type', 'amenities', 'accommodates',
       'bathrooms', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city',
       ...
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60', 'city_Boston', 'city_Chicago',
       'city_DC', 'city_LA', 'city_NYC', 'city_SF'],
      dtype='object', length=176)

______________________________
## Train Data

In [26]:
X_columns = ['bathrooms', 'accommodates','number_of_reviews'] + list(set(unique_am) & set(test_unique_am))

In [27]:
X_columns += list(df_dum.columns)
y_column = ['log_price']

df_train = df[X_columns + y_column]
print(df_train.shape)

(51000, 147)


In [28]:
# Train

X_train = df_train[X_columns]
y_train = df_train[y_column]

In [29]:
# Test
X_test = df_test[X_columns]
print(X_test.shape)

(23111, 146)


______________
## Train and Test

In [30]:
# Linear Regression

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
df_test['log_price'] = lr_model.predict(X_test)

df_test[['id', 'log_price']].to_csv('submission_v1.csv', index=False)

NameError: name 'LinearRegression' is not defined

In [31]:
# Gradient Boosting

gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)
df_test['log_price'] = gb_model.predict(X_test)

df_test[['id', 'log_price']].to_csv('submission_v3.csv', index=False)

  y = column_or_1d(y, warn=True)
