# Airbnb Regression Test

In [1]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

# sklearn :: evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# convert scientific notation to decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style('whitegrid')

__________________
## Load Data

_________________________
## Change Format

In [2]:
def change_tf_format(df_airbnb):
    df_airbnb=df_airbnb.replace(to_replace='t', value=True)
    df_airbnb=df_airbnb.replace(to_replace='f', value=False)
    df_airbnb.head()
    
    return df_airbnb

______________________
## Missing Values

### 1. Review scores rating

In [None]:

rsr_group = df_airbnb.groupby('number_of_reviews')['review_scores_rating']
df_airbnb[df_airbnb['review_scores_rating'].isnull()]['number_of_reviews'].value_counts()


#### Most of the missing values are because there aren't any reviews to calculate the rate from.

In [None]:

for r in df_airbnb.index[df_airbnb['review_scores_rating'].isnull()]:
    n_reviews = df_airbnb.loc[idx,'review_scores_rating']
    rsr_df['review_scores_rating'].fillna(rsr_group[n])
    #df_airbnb['review_scores_rating'] = rsr_df['review_scores_rating']

    
#df_airbnb['review_scores_rating'] = df_airbnb['review_scores_rating'].fillna(df_airbnb['number_of_reviews']*df_airbnb['review_scores_rating'].mean())


### 2. Bedrooms

In [5]:
#Bedrooms

def mv_bathrooms(df_airbnb):
    
    acc_med_bedrooms = df_airbnb.groupby('accommodates', sort=False)['bedrooms'].median()
    
    for idx in df_airbnb.index[df_airbnb['bedrooms'].isnull()]:
        n_acc = df_airbnb.loc[idx,'accommodates']
        df_airbnb.at[idx,'bedrooms'] = acc_med_bedrooms.iloc[int(n_acc)]
    
    return df_airbnb


### 3. Bathrooms

In [6]:
#Bathrooms

import math

def mv_bathrooms(df_airbnb):
    
    bedrooms_med = df_airbnb.groupby('bedrooms', sort=False)['bathrooms'].median()
    
    for idx in df_airbnb.index[df_airbnb['bathrooms'].isnull()]:
        n_bedrms = df_airbnb.loc[idx,'bedrooms']
        if(n_bedrms > 0): #replace by the median
            df_airbnb.at[idx,'bathrooms'] = bedrooms_med.iloc[int(n_bedrms)]
        
        else: #replace by the median bathroom
            df_airbnb.at[idx,'bathrooms'] = math.floor(df_airbnb['bathrooms'].mean())
        
    return df_airbnb


_________________________________
## Feature Engineering

In [7]:
def get_dum(df_airbnb):
    categorical=['room_type','bed_type','cancellation_policy', 'city']
    df_dummies=pd.get_dummies(df_airbnb[categorical])
    new_df = pd.concat([df_airbnb, df_dummies], axis=1)
    
    return new_df,df_dummies

________________________
## Load Data

In [8]:
df = pd.read_csv('data/train.csv')

df = change_tf_format(df)
df = mv_bathrooms(df)
df, df_dummies = get_dum(df)


df_test = pd.read_csv('data/test.csv')
df_test = change_tf_format(df_test)
df_test = mv_bathrooms(df_test)
df_test, df_dummies_test = get_dum(df_test)


______________________________
## Train Data

In [12]:
X_columns = ['bathrooms', 'accommodates','number_of_reviews'] + list(df_dummies.columns)
y_column = ['log_price']

# handle missing values
df_train = df[X_columns + y_column]
print(df_train.shape)

(51000, 23)


In [13]:
X_train = df_train[X_columns]
y_train = df_train[y_column]

## Test Data

In [14]:
#t_columns = ['bathrooms', 'accommodates','number_of_reviews'] + list(df_dummies.columns)
df_prediction = df_test[X_columns]

______________
## Train and Test

In [15]:
# Linear Regression

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
df_test['log_price'] = lr_model.predict(df_prediction)

df_test[['id', 'log_price']].to_csv('submission_v1.csv', index=False)

In [16]:
# Gradient Boosting

gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)
df_test['log_price'] = gb_model.predict(df_prediction)

df_test[['id', 'log_price']].to_csv('submission_v2.csv', index=False)

  y = column_or_1d(y, warn=True)


______________________
## Prepare Submission

In [17]:
df_test[['id', 'log_price']].to_csv('submission_v1.csv', index=False)