# AirBnB Regression

In [None]:
# import the libraries
%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# sklearn :: evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# convert scientific notation to decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style('whitegrid')

## Load the Data

In [None]:
df_airbnb = pd.read_csv('data/train.csv')

In [None]:
print(df_airbnb.columns)
df_airbnb.head()

____________________________________
## Data Cleaning

### Add Columns

In [None]:
#Amenities

unique_amenities = []

for props in df_airbnb['amenities']:
    props = props.replace('"','').replace('{','').replace('}','').replace('/',' or ')
    props = props.split(',')
    
    for p in props:
        if (p not in unique_amenities):
            unique_amenities.append(p)
            
print(len(unique_amenities))    
print(unique_amenities)

In [None]:
import re

r = re.compile("translation missing*.")
to_remove = list(filter(r.match, unique_amenities))
print(to_remove)

for i in unique_amenities:
    df_airbnb[i] = 0

df_airbnb.head()

### Missing Values

___________________________________________

In [None]:
pd.set_option('display.max_colwidth', 0)
df_airbnb[['property_type','room_type','bed_type','cancellation_policy',
      'cleaning_fee','city','host_has_profile_pic','host_identity_verified',
     'host_response_rate','instant_bookable','neighbourhood']].agg(['unique']).transpose()

______________________________
## Feature Engineering

In [None]:
# handle missing values
df_train = df_train[X_columns + y_column]
print(df_train.shape)
df_train = df_train.fillna(0.0) # probably not a good idea for 'review_scores_rating'
print(df_train.shape)

## 1. Zipcode

In [None]:
# zipcode to numerical columns
df_zipcode = pd.get_dummies(df_airbnb['zipcode'])
print(df_zipcode.shape)

df = pd.concat([df_airbnb, df_zipcode], axis=1).fillna(0.0)
print(list(df.columns))
df.head()

## 2. 

## 3. Select the Columns

In [92]:
X_columns = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'review_scores_rating']
y_column = ['log_price']
df_train = df_train[X_columns + y_column]

NameError: name 'df_train' is not defined

____________________________________________________
## Train the Models

In [None]:
# split the data using sklearn

threshold = 0.8
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

In [None]:
# train a linear regression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

________________________________
## Evaluate the Models

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE', round(rmse, 2))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot(range(0,10), range(0,10), '--r', alpha=0.3, label='Line1')
plt.title('Linear Regression')
plt.xlabel('True Value')
plt.ylabel('Predict Value')
plt.show()

___________________________
## Prepare Submission

In [None]:
f_prediction = df_test[X_columns].fillna(0.0)
df_test['log_price'] = model.predict(df_prediction)
df_test[['id', 'log_price']]

In [None]:
df_test[['id', 'log_price']].to_csv('submission_linear_regression.csv', index=False)