In [3]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 18.5MB/s eta 0:00:01[K     |████████▏                       | 20kB 1.7MB/s eta 0:00:01[K     |████████████▏                   | 30kB 2.2MB/s eta 0:00:01[K     |████████████████▎               | 40kB 2.5MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 2.2MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.2MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [15]:
import pandas as pd
import numpy as np
import category_encoders as ce
import bz2
import _pickle as cPickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn import metrics


In [16]:
# Load the raw training data from an absolute path under /content/drive.
# NOTE(review): this looks like a mounted Google Drive in Colab, and no visible
# cell mounts it — confirm the drive is mounted before running this cell.
df = pd.read_csv("/content/drive/My Drive/train.csv")

In [17]:
def extract_zipcode(X):
  """Return the leading five characters of X (shorter values come back whole)."""
  prefix_length = 5
  return X[0:prefix_length]

In [18]:
def enumerate_amenities(X):
  """Count the comma-separated pieces in the "amenities" entry of a row."""
  pieces = X["amenities"].split(",")
  return len(pieces)

In [19]:
def wrangle(X):
  """
  Wrangles and cleans dataframe, returning an all-float copy.

  Steps, in order:
    1. Replace the "amenities" text with its number of comma-separated entries.
    2. Keep only the modelling columns (the "log_price" target included).
    3. Collapse "property_type": values containing Apartment, House, Loft,
       Hostel, Condominium or Townhouse are left as they are; missing values,
       "Earth House" and everything else become "Other".
    4. Turn "zipcode" into a number built from the first five characters of
       its numeric form; unparseable values are imputed with the median.
    5. Impute missing "bathrooms", "bedrooms" and "beds" with their medians.
    6. Ordinal-encode the remaining categorical columns and cast to float.

  Parameters
  ----------
  X : pandas.DataFrame
    Raw data. Must provide "amenities" (strings), "property_type", "zipcode",
    "bathrooms", "bedrooms" and "beds". The input frame is not modified.

  Returns
  -------
  pandas.DataFrame
    Cleaned copy in which every column is float.

  Notes
  -----
  The medians and the ordinal encoder are computed from the frame passed in,
  and the fitted encoder is local to this call and not returned, so this
  function cannot re-apply the same encoding to new data.
  """

  # Work on a copy: leaves the caller's frame untouched and prevents
  # SettingWithCopy warnings.
  X = X.copy()

  # Replace the raw amenities text with a count of its entries.
  X["amenities"] = X.apply(enumerate_amenities, axis=1)

  # Keep only the columns used for modelling.
  X = X.filter(["log_price", "property_type", "amenities", "room_type",
                "accommodates", "bathrooms", "cancellation_policy",
                "cleaning_fee", "instant_bookable", "zipcode", "bedrooms",
                "beds"], axis=1)

  # Reduce property_type to a manageable number of options for ordinal
  # encoding. Both masks are computed before any value is overwritten.
  X["property_type"] = X["property_type"].fillna("Other")
  kept_types = ["Apartment", "House", "Loft", "Hostel", "Condominium",
                "Townhouse"]
  is_kept = X["property_type"].str.contains("|".join(kept_types))
  # "Earth House" contains "House", so it has to be sent to "Other" explicitly.
  is_earth_house = X["property_type"].str.contains("Earth House")
  X.loc[~is_kept | is_earth_house, "property_type"] = "Other"

  # Cleaning zipcode column: patch two known malformed values, coerce the
  # rest to numbers (unparseable -> NaN) and impute NaN with the median.
  X["zipcode"] = X["zipcode"].replace({"Near 91304": 91304, "1m": 10023})
  X["zipcode"] = pd.to_numeric(X["zipcode"], errors="coerce")
  X["zipcode"] = X["zipcode"].fillna(X["zipcode"].median())
  # Keep the first five characters of the float's text form. A shorter code
  # such as 2134.0 is cut to "2134.", which still parses as a float, so no
  # separate step to strip the "." is needed (the previous
  # Series.replace(".", "") only matched cells equal to "." and did nothing).
  X["zipcode"] = X["zipcode"].astype(str).apply(extract_zipcode).astype(float)

  # Replacing NaN values with median
  for column in ("bathrooms", "bedrooms", "beds"):
    X[column] = X[column].fillna(X[column].median())

  # Encoding categorical variables
  encoder = ce.OrdinalEncoder()
  X = encoder.fit_transform(X)

  # Cast every column to float so the whole frame shares one numeric dtype.
  X = X.astype(float)

  return X

In [20]:
# Applying wrangle function
# NOTE: df is rebound to the cleaned frame, so this cell is not re-runnable:
# a second run would hand wrangle the numeric "amenities" counts, and its
# enumerate_amenities step calls .split on that value. Reload df first.

df = wrangle(df)

In [21]:
# Splitting data set into training and test sets: 20% of the rows are held
# out for testing, and random_state is fixed so the split is repeatable.

train, test = train_test_split(df, test_size=0.2, random_state=7)

In [22]:
# Separate the target column from the feature columns in both partitions

target = "log_price"

X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

In [23]:
# Finding mean baseline for df
# NOTE(review): the mean is taken over every row of df, i.e. it includes the
# rows that were split off into the test set.

baseline = df[target].mean()
print(f'Baseline Mean: {baseline:.2f} log_price')

# Find baseline MSE: score a constant prediction of that mean against every
# non-missing target value. Dedicated names are used so this cell does not
# overwrite the y_train / y_pred variables that belong to the model cells.

y_all = df[target].dropna()
baseline_pred = [baseline] * len(y_all)
mse = mean_squared_error(y_all, baseline_pred)
print(f'Baseline Mean Squared Error: {mse:.2f} log_price')

Baseline Mean: 4.78 log_price
Baseline Mean Squared Error: 0.51 log_price


In [24]:
# Take the target from the training partition again so this cell does not
# depend on whatever an earlier cell last bound to y_train.
y_train = train[target]

pipeline = make_pipeline(RandomForestRegressor(n_estimators=300,
                      max_features=9,
                      min_samples_leaf=4,
                      max_depth=26, 
                      n_jobs=-1, 
                      random_state=7)  
)

# k-fold cross-validation on the training partition only.
k = 3
scores = cross_val_score(pipeline, X_train, y_train, cv=k, 
                         scoring='neg_mean_squared_error')
# The scorer returns negated mean squared error, so flip the sign; the values
# are MSE (not MAE), and the label now says so.
print(f'MSE for {k} folds:', -scores)

MAE for 3 folds: [0.17699664 0.18092373 0.1785249 ]


In [25]:
# Fit a forest on the whole training partition, using the same
# hyperparameters as the cross-validated pipeline.
rf_settings = {
    "n_estimators": 300,
    "max_features": 9,
    "min_samples_leaf": 4,
    "max_depth": 26,
    "n_jobs": -1,
    "random_state": 7,
}
model = RandomForestRegressor(**rf_settings)

model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=26, max_features=9, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=-1, oob_score=False,
                      random_state=7, verbose=0, warm_start=False)

In [26]:
# Predict log_price for the held-out test features.
y_pred = model.predict(X_test)

In [27]:
# Test-set error of the fitted model; since the target is log_price, the MSE
# is in squared log_price units.
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  

Mean Squared Error: 0.17059707337592653


In [28]:
def compressed_pickle(title, data):
  """Pickle `data` into a bz2-compressed file named `title` + '.pbz2'."""
  target_path = title + '.pbz2'
  with bz2.BZ2File(target_path, 'w') as handle:
    cPickle.dump(data, handle)

In [29]:
# Persist the fitted forest as rf_model.pbz2 (relative path, so it lands in
# the current working directory).
compressed_pickle('rf_model', model) 

In [30]:
def decompress_pickle(file):
  """
  Load and return the object stored in a bz2-compressed pickle file.

  Parameters
  ----------
  file : str or path-like
    Path of the compressed pickle, e.g. one written by compressed_pickle.

  Returns
  -------
  object
    The unpickled object.

  Notes
  -----
  Unpickling can execute arbitrary code, so only call this on files from a
  trusted source.
  """
  # The context manager closes the file even if unpickling raises; before,
  # the handle was rebound to the loaded object and never closed.
  with bz2.BZ2File(file, 'rb') as f:
    return cPickle.load(f)

In [31]:
# Read the saved model back from disk; `data` is the reloaded estimator.
data = decompress_pickle('rf_model.pbz2') 

In [32]:
# Placeholder input: a single row of 11 arbitrary values, not a real listing.
# 11 matches the 12 columns wrangle keeps minus the log_price target
# (assuming all 12 exist in the CSV — TODO confirm).
a = [[1,2,3,4,5,6,7,8,9,10,11]]

In [33]:
# Smoke test: the reloaded estimator can predict; the result is a log_price.
data.predict(a)

array([4.9845085])

In [36]:
def predict_price(X, estimator=None):
  """
  Uses model to predict price based on inputted features

  The estimator is trained on log_price, so its predictions are passed
  through np.exp to undo the log (assumes log_price is a natural log —
  TODO confirm against the data source).

  Parameters
  ----------
  X : array-like
    Feature rows in the layout the estimator was trained on.
  estimator : object with a predict(X) method, optional
    Estimator to use, e.g. a model reloaded from disk. Defaults to the
    notebook-level `model`, which keeps existing calls working unchanged.

  Returns
  -------
  numpy.ndarray
    Exponentiated predictions, one per input row.
  """
  if estimator is None:
    # Backward-compatible default: the forest fitted earlier in the notebook.
    estimator = model
  return np.exp(estimator.predict(X))

In [37]:
# Same placeholder row, reported on the price scale; note that predict_price
# uses the in-memory `model`, not the reloaded `data`.
predict_price(a)

array([146.13173402])