In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  
import datetime as dt
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Imputer
from sklearn import metrics
from sklearn import svm
from sklearn import datasets
from sklearn.externals import joblib

pd.options.mode.chained_assignment = None  # default='warn'

# Preprocess
Read in the scraped calendar and insideAirbnb data

Input: 
* calendar_dir: calendar data
* inside_dir: insideAirbnb data

Output:
* df_listings: dataframe with the following variables
    * host listing count
    * host response rate
    * instant bookable
    * space shared with host
    * usual price
    * price variation
    * 1-month occupancy
    
# Model
Input:
* round1_merge.json
* round2_merge.json
* round3_merge.json

Output:


# Predict

Input:
* X:
    * "1-day orphan", "2-day orphan", "3-day_orphan", "Within 1 week", "1-2 weeks in advance", 
    * "orp_1, adv_1", "orp_1, adv_2", "orp_2, adv_1", "orp_2, adv_2", "orp_3, adv_1", "orp_3, adv_2",
    * "Percent off highest price" 
    * "Host listing count"
    * "Host response rate"
    * "Instant bookable"
    * "Space shared with host"
    * "Usual price" x
    * "Price variation" x
    * "1-month occupancy" x

In [117]:
class All(object):
    
    def __init__(self, calendar_dir=None, model_dir=None, inside_dir=None, check_in=None, check_out=None, city="nyc"):
        self.inside_raw = pd.read_csv("listings_%s.csv"%city)
        self.calendar_raw = pd.read_json("%s_cal.json"%city)
        self.check_in = check_in
        self.check_out = check_out
        # TODAY's DATE
        self.today_parsed = dt.datetime.today()  
    
    #################################################################################################################
    def Preprocess(self):
        # INITIALIZE GLOBAL VARIABLES
        today_parsed = self.today_parsed
        calendar_raw = self.calendar_raw
        inside_raw = self.inside_raw
       
    
        # CALENDAR: PARSE CALENDAR DATA INTO DATAFRAME
        parsed_calendars = {}
        for i in calendar_raw.keys():
            parsed_calendars[i] = self.parse_calendar(calendar_raw[i], today_parsed)
        listing_id = [int(key) for key in parsed_calendars.keys()]
        df_calendar = pd.DataFrame({'id': listing_id, 'calendars': parsed_calendars.values()})        
        
        # CALENDAR: ADD COLUMNS FOR FUTURE CALCULATION
        ### PRICE RELATED 
        calendar_price =  np.array([(np.median(calendar["price_USD"].values), 
                                     np.std(calendar["price_USD"].values), 
                                     np.max(calendar["price_USD"].values)) 
                                    for calendar in df_calendar["calendars"].values])
        calendar_median_price = calendar_price[:,0]
        calendar_price_std = calendar_price[:,1]
        calendar_price_high = calendar_price[:,2]
        df_calendar["price_median"] = calendar_median_price
        df_calendar["price_high"] = calendar_price_high
        df_calendar["price_std"] = calendar_price_std/calendar_median_price
        ### 1M OCCUPANCY
        key_1m = range(30)
        calendar_occupancy = [np.mean([calendar["availability"][key] for key in key_1m]) 
                              for calendar in df_calendar["calendars"].values]
        df_calendar["occupancy_1m"]=calendar_occupancy
    
    
        # INSIDEAIRBNB: PREPROCESS COLUMNS
        ### SHARED
        shared = inside_raw["room_type"].values
        inside_raw["shared"] = [x!="Entire home/apt" for x in shared]
        ### INSTANT
        instant = inside_raw["instant_bookable"].values
        inside_raw["instant"] = [x=="t" for x in shared]
        ### RESPONSE RATE
        response_imputer = Imputer(copy=True, missing_values='NaN', strategy='mean', axis=1)
        response_num = np.array([float(response_rate.strip('%'))/100 
                                 for response_rate in inside_raw["host_response_rate"].fillna(value="-100%").values])
        response_num = np.array([np.nan if x < 0 else x for x in response_num])
        response_imputed = response_imputer.fit_transform(response_num)[0]
        inside_raw["response_rate"] = response_imputed
    
        # SELECT USEFUL COLUMNS FROM INSIDEAIRBNB DATA
        inside_col = [u'id', u'response_rate', u'host_is_superhost', u'host_total_listings_count', 
                      u'number_of_reviews', u'instant', u'shared', u'beds']
        df_listing = inside_raw[inside_col]

        # MERGE CALENDAR WITH INSIDEAIRBNB DATA
        df_merged = pd.merge(df_calendar, df_listing, on='id', how='inner')
                
        self.df_merged = df_merged
        return df_merged
    
    # UTILITY FUNCTIONS FOR PREPROCESS
    def parse_calendar(self, calendar, today_parsed):
        date = []
        price_USD = []
        availability = []
        min_nights = []
        day_list = []
        for month in calendar['calendar_months']:
            for day in month['days']:
                day_parsed = dt.datetime.strptime(day['date'], '%Y-%m-%d')
                if (day_parsed > today_parsed) & (day_parsed not in day_list):
                    date.append(day['date'])
                    price_USD.append(day['price']['native_price'])
                    availability.append(day['available'])
                    min_nights.append(month['condition_ranges'][0]['conditions'][u'min_nights'])
                day_list.append(day_parsed)
        return pd.DataFrame({'date':date, 'price_USD': price_USD, 'availability':availability, 'min_nights': min_nights}) 
    #################################################################################################################
    
    #################################################################################################################
    def Model(self):
        # LOAD PREPROCESSED TRAINING DATA SILOS AND COMBINE THEM
        round3 = pd.read_json("round3_merged.json")
        round2 = pd.read_json("round2_merged.json")
        round1 = pd.read_json("round1_merged.json")
        round3_clean = round3[["id", "host_id", "host_response_rate", "host_acceptance_rate", 
                               "host_total_listings_count", "instant_bookable", 
                               "room_type", "bucket_name", "discount_asked", "nightly_price", 
                               "decision", "price_agreed", "discount_agreed", "percent_agreed",
                               "calendars", "price_requested"]]
        round2_clean = round2[["id", "host_id", "host_response_rate", "host_acceptance_rate", 
                               "host_total_listings_count", "instant_bookable", 
                               "room_type", "bucket_name", "discount_asked", "nightly_price", 
                               "decision", "price_agreed", "discount_agreed", "percent_agreed",
                               "calendars", "price_requested"]]
        round1_clean = round1[["id", "host_id", "host_response_rate", "host_acceptance_rate", 
                               "host_total_listings_count", "instant_bookable", 
                               "room_type", "bucket_name", "discount_asked", "nightly_price", 
                               "availability", "price", "discount",
                               "calendars", "price_requested"]]       
        round1_clean = round1_clean.rename(columns = {"availability":"decision", "price":"price_agreed", "discount":"percent_agreed"})
        round1_clean = round1_clean.rename(columns = {"availability":"decision", "price":"price_agreed", "discount":"percent_agreed"})
        round1_clean["percent_agreed"] = [np.nan if val==None else float(val.strip("%"))/100. for val in  round1_clean.percent_agreed.values]
        combined = pd.concat([round1_clean, round2_clean, round3_clean], keys=["round1", "round2", "round3"], ignore_index=False)
        combined["source"] = combined.index.labels[0]+1
        combined = combined.reset_index(drop=True)

        # CLEAN AND CONSTRUCT X VARIABLES (ESTIMATORS)
        ### PRICE RELATED
        calendar_price =  np.array([(np.median(calendar["price_USD"].values()),
                                     np.std(calendar["price_USD"].values()),
                                     np.max(calendar["price_USD"].values())) 
                                    for calendar in combined["calendars"].values])
        calendar_median_price = calendar_price[:,0]
        calendar_price_std = calendar_price[:,1]
        calendar_price_high = calendar_price[:,2]
        combined["price_median"]=calendar_median_price
        combined["price_std"]=calendar_price_std/calendar_median_price
        combined["orig_percent_off"]=1-combined["nightly_price"]/calendar_price_high
        ### OCCUPANCY
        key_1m = map(str, range(30))
        calendar_occupancy = [np.mean([calendar["availability"][key] for key in key_1m]) for calendar in combined["calendars"].values]
        combined["occupancy_1m"]=calendar_occupancy
        ### SHARED WITH HOST
        shared = combined["room_type"].values
        combined["shared"] = [x!="Entire home/apt" for x in shared]
        ### INSTANT BOOKABLE
        instant = combined["instant_bookable"].values
        combined["instant"] = [x=="t" for x in shared]
        ### RESPONSE RATE
        response_imputer = Imputer(copy=True, missing_values='NaN', strategy='mean', axis=1)
        response_num = np.array([float(response_rate.strip('%'))/100 for response_rate in combined["host_response_rate"].fillna(value="-100%").values])
        response_num = np.array([np.nan if x < 0 else x for x in response_num])
        response_imputed = response_imputer.fit_transform(response_num)[0]
        combined["response_rate"] = response_imputed
        ### BUCKETS (LONG...)
        opening_attr = combined["bucket_name"].values
        N = len(opening_attr)
        orp_1 = np.zeros(N)
        orp_2 = np.zeros(N)
        orp_3 = np.zeros(N)
        adv_1 = np.zeros(N)
        adv_2 = np.zeros(N)
        for (i,x) in enumerate(opening_attr):
            if x == "days1_weeks1":
                orp_1[i] = 1
                orp_2[i] = 0
                orp_3[i] = 0
                adv_1[i] = 1
                adv_2[i] = 0
            elif x == "days1_weeks2":
                orp_1[i] = 1
                orp_2[i] = 0
                orp_3[i] = 0
                adv_1[i] = 0
                adv_2[i] = 1
            elif x == "days1_weeksM":
                orp_1[i] = 1
                orp_2[i] = 0
                orp_3[i] = 0
                adv_1[i] = 0
                adv_2[i] = 0
            elif x == "days2_weeks1":
                orp_1[i] = 0
                orp_2[i] = 1
                orp_3[i] = 0
                adv_1[i] = 1
                adv_2[i] = 0
            elif x == "days2_weeks2":
                orp_1[i] = 0
                orp_2[i] = 1
                orp_3[i] = 0
                adv_1[i] = 0
                adv_2[i] = 1
            elif x == "days2_weeksM":
                orp_1[i] = 0
                orp_2[i] = 1
                orp_3[i] = 0
                adv_1[i] = 0
                adv_2[i] = 0
            elif x == "days3_weeks1":
                orp_1[i] = 0
                orp_2[i] = 0
                orp_3[i] = 1
                adv_1[i] = 1
                adv_2[i] = 0
            elif x == "days3_weeks2":
                orp_1[i] = 0
                orp_2[i] = 0
                orp_3[i] = 1
                adv_1[i] = 0
                adv_2[i] = 1
            elif x == "days3_weeksM":
                orp_1[i] = 0
                orp_2[i] = 0
                orp_3[i] = 1
                adv_1[i] = 0
                adv_2[i] = 0
            elif x == "daysM_weeks1":
                orp_1[i] = 0
                orp_2[i] = 0
                orp_3[i] = 0
                adv_1[i] = 1
                adv_2[i] = 0
            elif x == "daysM_weeks2":
                orp_1[i] = 0
                orp_2[i] = 0
                orp_3[i] = 0
                adv_1[i] = 0
                adv_2[i] = 1
            elif x == "daysM_weeksM":
                orp_1[i] = 0
                orp_2[i] = 0
                orp_3[i] = 0
                adv_1[i] = 0
                adv_2[i] = 0
        combined["orp_1"]=orp_1
        combined["orp_2"]=orp_2
        combined["orp_3"]=orp_3
        combined["adv_1"]=adv_1
        combined["adv_2"]=adv_2
        combined["orp_1, adv_1"] = combined["orp_1"]*combined["adv_1"]
        combined["orp_1, adv_2"] = combined["orp_1"]*combined["adv_2"]
        combined["orp_2, adv_1"] = combined["orp_2"]*combined["adv_1"]
        combined["orp_2, adv_2"] = combined["orp_2"]*combined["adv_2"]
        combined["orp_3, adv_1"] = combined["orp_3"]*combined["adv_1"]
        combined["orp_3, adv_2"] = combined["orp_3"]*combined["adv_2"]
        
        # CLEAN AND CONSTRUCT Y VARIABLES (TARGETS)
        ### DISCOUNT
        orig_price = combined["nightly_price"]
        price_agreed_percent = (np.nan_to_num(combined["price_agreed"])/orig_price)
        discount_agreed1 = [0 if (d>=1 or d==0) else 1-d for d in price_agreed_percent]
        discount_agreed2 = np.nan_to_num(combined["discount_agreed"]/orig_price)
        discount_agreed3 = np.nan_to_num(combined["percent_agreed"])
        discount_obtained = np.max([discount_agreed1, discount_agreed2, discount_agreed3], axis=0)
        combined["discount_obtained"] = discount_obtained
        combined_clean = combined[["host_total_listings_count", "response_rate", "instant", "shared", 
                                       "price_median", "price_std", "occupancy_1m",
                                       "orp_1", "orp_2", "orp_3", "adv_1", "adv_2", 
                                       "orp_1, adv_1", "orp_1, adv_2", "orp_2, adv_1", "orp_2, adv_2", "orp_3, adv_1", "orp_3, adv_2",
                                       "orig_percent_off", "discount_asked", "decision", "discount_obtained", "source"]]

        # CREATE TRAINING SAMPLES
        mask_available = combined_clean["decision"] != -1 
        mask_agreed = combined_clean["discount_obtained"] != 0
        mask_no_source1 = combined_clean["source"] != 1
        combined_available = combined_clean[mask_available]
        combined_agreed = combined_clean[mask_agreed * mask_no_source1]
        cal_param_1 = ["orp_1", "orp_2", "orp_3", "adv_1", "adv_2"]
        cal_param_2 = ["orp_1, adv_1", "orp_1, adv_2", "orp_2, adv_1", "orp_2, adv_2", "orp_3, adv_1", "orp_3, adv_2"]
        host_param = ["orig_percent_off", "host_total_listings_count", "response_rate", "instant", "shared",  "price_median", "price_std", "occupancy_1m"]
        param_rf = cal_param_1 + host_param
        param_lr = cal_param_1 + host_param + cal_param_2
        X_responded_rf = combined_available[param_rf].values
        X_responded_lr = combined_available[param_lr].values
        Y_responded = combined_available["decision"].values
        X_all_rf = combined_clean[param_rf].values
        X_all_lr = combined_clean[param_lr].values
        Y_all = combined_clean["decision"].values 
        Y_all = np.max(zip(Y_all,np.zeros(len(Y_all))), axis=1)
        X_discount_rf = combined_agreed[param_rf].values
        X_discount_lr = combined_agreed[param_lr].values
        Y_discount = combined_agreed["discount_obtained"].values
        
        # CREATE PLOT LABELS
        label_rf = ["1-day orphan", "2-day orphan", "3-day_orphan", "Within 1 week", "1-2 weeks in advance",
                    "Percent off highest price", "Host listing count", "Host response rate", 
                    "Instant bookable", "Space shared with host", "Usual price", "Price variation", "1-month occupancy"]
        label_lr = ["1-day orphan", "2-day orphan", "3-day_orphan", "Within 1 week", "1-2 weeks in advance", 
                    "orp_1, adv_1", "orp_1, adv_2", "orp_2, adv_1", "orp_2, adv_2", "orp_3, adv_1", "orp_3, adv_2",
                    "Percent off highest price", "Host listing count", "Host response rate", 
                    "Instant bookable", "Space shared with host", "Usual price", "Price variation", "1-month occupancy"]
        
        # CLASSIFY NEGOTIABLE HOSTS WITH RANDOM FOREST
        clf_all_rf = RandomForestClassifier(n_estimators=1000, 
                                        max_features=int(np.sqrt(X_all_rf.shape[1])), 
                                        max_depth=None,
                                        min_samples_split=1)
        clf_all_rf.fit(X_all_rf, Y_all)
        
        # CLASSIFY ANTICIPATED DISCOUNT WITH RANDOM FOREST
        clf_discount_rf = RandomForestRegressor(n_estimators=1000, 
                                        max_features=int(X_discount_rf.shape[1]), 
                                        max_depth=None,
                                        min_samples_split=1)
        clf_discount_rf.fit(X_discount_rf, Y_discount)
        
        # SAVING THE MODEL TO EXTERNAL FILE
        with open('clf_all_rf.pkl', 'wb') as f:
            pickle.dump(clf_all_rf, f)
        with open('clf_discount_rf.pkl', 'wb') as f:
            pickle.dump(clf_discount_rf, f)

        return clf_all_rf, clf_discount_rf
    #################################################################################################################
    
    
    #################################################################################################################
    
    #################################################################################################################

In [118]:
test = All(city="nyc")
test.Model()



(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features=3, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=1,
             min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features=13, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=1, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))

In [101]:
test = All(city="nyc")
test_df_calendar = test.Preprocess()
test_df_calendar.head()



Unnamed: 0,calendars,id,price_median,price_high,price_std,occupancy_1m,response_rate,host_is_superhost,host_total_listings_count,number_of_reviews,instant,shared,beds
0,availability date min_nights price...,3309572,90,90,0.055493,1.0,1.0,f,1,15,False,True,1
1,availability date min_nights price...,4556118,36,36,0.0,1.0,0.84,f,10,1,False,True,1
2,availability date min_nights price...,9412617,189,189,0.026184,0.3,1.0,f,1,0,False,False,3
3,availability date min_nights price...,1646607,80,80,0.0,0.866667,1.0,t,2,10,False,True,2
4,availability date min_nights price...,671765,29,29,0.0,1.0,0.98,f,6,98,False,True,2
