In [100]:
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
from scipy.special import logsumexp
from sklearn import metrics
from sklearn import svm
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline

# Turn off pandas' chained-assignment warning for every later cell
# (the option accepts None, 'warn' or 'raise').
pd.options.mode.chained_assignment = None  # default='warn'

# Preprocess
Read in the scraped calendar and insideAirbnb data

Input: 
* calendar_dir: calendar data
* inside_dir: insideAirbnb data

Output:
* df_listings: dataframe with the following variables
    * host listing count
    * host response rate
    * instant bookable
    * space shared with host
    * usual price
    * price variation
    * 1-month occupancy
    
# Model
Input:
* round1_merge.json
* round2_merge.json
* round3_merge.json

Output:


# Predict

Input:
* X:
    * "1-day orphan", "2-day orphan", "3-day orphan", "Within 1 week", "1-2 weeks in advance",
    * "orp_1, adv_1", "orp_1, adv_2", "orp_2, adv_1", "orp_2, adv_2", "orp_3, adv_1", "orp_3, adv_2",
    * "Percent off highest price" 
    * "Host listing count"
    * "Host response rate"
    * "Instant bookable"
    * "Space shared with host"
    * "Usual price" x
    * "Price variation" x
    * "1-month occupancy" x

In [98]:
class All(object):
    """Build one feature row per listing from calendar and listing data.

    The constructor reads ``listings_<city>.csv`` and ``<city>_cal.json`` from
    the current working directory; ``Preprocess`` combines them into a single
    merged DataFrame.
    """

    # Number of leading future calendar days averaged into ``occupancy_1m``.
    DAYS_IN_WINDOW = 30

    def __init__(self, calendar_dir=None, model_dir=None, inside_dir=None, check_in=None, check_out=None, city="nyc"):
        """Load the raw inputs for ``city``.

        Args:
            calendar_dir, model_dir, inside_dir: accepted for interface
                compatibility but not used; the file names are derived from
                ``city`` only.
            check_in, check_out: stored unchanged on the instance.
            city: suffix/prefix used to build the two input file names.
        """
        self.inside_raw = pd.read_csv("listings_%s.csv" % city)
        self.calendar_raw = pd.read_json("%s_cal.json" % city)
        self.check_in = check_in
        self.check_out = check_out
        # Reference "now"; only calendar days strictly after it are kept.
        self.today_parsed = dt.datetime.today()

    #################################################################################################################
    def Preprocess(self):
        """Parse the calendars, derive features and merge them with the listings.

        Side effects:
            Adds the columns ``shared``, ``instant`` and ``response_rate`` to
            ``self.inside_raw`` and stores the result in ``self.df_merged``.

        Returns:
            DataFrame (inner join on ``id``) with the columns ``calendars``,
            ``id``, ``price_median``, ``price_high``, ``price_std``,
            ``occupancy_1m`` followed by the selected listing columns.
        """
        today_parsed = self.today_parsed
        calendar_raw = self.calendar_raw
        inside_raw = self.inside_raw

        # CALENDAR: PARSE CALENDAR DATA INTO DATAFRAME
        keys = list(calendar_raw.keys())
        # Fill an object array element by element so every cell holds one
        # parsed DataFrame (a dict view or nested list is not reliably
        # accepted by the DataFrame constructor on every Python/pandas).
        calendars = np.empty(len(keys), dtype=object)
        for pos, key in enumerate(keys):
            calendars[pos] = self.parse_calendar(calendar_raw[key], today_parsed)
        df_calendar = pd.DataFrame(
            {'calendars': calendars,
             'id': np.array([int(key) for key in keys], dtype='int64')},
            columns=['calendars', 'id'])

        # CALENDAR: ADD COLUMNS FOR FUTURE CALCULATION
        ### PRICE RELATED
        price_stats = np.array(
            [self._price_stats(calendar["price_USD"].values) for calendar in calendars],
            dtype=float).reshape(-1, 3)
        df_calendar["price_median"] = price_stats[:, 0]
        df_calendar["price_high"] = price_stats[:, 2]
        # Standard deviation relative to the median price.
        df_calendar["price_std"] = price_stats[:, 1] / price_stats[:, 0]
        ### 1M OCCUPANCY
        # Mean of the ``availability`` flag over the first DAYS_IN_WINDOW kept
        # days; shorter calendars are averaged over the days they have.
        # NOTE(review): this is the share of days flagged available, although
        # the column is called "occupancy" - confirm the intended meaning.
        df_calendar["occupancy_1m"] = [
            self._window_mean(calendar["availability"].values[:self.DAYS_IN_WINDOW])
            for calendar in calendars]

        # INSIDEAIRBNB: PREPROCESS COLUMNS
        ### SHARED: anything that is not an entire home/apartment
        inside_raw["shared"] = inside_raw["room_type"].ne("Entire home/apt").values
        ### INSTANT: derived from ``instant_bookable`` (the previous code
        ### compared the room types to "t", which is always False)
        inside_raw["instant"] = inside_raw["instant_bookable"].eq("t").values
        ### RESPONSE RATE
        inside_raw["response_rate"] = self._parse_response_rate(
            inside_raw["host_response_rate"]).values

        # SELECT USEFUL COLUMNS FROM INSIDEAIRBNB DATA
        inside_col = [u'id', u'response_rate', u'host_is_superhost', u'host_total_listings_count',
                      u'number_of_reviews', u'instant', u'shared', u'beds']
        df_listing = inside_raw[inside_col]

        # MERGE CALENDAR WITH INSIDEAIRBNB DATA
        df_merged = pd.merge(df_calendar, df_listing, on='id', how='inner')

        self.df_merged = df_merged
        return df_merged

    # UTILITY FUNCTIONS FOR PREPROCESS
    @staticmethod
    def _price_stats(prices):
        """Return ``(median, std, max)`` of ``prices``; all NaN when empty."""
        if len(prices) == 0:
            return (np.nan, np.nan, np.nan)
        return (np.median(prices), np.std(prices), np.max(prices))

    @staticmethod
    def _window_mean(flags):
        """Return the mean of ``flags``, or NaN when there are none."""
        if len(flags) == 0:
            return np.nan
        return np.mean(flags)

    @staticmethod
    def _parse_response_rate(raw_rates):
        """Convert percentage strings such as ``"95%"`` to fractions.

        Missing, non-numeric and negative entries are replaced by the mean of
        the remaining values (they stay NaN when no value is usable).
        """
        as_text = raw_rates.astype(str).str.strip('%')
        rates = pd.to_numeric(as_text, errors='coerce') / 100.0
        rates = rates.where(rates >= 0)
        return rates.fillna(rates.mean())

    def parse_calendar(self, calendar, today_parsed):
        """Flatten one listing's calendar into a day-level DataFrame.

        Args:
            calendar: mapping with a ``'calendar_months'`` list. Each month has
                a ``'days'`` list (entries carry ``'date'`` as ``YYYY-MM-DD``,
                ``'price'['native_price']`` and ``'available'``) and
                ``'condition_ranges'[0]['conditions']['min_nights']``.
            today_parsed: datetime; only days strictly later are kept.

        Returns:
            DataFrame with the columns ``availability``, ``date``,
            ``min_nights`` and ``price_USD``; a date that occurs more than once
            is kept only the first time it is seen.
        """
        date = []
        price_USD = []
        availability = []
        min_nights = []
        seen_days = set()  # set membership keeps the duplicate check O(1)
        for month in calendar['calendar_months']:
            for day in month['days']:
                day_parsed = dt.datetime.strptime(day['date'], '%Y-%m-%d')
                if day_parsed > today_parsed and day_parsed not in seen_days:
                    date.append(day['date'])
                    price_USD.append(day['price']['native_price'])
                    availability.append(day['available'])
                    min_nights.append(month['condition_ranges'][0]['conditions'][u'min_nights'])
                # Past days are remembered as well, exactly as before.
                seen_days.add(day_parsed)
        return pd.DataFrame(
            {'date': date, 'price_USD': price_USD, 'availability': availability, 'min_nights': min_nights},
            columns=['availability', 'date', 'min_nights', 'price_USD'])
    #################################################################################################################
    
    #################################################################################################################
        
        

In [101]:
# Load the "nyc" inputs (listings_nyc.csv and nyc_cal.json), run the
# preprocessing step and preview the first rows of the merged frame.
test = All(city="nyc")
test_df_calendar = test.Preprocess()
test_df_calendar.head()



Unnamed: 0,calendars,id,price_median,price_high,price_std,occupancy_1m,response_rate,host_is_superhost,host_total_listings_count,number_of_reviews,instant,shared,beds
0,availability date min_nights price...,3309572,90,90,0.055493,1.0,1.0,f,1,15,False,True,1
1,availability date min_nights price...,4556118,36,36,0.0,1.0,0.84,f,10,1,False,True,1
2,availability date min_nights price...,9412617,189,189,0.026184,0.3,1.0,f,1,0,False,False,3
3,availability date min_nights price...,1646607,80,80,0.0,0.866667,1.0,t,2,10,False,True,2
4,availability date min_nights price...,671765,29,29,0.0,1.0,0.98,f,6,98,False,True,2


In [None]:
class LDA(object):
    """Topic model over a sparse document-word matrix, fitted with EM.

    Attributes (d documents, k topics, v words):
        theta: (k,) topic weights.
        beta: (k, v) per-topic word distributions; every row sums to 1.
        sparse_w: (d, v) sparse document-word matrix.
        sparse_gamma: (d, k) sparse per-document topic responsibilities,
            available after ``E_step`` has run.
        log_lik_list: objective value per EM round, set by ``run_ME``.
    """

    # Initializes with the number of topics
    def __init__(self, num_topics, num_docs, num_words, log_fac_words, log_fac_count, word):
        """Store the problem sizes and draw random starting parameters.

        Args:
            num_topics: number of topics k.
            num_docs: number of documents d.
            num_words: vocabulary size v.
            log_fac_words, log_fac_count: stored on the instance; not used by
                any method of this class.
            word: (d, v) sparse document-word matrix.
        """
        self.num_topics = num_topics
        self.num_docs = num_docs
        self.num_words = num_words
        self.log_fac_words = log_fac_words
        self.log_fac_count = log_fac_count

        # initialize theta ~ Dirichlet(alpha) with a randomly scaled alpha
        # theta = (1 * k)
        alphas = np.random.gamma(1, 1) * np.random.dirichlet([1.] * num_topics)
        self.theta = np.random.dirichlet(alphas)

        # initialize beta_k ~ Dirichlet(alpha_k), integer alphas in 1..10,
        # so each topic row sums to 1
        # beta = (k * v)
        beta = np.zeros((num_topics, num_words))
        alphas = np.random.randint(1, 11, size=(num_topics, num_words))
        for k in range(num_topics):
            beta[k] = np.random.dirichlet(alphas[k])
        self.beta = beta

        # save w matrix
        # word = (d * v)
        self.sparse_w = word

    def _log_joint(self):
        """Return the dense (d, k) matrix log(theta_k) + sum_v w_dv * log(beta_kv)."""
        return np.log(self.theta) + self.sparse_w.dot(np.transpose(np.log(self.beta)))

    # This should run the M step of the EM algorithm
    def M_step(self):
        """Re-estimate ``theta`` and ``beta`` from ``sparse_gamma``."""
        # Update theta: average responsibility per topic. Uses the instance's
        # document count rather than a notebook-level ``num_docs`` global.
        self.theta = np.sum(self.sparse_gamma.toarray(), axis=0) * 1.0 / self.num_docs

        # Update beta: responsibility-weighted word counts, normalised per
        # topic. The small constant keeps every entry positive so that
        # log(beta) stays finite in the E step.
        weighted_counts = self.sparse_gamma.transpose().dot(self.sparse_w).toarray() + 10**-10
        self.beta = weighted_counts * 1.0 / weighted_counts.sum(axis=1, keepdims=True)

    # This should run the E step of the EM algorithm
    # compute gamma(z_dk)
    def E_step(self):
        """Recompute the responsibilities and store them in ``sparse_gamma``."""
        log_joint = self._log_joint()
        # Normalise each document row in log space to avoid underflow.
        log_norm = logsumexp(log_joint, axis=1)
        gamma = np.exp(log_joint - log_norm.reshape((-1, 1)))
        self.sparse_gamma = sparse.csr_matrix(gamma)

    # This function repeats E and M step
    def run_ME(self, tol=1, max_iter=None):
        """Alternate E and M steps until the objective stops changing.

        At least two rounds are run (fewer only if ``max_iter`` is smaller);
        iteration then continues while consecutive objective values differ by
        more than ``tol``.

        Args:
            tol: convergence threshold on the absolute change (default 1).
            max_iter: optional cap on the number of rounds; ``None`` means no
                cap.
        """
        log_lik_list = []
        while True:
            self.E_step()
            self.M_step()
            log_lik_list.append(self.compute_log_likelihood())
            # ``not (... > tol)`` also stops on NaN, like the original loop.
            if len(log_lik_list) >= 2 and not abs(log_lik_list[-2] - log_lik_list[-1]) > tol:
                break
            if max_iter is not None and len(log_lik_list) >= max_iter:
                break
        self.log_lik_list = log_lik_list

    def compute_log_likelihood(self):
        """Return sum over d, k of gamma_dk * (log theta_k + sum_v w_dv log beta_kv).

        Uses log(theta), consistent with ``E_step`` (the previous version
        added the raw ``theta`` to the log word term).
        """
        return self.sparse_gamma.multiply(self._log_joint()).sum()

    def plot_objective_function(self):
        """Plot the objective values recorded by ``run_ME`` against the round."""
        plt.figure(figsize=(12,8))
        plt.plot(self.log_lik_list)
        plt.xlabel("iteration")
        plt.ylabel("log likelihood")

    # This should print the topics that you find
    # by listing the most likely words
    def print_topics(self, num_representing_words=10, vocabulary=None):
        """Print the highest-``beta`` words of every topic.

        Within a topic the words are printed from the least to the most likely
        of the selected ones (the original order).

        Args:
            num_representing_words: how many words to print per topic; capped
                at the vocabulary size.
            vocabulary: sequence of strings indexed by word id. Defaults to
                the notebook-level ``word_dict_lines``.
        """
        if vocabulary is None:
            vocabulary = word_dict_lines
        for k in range(self.num_topics):
            ordered_index = np.argsort(self.beta[k])[::-1]
            top_n = min(num_representing_words, len(ordered_index))
            # Single-argument print(...) prints the same on Python 2 and 3.
            print("=============== TOPIC %i ===============" % (k + 1))
            for i in range(top_n):
                print(vocabulary[ordered_index[top_n - i - 1]].rstrip())
            print("")