In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  
import datetime as dt
from collections import Counter

## Utility Functions


### 1. functions for plotting distributions:


In [None]:
def plot_distribution( current_population, label_input, color_input= 'b'):
    """Plot the distributions of median price and price standard deviation.

    Draws one figure with two side-by-side histograms: the ``price_median``
    column (20 bins) on the left and the ``price_std`` column (25 bins) on the
    right. Afterwards prints the number of rows and the fraction of rows whose
    ``price_std`` is exactly 0.

    Parameters
    ----------
    current_population : pandas.DataFrame
        Must provide the ``price_median`` and ``price_std`` columns.
    label_input : str
        Name of the population; appended to both titles and printed messages.
    color_input : matplotlib color, optional
        Bar color for both histograms (default ``'b'``).

    Returns
    -------
    None
    """
    total_listings = len(current_population)
    if total_listings == 0:
        # An empty selection has nothing to draw, and the fraction printed
        # below would divide by zero; report the size and stop.
        print("Total element in "+label_input+": "+ str(total_listings))
        return

    _, (ax_median, ax_std) = plt.subplots(1, 2, figsize=[15,4])

    ax_median.hist(current_population['price_median'].values, bins=20, color=color_input)
    ax_median.set_xlabel('Price median')
    ax_median.set_ylabel('Number of Listings')
    ax_median.set_title('Distribution of Median Price for ' + label_input )
    ax_median.grid(False)

    ax_std.hist(current_population['price_std'].values, bins=25, color=color_input)
    ax_std.set_xlabel('Std deviation')
    ax_std.set_ylabel('Number of Listings')
    ax_std.set_title('Distribution of Standard Deviation of Price for ' + label_input)
    ax_std.grid(False)

    # No legend is drawn: neither histogram carries a label, so a legend call
    # would only emit a "no labelled artists" warning. The titles already
    # name the population.

    unchanged_listings = len(current_population[current_population['price_std']==0])
    print("Total element in "+label_input+": "+ str(total_listings))
    # The value printed here is a fraction in [0, 1], not a percentage.
    print("Percent Listings with No Price Change at All for "+ label_input+": " + " " + str(unchanged_listings*1./total_listings))
    return

#Function for plotting population of continuous intervals, divided by ranges

def plot_interval(current_population,field, ranges, label_input ):
    """Plot the price distributions of each consecutive interval of ``field``.

    ``ranges`` is read as a sequence of bin edges. For every pair of adjacent
    edges ``(ranges[i], ranges[i+1])`` the rows whose ``field`` value lies in
    the half-open interval ``[ranges[i], ranges[i+1])`` are selected and handed
    to ``plot_distribution``. The half-open convention keeps adjacent bins
    disjoint and matches the ``range(lo, hi)`` binning used by
    ``acceptance_rate_continuous``.

    Parameters
    ----------
    current_population : pandas.DataFrame
        Data to split; must contain ``field`` and the columns that
        ``plot_distribution`` reads.
    field : str
        Column whose values are binned.
    ranges : sequence
        Bin edges; ``len(ranges) - 1`` bins are plotted.
    label_input : str
        Prefix of the label passed to ``plot_distribution`` for each bin.

    Returns
    -------
    None
    """
    field_values = current_population[field]
    for i in range(len(ranges)-1):
        lower, upper = ranges[i], ranges[i+1]
        # Select by interval membership. Testing equality against the two
        # edge values (isin([lower, upper])) would drop everything in between.
        inside_interval = (field_values >= lower) & (field_values < upper)
        current_range= current_population[inside_interval]
        current_label= label_input + " range: [" +str(lower)+","+str(upper)+"]"
        plot_distribution(current_range,current_label )
    return


### 2. Acceptance Rate

In [2]:
def number_of(dataframe, field, field_value):
    """Count the rows of ``dataframe`` whose ``field`` column equals ``field_value``.

    Returns the count as a plain ``int``.
    """
    matches = dataframe[field] == field_value
    return int(matches.sum())


#n accepted, n denied etc. depending on the kind of field

def acceptance_rate(field, types= None):
    """Count outcomes per category of ``field`` and print the acceptance rates.

    Reads the module-level frames ``round3_clean``, ``round3_accepted``,
    ``round3_denied``, ``round3_all_negative`` and ``round3_not_answer``; they
    must exist before this function is called.

    For every category two ratios are printed:

    * "over all replied":   accepted / (denied + accepted)
    * "over all contacted": accepted / (all_negative + accepted)

    Each ratio is attempted independently, so a zero denominator in one prints
    "division by 0" without suppressing the other.

    Parameters
    ----------
    field : str
        Column holding the categorical value.
    types : iterable, optional
        Categories to evaluate. When omitted, the unique values of
        ``round3_clean[field]`` are used and printed.

    Returns
    -------
    dict
        Keys ``"types"``, ``"accepted"``, ``"denied"``, ``"not_answer"``,
        ``"all_negative"`` (each a dict mapping category -> count) and
        ``"category"`` (the field name).
    """
    # `is None` rather than `== None`: comparing a numpy array to None is
    # elementwise and cannot be used as an `if` condition.
    if types is None:
        types= round3_clean[field].unique()
        print(types)
    round3_accepted_f = {}
    round3_denied_f={}
    round3_all_negative_f={}
    round3_not_answer_f={}

    for el in types:
        round3_accepted_f[el] = number_of (round3_accepted, field, el)
        round3_denied_f[el] = number_of (round3_denied, field, el)
        round3_all_negative_f[el] = number_of (round3_all_negative, field, el)
        round3_not_answer_f[el] = number_of (round3_not_answer, field, el)

    for el in types:
        accepted = round3_accepted_f[el]
        prefix = "Category: "+ field+ " "+ " type: "+ str(el)
        rates = (
            (" percentage over all replied: ", round3_denied_f[el]),
            (" percentage over all contacted: ", round3_all_negative_f[el]),
        )
        for caption, negatives in rates:
            try:
                print(prefix + caption + str(float(accepted)/(negatives + accepted)))
            except ZeroDivisionError:
                print("division by 0")
    result_dict= {"types": types, "accepted": round3_accepted_f, "denied": round3_denied_f,"not_answer":round3_not_answer_f,
                 "all_negative": round3_all_negative_f, "category": field}
    return result_dict

            

def acceptance_rate_continuous(field, ranges):
    """Count outcomes per interval of a numeric ``field`` and print acceptance rates.

    ``ranges`` is read as a sequence of bin edges. Bin ``i`` covers the
    half-open interval ``[ranges[i], ranges[i+1])``. Membership is tested by
    comparison, so non-integer values and non-integer edges are binned too;
    for integer data this selects the same rows as
    ``isin(range(ranges[i], ranges[i+1]))``. ``field`` is expected to hold
    numeric values.

    Reads the module-level frames ``round3_accepted``, ``round3_denied``,
    ``round3_all_negative`` and ``round3_not_answer``; they must exist before
    this function is called.

    For every bin two ratios are printed, each attempted independently:

    * "over all replied":   accepted / (denied + accepted)
    * "over all contacted": accepted / (all_negative + accepted)

    Parameters
    ----------
    field : str
        Numeric column to bin.
    ranges : sequence of numbers
        Bin edges; ``len(ranges) - 1`` bins are evaluated.

    Returns
    -------
    dict
        Keys ``"ranges"``, ``"accepted"``, ``"denied"``, ``"not_answer"``,
        ``"all_negative"`` (each a dict mapping bin index -> count) and
        ``"category"`` (the field name).
    """
    def count_in_bin(dataframe, lower, upper):
        # Number of rows of `dataframe` whose `field` lies in [lower, upper).
        values = dataframe[field]
        return int(((values >= lower) & (values < upper)).sum())

    round3_accepted_f = {}
    round3_denied_f={}
    round3_all_negative_f={}
    round3_not_answer_f={}

    for el in range(len(ranges)-1):
        lower, upper = ranges[el], ranges[el+1]
        round3_accepted_f[el] = count_in_bin(round3_accepted, lower, upper)
        round3_denied_f[el] = count_in_bin(round3_denied, lower, upper)
        round3_all_negative_f[el] = count_in_bin(round3_all_negative, lower, upper)
        round3_not_answer_f[el] = count_in_bin(round3_not_answer, lower, upper)

    for el in range(len(ranges)-1):
        current_label= field + " range: [" +str(ranges[el])+","+str(ranges[el+1])+"]"
        accepted = round3_accepted_f[el]
        prefix = "Category: "+ field+ " "+ " type: "+ current_label
        rates = (
            (" percentage over all replied: ", round3_denied_f[el]),
            (" percentage over all contacted: ", round3_all_negative_f[el]),
        )
        for caption, negatives in rates:
            try:
                print(prefix + caption + str(float(accepted)/(negatives + accepted)))
            except ZeroDivisionError:
                print("division by 0")

    result_dict= {"ranges": ranges, "accepted": round3_accepted_f, "denied": round3_denied_f,"not_answer":round3_not_answer_f,
                 "all_negative": round3_all_negative_f, "category": field}
    return result_dict


### 3. Percentage

In [3]:
import math

#standard error: sqrt(p*(1-p)/n) with p = positive/all 

def plot_percentage (percentage_dictionary, type_input, color= "g", error_size=float(0.5) ):
    """Print and plot acceptance rates from an ``acceptance_rate*`` result dict.

    For every category (or bin) two proportions are computed:

    * among replied hosts:   accepted / (accepted + denied)
    * among contacted hosts: accepted / (accepted + all_negative)

    together with their binomial standard errors ``sqrt(p * (1 - p) / n)``,
    where ``n`` is the corresponding denominator. The values are printed and
    then drawn as two side-by-side bar charts in one figure. Tick labels show
    the category followed, on a second line, by the sample size ``n`` of that
    panel.

    A category whose replied or contacted denominator is 0 prints
    "division by 0" and is left out of both panels, so all per-category lists
    stay the same length.

    Parameters
    ----------
    percentage_dictionary : dict
        Needs ``'accepted'``, ``'denied'``, ``'all_negative'`` (dicts of
        counts) and ``'category'``; plus ``'types'`` for categorical input or
        ``'ranges'`` (bin edges, counts keyed by bin index) for continuous
        input.
    type_input : str
        ``"categorical"`` or ``"continuous"``. Any other value prints
        "invalid type" and returns without plotting.
    color : matplotlib color, optional
        Bar color (default ``"g"``).
    error_size : float, optional
        Divisor applied to the standard error to get the error-bar
        half-length; the default 0.5 draws bars of +/- 2 standard errors.

    Returns
    -------
    None
    """
    # Build (count key, tick label) pairs once so both input kinds share the
    # same computation below.
    if type_input == "categorical":
        keys_and_labels = [(el, el) for el in percentage_dictionary["types"]]
    elif type_input == "continuous":
        edges = percentage_dictionary['ranges']
        keys_and_labels = [
            (el, "[" +str(edges[el])+","+str(edges[el+1])+"]")
            for el in range(len(edges)-1)
        ]
    else:
        print("invalid type")
        return

    p_replied= []
    p_all= []
    p_size= []
    p_size_answered= []
    x_axis=[]
    p_std= []
    p_std_a= []

    for key, label in keys_and_labels:
        accepted = percentage_dictionary['accepted'][key]
        n_replied = accepted + percentage_dictionary['denied'][key]
        n_contacted = accepted + percentage_dictionary['all_negative'][key]
        # Validate both denominators before appending anything, otherwise a
        # failure on the second ratio would leave the lists misaligned.
        if n_replied == 0 or n_contacted == 0:
            print("division by 0")
            continue

        repl = float(accepted)/n_replied
        al = float(accepted)/n_contacted

        p_replied.append(repl)
        p_all.append(al)
        p_size_answered.append(str(n_replied))
        p_size.append(str(n_contacted))
        # standard error of a proportion: sqrt(p*(1-p)/n)
        p_std.append(math.sqrt(repl*(1-repl)/ n_replied))
        p_std_a.append(math.sqrt(al*(1-al)/ n_contacted))
        x_axis.append(label)

    # p_std belongs to the replied-host proportions and p_std_a to the
    # all-contacted proportions; print each under its own heading.
    print("AMONG REPLIED HOSTS")
    print("percentage:")
    print(p_replied)
    print("size")
    print(p_size_answered)
    print("std error:")
    print(p_std)
    print("\n")
    print("AMONG ALL CONTACTED HOSTS")

    print("percentage:")
    print(p_all)
    print("size")
    print(p_size)
    print("std error:")
    print(p_std_a)
    print("average over all replied "+ str(np.mean(p_replied)))
    print("average over all contacted "+ str(np.mean(p_all)))

    pos = np.arange(len(x_axis))
    width = 0.7     # gives histogram aspect to the bar diagram

    # One figure, two explicit axes, a single show() at the end.
    _, (ax_replied, ax_all) = plt.subplots(1, 2)
    panels = (
        (ax_replied, p_replied, p_std, p_size_answered, 'Acceptance among the hosts who replied'),
        (ax_all, p_all, p_std_a, p_size, 'Acceptance among all the hosts'),
    )
    for ax, rates, std_errors, sizes, title in panels:
        # align='center' with ticks at `pos` keeps labels under their bars
        # regardless of the matplotlib version's default bar alignment.
        ax.bar(pos, rates, width, color=color,
               yerr=[i/error_size for i in std_errors], align='center')
        ax.set_xticks(pos)
        # Labels are rebuilt per panel from the untouched base labels so each
        # panel shows only its own sample size.
        ax.set_xticklabels([str(label)+"\n"+str(size) for label, size in zip(x_axis, sizes)])
        ax.set_xlabel(percentage_dictionary['category'])
        # NOTE: the plotted values are fractions in [0, 1], not percentages.
        ax.set_ylabel('Acceptance rate (%)')
        ax.set_title(title)

    plt.show()

## Loading Data

In [5]:
# Column names of interest for the merged experiment rounds.
# NOTE(review): the loading cell below selects columns with a name spelled
# `fields`, not `fields_` — confirm the two are meant to be the same list.
fields_ = [
    "id",
    "host_id",
    "host_response_rate",
    "host_acceptance_rate",
    "host_total_listings_count",
    "monthly_price",
    "instant_bookable",
    "room_type",
    "bucket_name",
    "discount_asked",
    "nightly_price",
    "decision",
    "price_agreed",
    "discount_agreed",
    "percent_agreed",
    "calendars",
    "price_requested",
    "number_of_reviews",
    "host_is_superhost",
    "beds",
]

In [19]:
# Load the merged results of rounds 2-5, keep only the columns listed in
# `fields_`, and stack the rounds into one frame.
# The column list is defined as `fields_` in the cell above; the previous
# spelling `fields` is not defined anywhere before this cell and raises
# NameError on a fresh kernel.
round2 = pd.read_json("../data/analytics_merge/round2_merged.json")
round2_clean = round2[fields_]
round3 = pd.read_json("../data/analytics_merge/round3_merged.json")
round3_clean = round3[fields_]
round4 = pd.read_json("../data/analytics_merge/round4_merged.json")
# Round 4 exposes the value under `discount`; copy it to `discount_agreed`
# so the column selection below finds the name listed in `fields_`.
round4['discount_agreed'] = round4['discount']
round4_clean = round4[fields_]
round5 = pd.read_json("../data/analytics_merge/round5_merged.json")
round5_clean = round5[fields_]

dataframes= [round2_clean, round3_clean, round4_clean, round5_clean]

# Row-wise concatenation; each round keeps its own index values.
complete_data = pd.concat(dataframes)
print(len(complete_data))

1074


In [20]:
# Read the full NYC listings table.
# NOTE(review): the captured output of this cell looks like the tail of a
# warning raised during the read; if it is a pandas DtypeWarning (mixed-type
# column), consider passing an explicit `dtype=` for the affected column.
complete_listings = pd.read_csv(filepath_or_buffer='../data/listings/nyc_listings.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [21]:
# Display the column index of the listings table.
complete_listings.columns

Index([u'id', u'listing_url', u'scrape_id', u'last_scraped', u'name',
       u'summary', u'space', u'description', u'experiences_offered',
       u'neighborhood_overview', u'notes', u'transit', u'thumbnail_url',
       u'medium_url', u'picture_url', u'xl_picture_url', u'host_id',
       u'host_url', u'host_name', u'host_since', u'host_location',
       u'host_about', u'host_response_time', u'host_response_rate',
       u'host_acceptance_rate', u'host_is_superhost', u'host_thumbnail_url',
       u'host_picture_url', u'host_neighbourhood', u'host_listings_count',
       u'host_total_listings_count', u'host_verifications',
       u'host_has_profile_pic', u'host_identity_verified', u'street',
       u'neighbourhood', u'neighbourhood_cleansed',
       u'neighbourhood_group_cleansed', u'city', u'state', u'zipcode',
       u'market', u'smart_location', u'country_code', u'country', u'latitude',
       u'longitude', u'is_location_exact', u'property_type', u'room_type',
       u'accommodates', u