# Setup and Utilities

In [1]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing
import scipy

import ast
import collections

import matplotlib as plt
import seaborn as sb
from pycorenlp import StanfordCoreNLP
import json

In [2]:
# Load data from a file to a pandas dataframe
def load_data(state='ca', city='los-angeles'):
    """Read the listings CSV into a pandas DataFrame.

    Args:
        state: Currently unused; the file location below is fixed.
        city: Currently unused; the file location below is fixed.

    Returns:
        The DataFrame parsed from ``../listingsshort.csv`` (resolved against
        the current working directory).
    """
    # A per-state/per-city location was used previously and is kept for reference:
    # f"../../project_data/inside_airbnb/united-states/{state}/{city}/listings.csv"
    return pd.read_csv("../listingsshort.csv")

def load_reviews(state='ca', city='los-angeles'):
    """Read the reviews CSV into a pandas DataFrame.

    Args:
        state: Currently unused; the file location below is fixed.
        city: Currently unused; the file location below is fixed.

    Returns:
        The DataFrame parsed from ``../reviews.csv`` (resolved against the
        current working directory).
    """
    # A per-state/per-city location was used previously and is kept for reference:
    # f"../../project_data/inside_airbnb/united-states/{state}/{city}/reviews.csv"
    return pd.read_csv("../reviews.csv")

# Remove columns not used for models.
# Format numerical string data into numerical datatypes
# Format t/f into binary representation
def clean_and_drop(df, drop_price=False, drop_amenities=True):
    """Drop columns the models do not use and convert string fields to numbers.

    The input frame is not modified; all work happens on the frame returned
    by ``drop``.

    Args:
        df: Listings frame. Must contain every column named below (the
            dropped columns, the two host rate columns, ``price``,
            ``amenities`` when ``drop_amenities`` is true, and the five
            't'/'f' flag columns); a missing column raises ``KeyError``.
        drop_price: When true, ``price`` is removed from the returned frame
            (it is still returned separately).
        drop_amenities: When true, the raw ``amenities`` column is removed.

    Returns:
        Tuple ``(cleaned_df, price)`` where ``price`` is the numeric price
        Series extracted before any optional drop.
    """
    unused_cols = [
        'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
        'neighborhood_overview', 'neighbourhood_group_cleansed', 'picture_url', 'host_id', 'host_url',
        'host_name', 'host_since', 'host_location', 'host_about', 'host_thumbnail_url', 'host_picture_url',
        'host_neighbourhood', 'neighbourhood', 'bathrooms',
        'calendar_last_scraped', 'first_review', 'last_review', 'license', 'host_verifications', 'calendar_updated',
    ]
    df = df.drop(unused_cols, axis="columns")

    # Percent strings such as "86%" become fractions such as 0.86.
    for rate_col in ('host_response_rate', 'host_acceptance_rate'):
        df[rate_col] = df[rate_col].str.rstrip('%').astype('float') / 100.0

    # regex=False makes '$' a literal character regardless of the pandas
    # version's default; as a regex it would be an end-of-string anchor.
    df['price'] = (
        df['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype('float')
    )
    price = df['price']

    if drop_price:
        df = df.drop('price', axis="columns")
    if drop_amenities:
        df = df.drop('amenities', axis="columns")

    # 't'/'f' flags become 1/0; any other value (including missing) maps to NaN.
    flag_map = {'t': 1, 'f': 0}
    for flag_col in ('host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
                     'has_availability', 'instant_bookable'):
        df[flag_col] = df[flag_col].map(flag_map)

    return df, price

# Run sentiment analysis on text string
def stanford_sentiment(text_str, nlp):
    """Return the mean per-sentence sentiment value for a piece of text.

    Args:
        text_str: Text to annotate.
        nlp: Client object exposing ``annotate(text, properties=...)``; the
            response must contain a ``"sentences"`` list whose items carry a
            ``"sentimentValue"`` convertible to ``int``.

    Returns:
        The arithmetic mean of the sentence sentiment values, or NaN when the
        response contains no sentences.

    Raises:
        json.JSONDecodeError: If the client returns text that is not valid
            JSON (for instance an error message instead of an annotation).
    """
    res = nlp.annotate(
        text_str,
        properties={
            'annotators': 'sentiment',
            'timeout': 10000,
        },
    )

    # Decode raw JSON text; pass through a response that is already decoded.
    parsed = json.loads(res) if isinstance(res, (str, bytes, bytearray)) else res
    sentences = parsed["sentences"]

    # np.mean of an empty sequence yields NaN but also emits a RuntimeWarning;
    # return NaN directly so empty reviews stay quiet.
    if not sentences:
        return np.nan

    return np.mean([int(s["sentimentValue"]) for s in sentences])

# Runs sentiment analysis on review comments and stores the scores in the DataFrame
def run_sentiment_analysis(df, nlp, limit=1000):
    """Score review comments and store the result in an ``avgSentiment`` column.

    The frame is modified in place and also returned.

    Args:
        df: Reviews frame with a ``comments`` column.
        nlp: Client forwarded to ``stanford_sentiment``.
        limit: Maximum number of leading rows to score (default 1000, the
            value that used to be hard-coded). ``None`` scores every row.
            Frames shorter than ``limit`` are scored in full.

    Returns:
        The same ``df`` object. Rows beyond ``limit`` and rows whose comment
        is not a string (e.g. missing) keep whatever ``avgSentiment`` they
        had, which is NaN when the column did not exist yet.
    """
    # Create the column up front so it exists even when nothing gets scored.
    if 'avgSentiment' not in df.columns:
        df['avgSentiment'] = np.nan

    for label in df.index[:limit]:
        comment = df.loc[label, 'comments']
        # Missing comments are read back as float NaN and have no .replace().
        if not isinstance(comment, str):
            continue
        df.loc[label, 'avgSentiment'] = stanford_sentiment(comment.replace('\n', " "), nlp)
    return df

# Adds sentiment analysis to df
def concat_sentiment_analysis(df, rdf):
    """Attach each listing's mean review sentiment to the listings frame.

    Args:
        df: Listings frame with an ``id`` column.
        rdf: Reviews frame with ``listing_id`` and ``avgSentiment`` columns.

    Returns:
        ``df`` merged with the per-listing mean of ``avgSentiment`` (matched
        on ``id`` == ``listing_id`` using the merge's default join), with the
        ``id`` column removed.
    """
    mean_by_listing = rdf.groupby("listing_id")["avgSentiment"].mean()
    merged = df.merge(mean_by_listing, left_on="id", right_on="listing_id")
    return merged.drop(columns="id")

# Returns a set of all amenities
def get_amenities_set(df):
    """Collect every distinct amenity appearing in ``df.amenities``.

    Each entry of the column is parsed with ``ast.literal_eval`` and its
    elements are gathered into one set.

    Args:
        df: Frame with an ``amenities`` column of Python/JSON-style list
            literals stored as strings.

    Returns:
        Set of all amenity values found across the column.
    """
    return {item for raw in df.amenities for item in ast.literal_eval(raw)}

# Returns a dict of {amenity: # of occurrences in all rental properties in df}
def get_amenities_dict(df, cutoff=25):
    """Count how many listings mention each amenity.

    Args:
        df: Frame with an ``amenities`` column of list literals stored as
            strings. Null entries are skipped.
        cutoff: Minimum count an amenity needs to be included.

    Returns:
        Plain dict ``{amenity: count}`` restricted to amenities whose count
        is at least ``cutoff``. Amenity names are counted case-sensitively.
    """
    counts = collections.Counter()
    for raw in df.amenities:
        # literal_eval cannot parse a missing value, so skip it.
        if pd.isnull(raw):
            continue
        counts.update(ast.literal_eval(raw))
    return {name: count for name, count in counts.items() if count >= cutoff}


def create_and_populate_amenity_cols(df, cutoff=25):
    """Append one 0/1 indicator column per frequent amenity.

    Column names are the lower-cased amenity names that pass ``cutoff`` in
    ``get_amenities_dict``; names that collide after lower-casing share a
    single column. A listing's indicator is 1 when any of its amenities
    matches the column name case-insensitively.

    Args:
        df: Frame with an ``amenities`` column of list literals stored as
            strings. Null entries produce all-zero indicators.
        cutoff: Minimum occurrence count forwarded to ``get_amenities_dict``.

    Returns:
        A new frame: ``df``'s columns followed by the indicator columns.
        ``df`` itself is left untouched.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    amenities = list(dict.fromkeys(s.lower() for s in get_amenities_dict(df, cutoff)))
    column_of = {name: j for j, name in enumerate(amenities)}

    # Build the whole indicator matrix first; one row per listing.
    flags = np.zeros((len(df), len(amenities)), dtype=int)
    for i, raw in enumerate(df['amenities']):
        if pd.isnull(raw):
            continue
        for item in ast.literal_eval(raw):
            j = column_of.get(item.lower())
            if j is not None:
                flags[i, j] = 1

    flag_df = pd.DataFrame(flags, index=df.index, columns=amenities)
    return pd.concat([df, flag_df], axis=1)

## Data Encoding

In [3]:
# Client for the CoreNLP server addressed at localhost:9000.
nlp = StanfordCoreNLP('http://localhost:9000')

# Clean the listings: price is split off as the target and removed from the
# features, while the raw amenities column is kept.
df, price = clean_and_drop(
    load_data(state='ca', city='los-angeles'),
    drop_price=True,
    drop_amenities=False,
)

# Score the reviews, then attach each listing's mean review sentiment.
rdf = run_sentiment_analysis(load_reviews(), nlp)
result = concat_sentiment_analysis(df, rdf)

# Amenity indicator columns are currently disabled, so the merged frame is
# what gets carried forward as a_df:
# a_df = create_and_populate_amenity_cols(df)
a_df = result
result

  df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype('float')
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,avgSentiment
0,within a day,1.00,0.86,0.0,1.0,1.0,1.0,1.0,Mount Washington,34.10632,...,4.96,4.58,4.85,0,1,0,1,0,0.20,2.705394
1,within an hour,1.00,0.93,1.0,8.0,8.0,1.0,1.0,Silver Lake,34.09574,...,4.98,4.91,4.82,0,4,4,0,0,1.09,2.698995
2,within a few hours,1.00,0.81,1.0,8.0,8.0,1.0,1.0,Del Rey,33.98750,...,4.92,4.80,4.70,0,3,0,3,0,2.00,2.635437
3,within a few hours,1.00,0.81,1.0,8.0,8.0,1.0,1.0,Del Rey,33.98750,...,4.89,4.77,4.71,0,3,0,3,0,1.52,2.626382
4,within a few hours,1.00,,0.0,1.0,1.0,1.0,1.0,Culver City,33.98301,...,4.00,5.00,4.00,0,1,1,0,0,0.02,2.583333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12461,within a day,0.60,0.32,0.0,2.0,2.0,1.0,0.0,Carson,33.85376,...,4.50,5.00,4.50,0,2,0,2,0,0.06,
12462,within an hour,0.67,0.00,0.0,1.0,1.0,1.0,0.0,Baldwin Hills/Crenshaw,34.02431,...,4.95,4.91,4.95,0,1,1,0,0,1.61,
12463,within a few hours,1.00,0.50,0.0,2.0,2.0,1.0,1.0,Tarzana,34.16526,...,5.00,4.00,4.00,0,1,1,0,0,0.03,
12464,within a day,1.00,0.98,1.0,1.0,1.0,1.0,1.0,Malibu,34.03637,...,4.91,4.98,4.80,0,1,1,0,0,1.59,


In [4]:
# String-valued columns that are expanded into indicator (dummy) columns.
categorical_cols = [
    "host_response_time",
    "neighbourhood_cleansed",
    "property_type",
    "bathrooms_text",
    "room_type",
]
# a_df is rebound to its encoded version, so this cell consumes the columns
# listed above; rebuild a_df before running it again.
a_df = pd.get_dummies(a_df, columns=categorical_cols)


In [5]:
# Inspect the encoded frame (this cell was written to check the amenities encoding).

# Optional snapshot/export, kept disabled:
# a_df_copy = a_df.copy()
# a_df.to_csv('./amenities_encoded_test.csv', errors='surrogatepass')
a_df

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,...,bathrooms_text_8.5 baths,bathrooms_text_8.5 shared baths,bathrooms_text_9 baths,bathrooms_text_Half-bath,bathrooms_text_Private half-bath,bathrooms_text_Shared half-bath,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,1.00,0.86,0.0,1.0,1.0,1.0,1.0,34.10632,-118.22361,3,...,0,0,0,0,0,0,0,0,1,0
1,1.00,0.93,1.0,8.0,8.0,1.0,1.0,34.09574,-118.27788,4,...,0,0,0,0,0,0,1,0,0,0
2,1.00,0.81,1.0,8.0,8.0,1.0,1.0,33.98750,-118.43200,1,...,0,0,0,0,0,0,0,0,1,0
3,1.00,0.81,1.0,8.0,8.0,1.0,1.0,33.98750,-118.43200,1,...,0,0,0,0,0,0,0,0,1,0
4,1.00,,0.0,1.0,1.0,1.0,1.0,33.98301,-118.38607,6,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12461,0.60,0.32,0.0,2.0,2.0,1.0,0.0,33.85376,-118.25562,2,...,0,0,0,0,0,0,0,0,1,0
12462,0.67,0.00,0.0,1.0,1.0,1.0,0.0,34.02431,-118.36152,2,...,0,0,0,0,0,0,1,0,0,0
12463,1.00,0.50,0.0,2.0,2.0,1.0,1.0,34.16526,-118.54018,6,...,0,0,0,0,0,0,1,0,0,0
12464,1.00,0.98,1.0,1.0,1.0,1.0,1.0,34.03637,-118.63590,8,...,0,0,0,0,0,0,1,0,0,0


In [6]:
# One-hot-encode data for passing into models.
# NOTE(review): the frame encoded here is `df`, not `a_df`, so the avgSentiment
# column and the get_dummies columns never reach the models — confirm intended.
# NOTE(review): OneHotEncoder is applied to every column of `df`, numeric ones
# included, so each distinct number appears to become its own category — confirm.
encoder = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')
# fit_transform fits the encoder itself; a separate fit() beforehand was redundant.
df_onehot = encoder.fit_transform(df).toarray()

# Model Operations

In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model

In [8]:
# k-nearest-neighbours regressor with the library's default settings,
# fitted on the one-hot feature matrix with price as the target.
knn_reg_model = KNeighborsRegressor()
knn_reg_model.fit(df_onehot, price)

# Ridge regression (alpha=0.5) fitted on the same features and target.
# NOTE(review): both models are fitted on the full matrix; no held-out split
# is visible in this part of the notebook — confirm evaluation happens elsewhere.
ridge_reg_model = linear_model.Ridge(alpha=0.5)
# Left as the cell's final bare expression so the fitted estimator is displayed.
ridge_reg_model.fit(df_onehot, price)

Ridge(alpha=0.5)

In [18]:
# Predict the price for the single encoded row at position 96; the row is
# reshaped to one sample by n_features before being handed to predict.
knn_reg_model.predict(df_onehot[96].reshape(1, -1))

array([70.])