In [20]:
import os
import json
import time
import math
import copy
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

# pd.set_option("display.max_rows", None, "display.max_columns", None)
import sys
# np.set_printoptions(threshold=sys.maxsize)


class Haversine:
    '''
    from: https://nathanrooy.github.io/posts/2016-09-07/haversine-with-python/

    use the haversine class to calculate the distance between
    two lon/lat coordnate pairs.
    output distance available in kilometers, meters, miles, and feet.
    example usage: Haversine([lon1,lat1],[lon2,lat2]).feet
    
    '''
    def __init__(self,coord1,coord2):
        lon1,lat1=coord1
        lon2,lat2=coord2
        
        R=6371000                               # radius of Earth in meters
        phi_1=math.radians(lat1)
        phi_2=math.radians(lat2)

        delta_phi=math.radians(lat2-lat1)
        delta_lambda=math.radians(lon2-lon1)

        a=math.sin(delta_phi/2.0)**2+\
           math.cos(phi_1)*math.cos(phi_2)*\
           math.sin(delta_lambda/2.0)**2
        c=2*math.atan2(math.sqrt(a),math.sqrt(1-a))
        
        self.meters=R*c                         # output distance in meters
        self.km=self.meters/1000.0              # output distance in kilometers
        self.miles=self.meters*0.000621371      # output distance in miles
        self.feet=self.miles*5280               # output distance in feet
    
    
    def withinTolerance(self, tolerance):
        if self.meters <= tolerance:
            return True
        else:
            return False


def assignParksToHotels(hotels_df, np_df, tolerance):
    '''
    hotels_df = dataframe of initial hotels data
    np_df = dataframe of initial national park
    
    Assigned park ids to a hotel 
    '''
    nearby = []
    additional = []
    hotels_record = {}
    for i, hotel_row in hotels_df.iterrows():
        hotel_coords = [hotel_row['longitude'], hotel_row['latitude']]
        hotel_key = hash(hotel_row['longitude']) + 11 * hotel_row['latitude']
        if hotel_key in hotels_record:     # To reference old computation bc multiple reviews exist for same hotel
            nearby.append(hotels_record[hotel_key])
            continue
        nearby_np = []
        for j, np_row in np_df.iterrows():
            np_coords = [np_row['coordinates/longitude'], np_row['coordinates/latitude']]
            np_radius = math.sqrt(float(np_row['area/square_km'].replace(",",""))) * 1000  # Radius of park in meters
            H = Haversine(hotel_coords, np_coords)
            if H.withinTolerance(tolerance + np_radius):
                temp_nearby = {
                    'id': np_row['id'],
                    'dist': H.meters - np_radius
                }
                nearby_np.append(temp_nearby)
        if len(nearby_np) == 0:
            nearby.append(np.nan)
            hotels_record[hotel_key] = np.nan
        elif len(nearby_np) == 1:
            nearby.append(nearby_np[0]['id'])
            hotels_record[hotel_key] = nearby_np[0]['id']
        else: # more than 1 nearby national parks
            nearby_np.sort(key=lambda x: x['dist'])
            nearby.append(nearby_np[0]['id'])
            hotels_record[hotel_key] = nearby_np[0]['id']
    
    hotels_df['nearby'] = nearby    
    assignments = list(hotels_record.values())    
    total_num = len(assignments)
    total_unassigned = assignments.count(np.nan)
    print(f"{(total_num - total_unassigned) / total_num}% of hotels ({total_num - total_unassigned}) are near national parks")
    
        
        


In [3]:
np_loc = "national_parks.csv"
trails_loc = "nationalparktrails.csv"
hotels_loc = "7282_1.csv"

np_df = pd.read_csv(np_loc)
trails_df = pd.read_csv(trails_loc)
hotels_df = pd.read_csv(hotels_loc)

In [4]:
#print (df["categories"])

#Loop through data and get counts and unique categories
categories = {}

for i, data in hotels_df.iterrows():
    line = data["categories"]
    words = line.split(",")
    
    for word in words:
        word = word.strip(" ")
        if(word in categories):
            categories[word] += 1
        else:
            categories[word] = 1
    
    #if(i > 100):
    #    break;
categories = pd.DataFrame.from_dict(categories, orient='index', columns=["Categorie"])
categories = categories.sort_values(by=['Categorie'], ascending=False)
#print(np.array(categories.index))

In [5]:
Positive = ['Resorts', 'Wedding Receptions & Parties', 'Banquet Rooms', 'Casinos', 'Wedding Sites', 
            'Banquet Facilities', 'Resorts & Vacation Cottages', 'Banquet Halls & Reception Facilities',
            'Boutique Hotels', 'Vacation Home Rental', 'Cabin Cottage & Chalet Rental', 
            'Golf Courses - Private' 'Party & Event Planners', 'Wedding Chapels', 'Resortresort_hotel',
            'Bowling Alleys', 'Luxury Hotels', 'Resort', 'Wine Country Hotels',  
            'Wedding Reception Locations & Services', 'Villas',  'Cottages', 'precooked and gourmet', 
            'Hotel and Resort', 'Gastronomic specialties', 'Spa', 'Health Spas', 'Golf Courses', 
            'Ski Resorts', 'Spa Resorts', 'Hotel and Brewery', 'ski & snowboard areas', 'Cocktail Lounges', 
            'Wedding Guests', 'Wedding Guest Accommodations', 'Wedding Planners', 'Driving Ranges', 
            'Wedding Workouts', 'Condominiums and Townhouses', 'Honeymoon Resorts', 'Ski Apparel', 
            'Air Sightseeing Tours', 'Resorts Reservations', 'Wedding Reception Locations', 
            'Cabins Cottages & Chalet Rental', 'Whiskey Bars', 'resort', 'Wedding Chapels & Ceremonies',
            'Cabins & Chalets','Spas', 'Public Golf Courses','Mediation Services', 'Attorneys',  'Private Golf Courses']


Neutral = ['Hotels', 'Hotel, Lodging', 'Family-Friendly Hotels', 'Tourist Hotels', 
           'Business Hotels', 'Convention & Meeting Facilities & Services', 'Weekend Getaway Hotels',
           'Bed & Breakfasts', 'Conventions Conferences & Trade Shows', 'Corporate Lodging', 
           'Travel & Transport', 'Accommodations & Lodging', 'Beach Hotels', 'Bed & Breakfast & Inns',
           'Meeting Facilities', 'Conference Room', 'Travel', 'Travel and Tourism', 'Restaurants',  
           'hotel', 'fun hotels in seattle', 'Accommodation Reservations', 'Apartment Hotels', 'Bar',
           'and Hotel Bar', 'Meeting Room',  'Cabins', 'Hotel & Motel Reservations', 'Real Estate Developers',
           'Home Builders', 'Hotels and Lodging', 'Bed and Breakfast', 'Reception Centers',
           'Hotel Downtown St. Cloud', 'B&bs / Inns',  'Inns',  'Meat', 'Vacation Packages',
           'Meeting & Event Planning Services', 'and Meeting Room South Eugene', 'and Meeting Room',
           'Travel & Leisure', 'Grill Restaurants', 'Swimming Pools', 'Family-friendly Hotels',
           'Hotel and Meeting Room', 'Conference & Convention Centers', 'Conventions', 'restaurants',
           'travel', 'preserved', 'Foods',  'Restaurant', 'Motels', 'Wine Bar', 'Breakfast Spot', 
           'Convention Center', 'Wholesale Copiers', 'American Restaurant', "O'fallon Hotels", 
           'and Meeting Room Downtown Houston', 'Hotels-Apartment', 'American Restaurants', 
           'Vacation Rentals', 'Bed & Breakfasts (b&b)', 'Hotel and Bed Breakfast Prudential - St. Botolph', 
           'Hotel Colorado Springs Airport', 'Corporate Housing', 'Apartments', 'operation',
           'and Meeting Room Park 100' 'Retreat Houses','Apartment Operators', 'dallas hotels',
           'dallas','Convention Services & Facilities', 'texas (tx)', 'united states',
           'Eufaula Hotels and Motels', 'Conference Centers','Halls Auditoriums & Ballrooms', 
           'Eufaula', 'Bars & Clubs', 'Bed & Breakfast Reservations', 'Vacation Homes Rentals & Sales',
           'Historical Sites', 'Tourist Homes', 'Picnic Grounds', 'E-Commerce', 'Nature Centers', 'Railroads',
           'Mexican', 'Condominium Management', 'Coffeehouses', 'Rail Tours', 'Hotel & Motel Consultants',
           'Tours', 'Cottages & Cabins', 'Homes & Residential Real Estate', 'Raton Hotels and Motels', 'Raton',
           'Arcade', 'Condominium Operators', 'Hotel Pool', 'Real Estate Rentals', 'California (ca)', 'Lodges',
           'Taverns', 'Bars', 'Retirement Apartments & Hotels', 'Uncategorized', 'Watsonville', 
           'Watsonville Hotels', 'Travel Agencies', 'Bed & Breakfast', 'Convention and Meeting Facilities and Services',
           'Hotel & Motel Management','hotels', 'Condominiums', 'Condominium & Townhouse Rental & Leasing', 
           'Real Estate Management', 'Hanukkah Events', 'Bar & Grills', 'Brew Pubs', 'Wine Bars', 'Pubs', 
           'Camps-Recreational', 'Pizza Place', 'Places Of Interest', 'Government Contractors', 
           'Tourist Information & Attractions', 'Lodges In', 'Cruise Lines & Agents' 'Convention Visitors & Information Centers',
           'General Contractors', 'Italian Restaurants', 'Travel Agency', 'Campgrounds', 'Marinas', 'Night Clubs',
           'Mediterranean Restaurants', 'Amusement Places & Arcades', 'Travel Agents', 'Family Style Restaurants',
           'Guide Service', 'Extended Stay Hotels', 'Attractions', 'solrjson:\\\\Illinois', 'Water Parks & Slides',
           'Apartment Finder & Rental Service', 'Student Housing & Services', 'Real Estate Agents', 'lodging house',
           'bar drinks', 'Travel Tours & Guide Services', 'Community Organizations','Educational Services',
           'Travel & Lodging', 'food catering', 'Trade Shows Expositions & Fairs', 'Burwell', 'Nursing & Convalescent Homes',
           'Rental Services' 'Burwell Hotels and Motels' 'MI' 'Retirement Communities', 'cakes desserts', 'Marketing Consultants',
           'Bicycle Tours', 'Flooring Materials & Supplies', 'Real Estate Buyer Brokers', 'professional', 'Catering',
           'Bars & Pubs', 'Tours & Charters', 'Dixfield', 'ME', 'Truck Stops & Plazas', 'Bernau Hotels', 'Executive Suites',
           'Rest Homes', 'Homes-Institutional & Aged', 'Caterers', 'Real Estate Rental Service', 
           'Campgrounds & Recreational Vehicle Parks', 'Ironwood', 'Colorado (co)', 'Assisted Living Facilities',
           'Nursing & Personal Care Facilities', 'Assisted Living & Elder Care Services', 'Disabled & Elderly Home Health Care',
           'Geriatric Care Nursing Homes', 'Retirement Homes', 'Willows Hotels', 'Intermediate Care Nursing Homes', 
           'Nursing Homes', 'solrjson:\\\\Ennis', 'MT', 'Gift Shops', 'Bed Breakfast Near North Valley', 
           'Food & Entertainment', 'Clare', 'Halls & Auditoriums', 'Party Planning Service', 'Willows', 'Sioux City']


Negative = ['Hotels & Motels', 'Motels', 'Budget Hotels', 'Extended Stay', '“Hotels Motels”', 'Hostel', 'Hotels and Motels',
            'Hotels And Motels', 'Motel and Hotel', 'hotels & motels', 'Motel', 'RV Parks', 'Hotel and Motel'] 



In [6]:
positive_count = 0
neutral_count = 0
negative_count = 0

stratify_results = []

count = 0
for i, data in hotels_df.iterrows():
    count += 1
    line = data["categories"]
    words = line.split(",")
    
    score = 0
    for word in words:
        if word in Positive:
            score += 1
        if word in Negative:
            score -= 1
    
    if(score > 0):
        stratify_results.append("High End")
        positive_count += 1
    elif(score == 0):
        stratify_results.append("Average")
        neutral_count += 1
    elif(score < 0):
        stratify_results.append("Low End")
        negative_count += 1
    else:
        print(score)
    
hotels_df["Stratify"] = stratify_results
# print(df[:10])    
    
print("Positive", positive_count)
print("Neutral", neutral_count)
print("Negative", negative_count)


Positive 2406
Neutral 28407
Negative 5099


In [14]:
TOLERANCE = 80467.2  # 50 miles
assignParksToHotels(hotels_df, np_df, TOLERANCE)
hotels_df['Park'] = hotels_df['nearby'].notna()

0.1627689429373246% of hotels (174) are near national parks


In [21]:
#https://stackoverflow.com/questions/13404468/t-test-in-pandas

#High end
High_Far  = hotels_df[(hotels_df['Stratify'] == "High End") & (hotels_df['Park'] == False)]
High_Near = hotels_df[(hotels_df['Stratify'] == "High End") & (hotels_df['Park'] == True)]

High_Ttest = ttest_ind(High_Far['reviews.ratings'], High_Near['reviews.ratings'])
print("t-statistic: ", High_Ttest[0],"p-value", High_Ttest[1])

#Average
Avg_Far  = hotels_df[(hotels_df['Stratify'] == "Average") & (hotels_df['Park'] == False)]
Avg_Near = hotels_df[(hotels_df['Stratify'] == "Average") & (hotels_df['Park'] == True)]

Avg_Ttest = ttest_ind(Avg_Far['reviews.ratings'], Avg_Near['reviews.ratings'])
print("t-statistic: ", Avg_Ttest[0],"p-value", Avg_Ttest[1])

#Low end
Low_Far  = hotels_df[(hotels_df['Stratify'] == "Average") & (hotels_df['Park'] == False)]
Low_Near = hotels_df[(hotels_df['Stratify'] == "Average") & (hotels_df['Park'] == True)]

Low_Ttest = ttest_ind(Low_Far['reviews.ratings'], Low_Near['reviews.ratings'])
print("t-statistic: ", Low_Ttest[0],"p-value", Low_Ttest[1])





KeyError: 'reviews.ratings'