## Importing packages and attributes

In [1]:
import pandas as pd
import numpy as np
import re
from geopy.distance import great_circle

## Data Processing

In this notebook we will process the raw data scraped in our *`TripAdvisor_scraper.ipynb`* We start by loading the CSV file, we created.

In [2]:
trip_df = pd.read_csv("Tripadvisordata_raw.csv")
#trip_df.tail() # prints to check if the read went well

### Restructuring data

In [3]:
trip_df = trip_df.replace(regex=['&amp;'], value='&')
trip_df['Main rating'] = trip_df['Main rating'].replace(regex=[','], value='.')
trip_df['Good price'] = trip_df['God pris'] / 10
trip_df['Food'] = trip_df['Mad'] / 10
trip_df['Service'] = trip_df['Service'] / 10
trip_df['Atmosphere'] = trip_df['Stemning'] / 10
trip_df = trip_df[['Restaurant', 'Main rating', 'Ranking on list', 'Price range', 'Price class', 'Location', 'Good price', 'Food', 'Service', 'Atmosphere', 'Type of food', 'Number of reviews', 'Address', 'Type of food link']]

In [4]:
# edit missing data
trip_df["Price class"] = trip_df["Price class"].replace(np.nan, '--$$$$$$', regex=True)

# Translate the price class, to a numeric value

string = trip_df["Price class"]
string = [word.replace('nan','--$$$$$$') for word in string]

# find indexnumbers for more places with missing with missing data
list = []
for i in range(len(string)):    
    if "$" not in string[i]:
        list.append(i)

for i in list: 
    string[i] = '--$$$$$$'

In [5]:
# Filter the "-"'s      
new = []
for i in string:
    x = i.split('--')[1]
    new.append(x)


trip_df["New price class"] = new

#the tranlating keys
dollarsign = ["$", "$-$$", "$$", "$$-$$$", "$$$", "$$$-$$$$","$$$$", "$$$$-$$$$$", "$$$$$", '$$$$$$']
values = [1, 1, 1, 2, 2, 2, 3, 3, 3, 'null']

# make dataframe
dict_ = {'New price class': ["$", "$-$$", "$$", "$$-$$$", "$$$", "$$$-$$$$","$$$$", "$$$$-$$$$$", "$$$$$", '$$$$$$'], 'Price class numeric': [1, 1, 1, 2, 2, 2, 3, 3, 3, 99]}
translate_priceclass = pd.DataFrame(dict_)

new_trip_df = pd.merge(trip_df, translate_priceclass, on='New price class', how='left')

In [6]:
# Make our own ranking system
new_trip_df["Reverse ranking on list"] = (-1)*new_trip_df["Ranking on list"]

# sort on rating, ranking on list and number of reviews
new_trip_df = new_trip_df.sort_values(by=["Main rating", "Reverse ranking on list", "Number of reviews"], ascending=True)

# Add ranking
new_trip_df["Full ranking"] = range(1,len(new_trip_df["Number of reviews"])+1)

In [7]:
#køkken_list = ['Café', 'Afrikansk','Amerikansk','Arabisk','Argentinsk','Armensk','Aserbajdsjansk','Asiatisk','Bar','Belgisk','Brasiliansk','Britisk','Cajun og kreolsk','Cambodjansk','Canadisk','Caribisk','Centralasiatisk','Centraleuropæisk','Centralitaliensk','Dansk','Delikatesseforretning','Egyptisk','Etiopisk','Europæisk','Fastfood','Filippinsk','Fisk og skaldyr','Fra Lazio','Fra Shanghai','Fransk','Fusion','Gademad','Gastropub','Grill','Grillmad','Græsk','Hawaiiansk','Hollandsk','Hongkong','Indiansk','Indisk','Indonesisk','International','Irsk','Israelsk','Italiensk','Japansk','Kantonesisk','Kinesisk','Koreansk','Kroatisk','Latinamerikansk','Libanesisk','Malaysisk','Marokkansk','Mellemamerikansk','Mellemøstlig','Mexicansk','Middelhavsområdet','Moderne','Mongolsk','Nepalesisk','New Zealand','Norditaliensk','Norsk','Pakistansk','Persisk','Peruviansk','Pizza','Portugisisk''Pub','Russisk','Schweizisk','Siciliansk','Singaporeansk','Skandinavisk','Spansk','Specialiteter fra Beijing','Spisested','Steakhouse','Sund','Supper','Sushi','Svensk','Sydamerikansk','Syditaliensk','Szechuan','Taiwansk','Thai','Tibetansk','Toscansk','Tyrkisk','Tysk','Venezuelansk','Vietnamesisk','Vinstue','Xinjiang','Ølpub','Østeuropæisk','Østrigsk']
#most_pop = ['Café', "Italiensk", "Pizza", "Indisk", "Fisk og skaldyr", "Thai", "Steakhouse", "Mexikansk"]
#
#link_list = new_trip_df["Type of food link"]
#link_df = pd.DataFrame(link_list)
#
#food_list_1 = [] #list of list over kitchens
#for link in food_list_1:
#    food_list_1.append([x for x in køkken_list if x in link])
#    
#food_list_2 = []
#for link in food_list_1:
#    food_list_2.append([x for x in most_pop if x in link])
#food_list_2
#
#new_trip_df["Pop kitchen"] = food_list_1

In [8]:
#køkken_list = ['Café', 'Afrikansk','Amerikansk','Arabisk','Argentinsk','Armensk','Aserbajdsjansk','Asiatisk','Bar','Belgisk','Brasiliansk','Britisk','Cajun og kreolsk','Cambodjansk','Canadisk','Caribisk','Centralasiatisk','Centraleuropæisk','Centralitaliensk','Dansk','Delikatesseforretning','Egyptisk','Etiopisk','Europæisk','Fastfood','Filippinsk','Fisk og skaldyr','Fra Lazio','Fra Shanghai','Fransk','Fusion','Gademad','Gastropub','Grill','Grillmad','Græsk','Hawaiiansk','Hollandsk','Hongkong','Indiansk','Indisk','Indonesisk','International','Irsk','Israelsk','Italiensk','Japansk','Kantonesisk','Kinesisk','Koreansk','Kroatisk','Latinamerikansk','Libanesisk','Malaysisk','Marokkansk','Mellemamerikansk','Mellemøstlig','Mexicansk','Middelhavsområdet','Moderne','Mongolsk','Nepalesisk','New Zealand','Norditaliensk','Norsk','Pakistansk','Persisk','Peruviansk','Pizza','Portugisisk''Pub','Russisk','Schweizisk','Siciliansk','Singaporeansk','Skandinavisk','Spansk','Specialiteter fra Beijing','Spisested','Steakhouse','Sund','Supper','Sushi','Svensk','Sydamerikansk','Syditaliensk','Szechuan','Taiwansk','Thai','Tibetansk','Toscansk','Tyrkisk','Tysk','Venezuelansk','Vietnamesisk','Vinstue','Xinjiang','Ølpub','Østeuropæisk','Østrigsk']
#most_pop = ['Café', "Italiensk", "Pizza", "Indisk", "Fisk og skaldyr", "Thai", "Steakhouse", "Mexikansk"] # gathered from TripAdvisor

#link_list = new_trip_df["Type of food link"]
#link_df = pd.DataFrame(link_list)

#list_test = [] #list of list over kitchens
#for link in link_list:
#    list_test.append([x for x in køkken_list if x in link])
    
#link_test2 = []
#for link in link_list:
#    link_test2.append([x for x in most_pop if x in link])
#link_test2

#new_trip_df["Pop kitchen"] = link_test2

## Adding measures to the dataframe

Calculating the distance to our measure point, which will be Kongens Nytorv in Copenhagen Denmark.

In [9]:
distance_list = []
Kgs_Nytorv = '55.679977,12.5841893' #longitude and latitude for Kongens Nytorv

#calculating distance from nytorv to the coordinates in the list
def distance(x):
    Start = new_trip_df["Location"][x]
    Stop = Kgs_Nytorv
    distance_list.append(great_circle(Start, Stop).meters)
    
for x in trip_df.index:
    distance(x)
    
#appending to df 
new_trip_df["Distance from Kgs. Nytorv (m)"] = distance_list 
new_trip_df['Distance from Kgs. Nytorv (m)'] = new_trip_df['Distance from Kgs. Nytorv (m)'].round()

In [10]:
new_trip_df['Postal code'] = [str(x).split('København')[-1].split('Danmark')[0] for x in new_trip_df['Address']]

In [15]:
new_trip_df_test = new_trip_df[['Restaurant', 'Main rating', 'Good price', 'Food', 'Service', 'Atmosphere', 'Price range', 'New price class', 'Price class numeric', 'Type of food', 'Type of food link', 'Number of reviews', 'Address', 'Location', 'Distance from Kgs. Nytorv (m)', 'Postal code', 'Ranking on list', 'Reverse ranking on list', 'Full ranking']]
new_trip_df_test = new_trip_df_test.dropna(subset = ['Main rating'])
new_trip_df_test

Unnamed: 0,Restaurant,Main rating,Good price,Food,Service,Atmosphere,Price range,New price class,Price class numeric,Type of food,Type of food link,Number of reviews,Address,Location,Distance from Kgs. Nytorv (m),Postal code,Ranking on list,Reverse ranking on list,Full ranking
2142,Burger King,1.0,,,,,,$$$$$$,99,,,2.0,"Ellebjergvej 142, København 2450 Danmark","55.65107,12.50931",199.0,2450,2148.0,-2148.0,1
2147,Almanac,1.0,,,,,,$$$$$$,99,Dansk,"<div class=""header_links""><a href=""/Restaurant...",2.0,"Havnegade 44, København Danmark","55.67788,12.591933",459.0,,2146.0,-2146.0,2
2130,Star Midnight Kebab-Grill,1.0,,,,,101,$$-$$$,2,Grill,"<div class=""header_links""><a href=""/Restaurant...",3.0,"Istedgade 101, København Danmark","55.6679,12.54941",3431.0,,2143.0,-2143.0,3
2134,Sunset Boulevard,1.0,,,,,,$$$$$$,99,,,2.0,"Københavns Hovedbanegård, København 1570 Danmark","55.67502,12.580593",908.0,1570,2137.0,-2137.0,4
2096,Dwaraka Indisk Restaurant,1.0,,,,,141,$$-$$$,2,Indisk,"<div class=""header_links""><a href=""/Restaurant...",1.0,"Frederiksborgvej 221, St, København 2860 Danmark","55.72732,12.524776",1171.0,2860,2104.0,-2104.0,5
2088,McDonald's,1.0,,,,,,$$$$$$,99,Fastfood,"<div class=""header_links""><a href=""/Restaurant...",1.0,"Fisketorvet Shopping Center, København 1560 Da...","55.66268,12.56187",772.0,1560,2103.0,-2103.0,6
2081,Præg Kaffebar,1.0,,,,,,$$$$$$,99,Café,"<div class=""header_links""><a href=""/Restaurant...",1.0,"Noerrebrogade 44, København 2200 Danmark","55.68939,12.55704",3390.0,2200,2100.0,-2100.0,7
2094,Cafe Wok,1.0,,,,,,$$$$$$,99,,,1.0,"Arne Jacobsens Alle 12, København 2300 Danmark","55.63103,12.57589",1811.0,2300,2099.0,-2099.0,8
2076,Kujaku,1.0,,,,,,$$$$$$,99,Sushi,"<div class=""header_links""><a href=""/Restaurant...",1.0,"Nørregade 28, København 1165 Danmark","55.458576,12.183993",540.0,1165,2090.0,-2090.0,9
2072,Gastronomia Italiana Dieci,1.0,,,,,,$$$$$$,99,,,1.0,"Sønder Boulevard 40, København 1720 Danmark","55.66731,12.55365",477.0,1720,2085.0,-2085.0,10


We notice that the last Observation has few reviews, no price class thus we choose to drop that observation, so it isn't ranked as the best restaurant in the data. The rest of our data looks correct.

In [14]:
new_trip_df_test[new_trip_df_test.Restaurant != 'Osteria 16']

Unnamed: 0,Restaurant,Main rating,Good price,Food,Service,Atmosphere,Price range,New price class,Price class numeric,Type of food,Type of food link,Number of reviews,Address,Location,Distance from Kgs. Nytorv (m),Postal code,Ranking on list,Reverse ranking on list,Full ranking
2142,Burger King,1.0,,,,,,$$$$$$,99,,,2.0,"Ellebjergvej 142, København 2450 Danmark","55.65107,12.50931",199.0,2450,2148.0,-2148.0,1
2147,Almanac,1.0,,,,,,$$$$$$,99,Dansk,"<div class=""header_links""><a href=""/Restaurant...",2.0,"Havnegade 44, København Danmark","55.67788,12.591933",459.0,,2146.0,-2146.0,2
2130,Star Midnight Kebab-Grill,1.0,,,,,101,$$-$$$,2,Grill,"<div class=""header_links""><a href=""/Restaurant...",3.0,"Istedgade 101, København Danmark","55.6679,12.54941",3431.0,,2143.0,-2143.0,3
2134,Sunset Boulevard,1.0,,,,,,$$$$$$,99,,,2.0,"Københavns Hovedbanegård, København 1570 Danmark","55.67502,12.580593",908.0,1570,2137.0,-2137.0,4
2096,Dwaraka Indisk Restaurant,1.0,,,,,141,$$-$$$,2,Indisk,"<div class=""header_links""><a href=""/Restaurant...",1.0,"Frederiksborgvej 221, St, København 2860 Danmark","55.72732,12.524776",1171.0,2860,2104.0,-2104.0,5
2088,McDonald's,1.0,,,,,,$$$$$$,99,Fastfood,"<div class=""header_links""><a href=""/Restaurant...",1.0,"Fisketorvet Shopping Center, København 1560 Da...","55.66268,12.56187",772.0,1560,2103.0,-2103.0,6
2081,Præg Kaffebar,1.0,,,,,,$$$$$$,99,Café,"<div class=""header_links""><a href=""/Restaurant...",1.0,"Noerrebrogade 44, København 2200 Danmark","55.68939,12.55704",3390.0,2200,2100.0,-2100.0,7
2094,Cafe Wok,1.0,,,,,,$$$$$$,99,,,1.0,"Arne Jacobsens Alle 12, København 2300 Danmark","55.63103,12.57589",1811.0,2300,2099.0,-2099.0,8
2076,Kujaku,1.0,,,,,,$$$$$$,99,Sushi,"<div class=""header_links""><a href=""/Restaurant...",1.0,"Nørregade 28, København 1165 Danmark","55.458576,12.183993",540.0,1165,2090.0,-2090.0,9
2072,Gastronomia Italiana Dieci,1.0,,,,,,$$$$$$,99,,,1.0,"Sønder Boulevard 40, København 1720 Danmark","55.66731,12.55365",477.0,1720,2085.0,-2085.0,10


In [None]:
#trip_df = trip_df.sort_values(by='Ranking on list', ascending=True)
#new_trip_df.to_csv("Tripadvisordata_final_test.csv", index=False)