## Importing packages and attributes

In [4]:
import pandas as pd
import numpy as np
import re
from geopy.distance import great_circle

## Data Processing

In this notebook we will process the raw data scraped in our *`TripAdvisor_scraper.ipynb`* We start by loading the CSV file, we created.

In [5]:
trip_df = pd.read_csv("Tripadvisordata_raw.csv")
#trip_df.tail() # prints to check if the read went well

### Restructuring data

In [6]:
trip_df = trip_df.replace(regex=['&amp;'], value='&')
trip_df['Main rating'] = trip_df['Main rating'].replace(regex=[','], value='.')
trip_df['Good price'] = trip_df['God pris'] / 10
trip_df['Food'] = trip_df['Mad'] / 10
trip_df['Service'] = trip_df['Service'] / 10
trip_df['Atmosphere'] = trip_df['Stemning'] / 10
trip_df = trip_df[['Restaurant', 'Main rating', 'Ranking on list', 'Price range', 'Price class', 'Location', 'Good price', 'Food', 'Service', 'Atmosphere', 'Type of food', 'Number of reviews', 'Address', 'Type of food link']]

In [7]:
# edit missing data
trip_df["Price class"] = trip_df["Price class"].replace(np.nan, '--$$$$$$', regex=True)

# Translate the price class, to a numeric value

string = trip_df["Price class"]
string = [word.replace('nan','--$$$$$$') for word in string]

# find indexnumbers for more places with missing with missing data
list = []
for i in range(len(string)):    
    if "$" not in string[i]:
        list.append(i)

for i in list: 
    string[i] = '--$$$$$$'

In [8]:
# Filter the "-"'s      
new = []
for i in string:
    x = i.split('--')[1]
    new.append(x)


trip_df["New price class"] = new

#the tranlating keys
dollarsign = ["$", "$-$$", "$$", "$$-$$$", "$$$", "$$$-$$$$","$$$$", "$$$$-$$$$$", "$$$$$", '$$$$$$']
values = [1, 1, 1, 2, 2, 2, 3, 3, 3, 'null']

# make dataframe
dict_ = {'New price class': ["$", "$-$$", "$$", "$$-$$$", "$$$", "$$$-$$$$","$$$$", "$$$$-$$$$$", "$$$$$", '$$$$$$'], 'Price class numeric': [1, 1, 1, 2, 2, 2, 3, 3, 3, 99]}
translate_priceclass = pd.DataFrame(dict_)

new_trip_df = pd.merge(trip_df, translate_priceclass, on='New price class', how='left')

In [9]:
# Make our own ranking system
new_trip_df["Reverse ranking on list"] = (-1)*new_trip_df["Ranking on list"]

# sort on rating, ranking on list and number of reviews
new_trip_df = new_trip_df.sort_values(by=["Main rating", "Reverse ranking on list", "Number of reviews"])

# Add ranking
new_trip_df["Full ranking"] = range(1,len(new_trip_df["Number of reviews"])+1)

In [10]:
#køkken_list = ['Café', 'Afrikansk','Amerikansk','Arabisk','Argentinsk','Armensk','Aserbajdsjansk','Asiatisk','Bar','Belgisk','Brasiliansk','Britisk','Cajun og kreolsk','Cambodjansk','Canadisk','Caribisk','Centralasiatisk','Centraleuropæisk','Centralitaliensk','Dansk','Delikatesseforretning','Egyptisk','Etiopisk','Europæisk','Fastfood','Filippinsk','Fisk og skaldyr','Fra Lazio','Fra Shanghai','Fransk','Fusion','Gademad','Gastropub','Grill','Grillmad','Græsk','Hawaiiansk','Hollandsk','Hongkong','Indiansk','Indisk','Indonesisk','International','Irsk','Israelsk','Italiensk','Japansk','Kantonesisk','Kinesisk','Koreansk','Kroatisk','Latinamerikansk','Libanesisk','Malaysisk','Marokkansk','Mellemamerikansk','Mellemøstlig','Mexicansk','Middelhavsområdet','Moderne','Mongolsk','Nepalesisk','New Zealand','Norditaliensk','Norsk','Pakistansk','Persisk','Peruviansk','Pizza','Portugisisk''Pub','Russisk','Schweizisk','Siciliansk','Singaporeansk','Skandinavisk','Spansk','Specialiteter fra Beijing','Spisested','Steakhouse','Sund','Supper','Sushi','Svensk','Sydamerikansk','Syditaliensk','Szechuan','Taiwansk','Thai','Tibetansk','Toscansk','Tyrkisk','Tysk','Venezuelansk','Vietnamesisk','Vinstue','Xinjiang','Ølpub','Østeuropæisk','Østrigsk']
#most_pop = ['Café', "Italiensk", "Pizza", "Indisk", "Fisk og skaldyr", "Thai", "Steakhouse", "Mexikansk"]
#
#link_list = new_trip_df["Type of food link"]
#link_df = pd.DataFrame(link_list)
#
#food_list_1 = [] #list of list over kitchens
#for link in food_list_1:
#    food_list_1.append([x for x in køkken_list if x in link])
#    
#food_list_2 = []
#for link in food_list_1:
#    food_list_2.append([x for x in most_pop if x in link])
#food_list_2
#
#new_trip_df["Pop kitchen"] = food_list_1

In [11]:
køkken_list = ['Café', 'Afrikansk','Amerikansk','Arabisk','Argentinsk','Armensk','Aserbajdsjansk','Asiatisk','Bar','Belgisk','Brasiliansk','Britisk','Cajun og kreolsk','Cambodjansk','Canadisk','Caribisk','Centralasiatisk','Centraleuropæisk','Centralitaliensk','Dansk','Delikatesseforretning','Egyptisk','Etiopisk','Europæisk','Fastfood','Filippinsk','Fisk og skaldyr','Fra Lazio','Fra Shanghai','Fransk','Fusion','Gademad','Gastropub','Grill','Grillmad','Græsk','Hawaiiansk','Hollandsk','Hongkong','Indiansk','Indisk','Indonesisk','International','Irsk','Israelsk','Italiensk','Japansk','Kantonesisk','Kinesisk','Koreansk','Kroatisk','Latinamerikansk','Libanesisk','Malaysisk','Marokkansk','Mellemamerikansk','Mellemøstlig','Mexicansk','Middelhavsområdet','Moderne','Mongolsk','Nepalesisk','New Zealand','Norditaliensk','Norsk','Pakistansk','Persisk','Peruviansk','Pizza','Portugisisk''Pub','Russisk','Schweizisk','Siciliansk','Singaporeansk','Skandinavisk','Spansk','Specialiteter fra Beijing','Spisested','Steakhouse','Sund','Supper','Sushi','Svensk','Sydamerikansk','Syditaliensk','Szechuan','Taiwansk','Thai','Tibetansk','Toscansk','Tyrkisk','Tysk','Venezuelansk','Vietnamesisk','Vinstue','Xinjiang','Ølpub','Østeuropæisk','Østrigsk']
most_pop = ['Café', "Italiensk", "Pizza", "Indisk", "Fisk og skaldyr", "Thai", "Steakhouse", "Mexikansk"] # gathered from TripAdvisor

link_list = new_trip_df["Type of food link"]
link_df = pd.DataFrame(link_list)

list_test = [] #list of list over kitchens
for link in link_list:
    list_test.append([x for x in køkken_list if x in link])
    
link_test2 = []
for link in link_list:
    link_test2.append([x for x in most_pop if x in link])
link_test2

new_trip_df["Pop kitchen"] = link_test2

TypeError: argument of type 'float' is not iterable

Calculatin distance to our measure point.

In [12]:
distance_list = []
Kgs_Nytorv = '55.679977,12.5841893' #longitude and latitude for Kongens Nytorv

#calculating distance from nytorv to the coordinates in the list
def distance(x):
    Start = new_trip_df["Location"][x]
    Stop = Kgs_Nytorv
    distance_list.append(great_circle(Start, Stop).meters)
    
for x in trip_df.index:
    distance(x)
    
#appending to df 
new_trip_df["Distance from Kgs. Nytorv (m)"] = distance_list 
new_trip_df['Distance from Kgs. Nytorv (m)'] = new_trip_df['Distance from Kgs. Nytorv (m)'].round()

In [13]:
new_trip_df['Postal code'] = [str(x).split('København')[-1].split('Danmark')[0] for x in new_trip_df['Address']]

In [14]:
#trip_df = trip_df.sort_values(by='Ranking on list', ascending=True)
new_trip_df.to_csv("Tripadvisordata_final.csv", index=False)
