## Importing packages and attributes

In [1]:
import pandas as pd
import re
from geopy.distance import great_circle

## Data Processing

In this notebook we process the raw data scraped in our *`TripAdvisor_scraper.ipynb`* notebook. We start by loading the CSV file we created there.

In [2]:
# Read the raw scrape into a DataFrame.
raw_csv_path = "Tripadvisordata_raw.csv"
trip_df = pd.read_csv(raw_csv_path)
trip_df.tail()  # show the last rows as a quick check that the read went well

Unnamed: 0,God pris,Mad,Restaurant,Service,Stemning,Location,Number of reviews,Price class,Main rating,Ranking on list,Price range,Type of food,Address
95,45,50,Alchemist,50,,"55.693714,12.613528",68,--$$$$---,50,108,2.501,"<div class=""header_links""><a href=""/Restaurant...","Refshalevej 173C, København 1432 Danmark"
96,45,45,District Tonkin - Bánh Mí,40,,"55.683544,12.587593",212,--$---------,45,113,121.0,"<div class=""header_links""><a href=""/Restaurant...","Dronningens Tvaergade 12, København 1302 Danmark"
97,40,45,Restaurant Kanalen,40,45.0,"55.67551,12.593101",416,--$$$$---,40,122,,"<div class=""header_links""><a href=""/Restaurant...","Wilders Plads 2, København 1403 Danmark"
98,45,45,Sidecar,45,,"55.687107,12.549084",132,--$$-$$$------,45,107,295.0,"<div class=""header_links""><a href=""/Restaurant...","Skyttegade 5, København 2200 Danmark"
99,45,45,56°,45,50.0,"55.684166,12.610067",208,--$$$$------,45,120,248.0,"<div class=""header_links""><a href=""/Restaurant...","Krudtløbsvej 8, København 1439 Danmark"


### Restructuring data

In [3]:
distance_list = []
Kgs_Nytorv = '55.679977,12.5841893'  # "latitude,longitude" of Kongens Nytorv


def distance(x):
    """Compute the great-circle distance in metres from one row to Kongens Nytorv.

    Parameters
    ----------
    x : index label
        Label of the row in ``trip_df`` whose "Location" value is measured.

    Returns
    -------
    float
        Distance in metres, or NaN when the row has no "Location" value.

    Side effect: the same value is appended to the module-level
    ``distance_list``, so the list is filled in the order labels are passed in.
    """
    start = trip_df.loc[x, "Location"]
    if pd.isna(start):
        # geopy cannot build a point from a missing value, which would abort
        # the whole cell; record NaN for that row instead.
        meters = float("nan")
    else:
        # Both points are "lat,lon" strings and are handed to geopy unchanged.
        meters = great_circle(start, Kgs_Nytorv).meters
    distance_list.append(meters)
    return meters


for x in trip_df.index:
    distance(x)

# The list was filled in trip_df.index order, so positional assignment lines up.
trip_df["Distance from Kgs. Nytorv (m)"] = distance_list

In [4]:
# Columns kept in the restructured frame, in their final order.
kept_columns = [
    'Restaurant', 'Main rating', 'Ranking on list', 'Price range',
    'Price class', 'Location', 'Distance from Kgs. Nytorv (m)',
    'Good price', 'Food', 'Service', 'Atmosphere', 'Type of food',
    'Number of reviews', 'Address',
]

# NOTE(review): 'Price range' values such as 2.501 in the displayed output look
# like thousands-separated amounts that were parsed as decimals; confirm upstream.
trip_df = (
    trip_df
    # Turn the HTML-escaped ampersand back into '&' wherever it occurs.
    .replace(regex=['&amp;'], value='&')
    .assign(**{
        # Swap ',' for '.' in the rating (presumably decimal commas; verify).
        'Main rating': lambda df: df['Main rating'].replace(regex=[','], value='.'),
        'Distance from Kgs. Nytorv (m)': lambda df: df['Distance from Kgs. Nytorv (m)'].round(),
        # Sub-scores are divided by 10 and stored under English column names.
        'Good price': lambda df: df['God pris'] / 10,
        'Food': lambda df: df['Mad'] / 10,
        'Service': lambda df: df['Service'] / 10,
        'Atmosphere': lambda df: df['Stemning'] / 10,
    })
)
trip_df = trip_df[kept_columns]

In [5]:
# Display the first five rows of trip_df.
trip_df.head()

Unnamed: 0,Restaurant,Main rating,Ranking on list,Price range,Price class,Location,Distance from Kgs. Nytorv (m),Good price,Food,Service,Atmosphere,Type of food,Number of reviews,Address
0,Restaurant Grønnegade,4.5,28,664.0,--$$$$---,"55.681705,12.583386",199.0,4.5,4.5,4.5,4.5,"<div class=""header_links""><a href=""/Restaurant...",525,"Grønnegade 39, København 1107 Danmark"
1,Basso København,4.5,18,496.0,--$$-$$$---------,"55.68388,12.58657",459.0,4.5,4.5,4.5,,"<div class=""header_links""><a href=""/Restaurant...",345,"Dronningens Tvaergade 22, København 1302 Danmark"
2,Enomania,4.5,1,,--$$-$$$------,"55.670773,12.531963",3431.0,4.5,5.0,5.0,4.5,"<div class=""header_links""><a href=""/Restaurant...",245,"Vesterbrogade 187, Frederiksberg, København 18..."
3,Restaurant Krebsegaarden,5.0,2,396.0,--$$$$---,"55.67872,12.569877",908.0,4.5,5.0,5.0,4.5,"<div class=""header_links""><a href=""/Restaurant...",1311,"Studiestraede 17, København 1455 Danmark"
4,The Pescatarian,5.0,6,597.0,--$$$$------,"55.68799,12.596316",1171.0,4.5,5.0,5.0,,"<div class=""header_links""><a href=""/Restaurant...",201,Amaliegade 49 On the corner of Amaliegade & Es...


In [6]:
# Order the rows by list ranking, lowest number first (ascending is the
# sort_values default), then write the frame to disk without the index column.
trip_df = trip_df.sort_values('Ranking on list')
trip_df.to_csv("Tripadvisordata_200.csv", index=False)


In [17]:
# Display the first 30 rows of trip_df.
trip_df.head(30)

Unnamed: 0,Restaurant,Main rating,Ranking on list,Price range,Price class,Location,Distance from Kgs. Nytorv (m),Good price,Food,Service,Atmosphere,Type of food,Number of reviews,Address,Postal code
22,Grams Lækkerier,5.0,1,101.0,--$---------,"55.671036,12.562724",1673.0,4.5,5.0,5.0,4.5,"<div class=""header_links""><a href=""/Restaurant...",721,"Halmtorvet 1, København 1700 Danmark",11700
2,Enomania,4.5,1,,--$$-$$$------,"55.670773,12.531963",3431.0,4.5,5.0,5.0,4.5,"<div class=""header_links""><a href=""/Restaurant...",245,"Vesterbrogade 187, Frederiksberg, København 18...","187,,1800"
3,Restaurant Krebsegaarden,5.0,2,396.0,--$$$$---,"55.67872,12.569877",908.0,4.5,5.0,5.0,4.5,"<div class=""header_links""><a href=""/Restaurant...",1311,"Studiestraede 17, København 1455 Danmark",171455
6,Mielcke & Hurtigkarl,4.5,2,2.615,--$$$$---,"55.67468,12.530938",3390.0,4.5,4.5,4.5,4.5,"<div class=""header_links""><a href=""/Restaurant...",253,"Fr.Berg Runddel 1, Frederiksberg, København 20...","1,,2000"
20,Frederiks Have,4.5,3,536.0,--$$$$---,"55.679256,12.524991",3712.0,4.5,4.5,4.5,4.5,"<div class=""header_links""><a href=""/Restaurant...",529,"Smallegade 41, Frederiksberg, København 2000 D...","41,,2000"
14,The Olive Kitchen & Bar,5.0,3,174.0,--$$-$$$---------,"55.68125,12.571284",821.0,4.5,5.0,5.0,,"<div class=""header_links""><a href=""/Restaurant...",2141,"Nørregade 22, København 1165 Danmark",221165
17,Søllerød Kro,5.0,4,1.797,--$$$$------,"55.813576,12.495031",15869.0,4.5,5.0,5.0,5.0,"<div class=""header_links""><a href=""/Restaurant...",337,"Soelleroedvej 35 DK-2840 Holte, København 2840...",3528402840
29,Zahida at Kellerdirk,5.0,5,248.0,--$$-$$$---------,"55.67162,12.546196",2557.0,4.5,5.0,5.0,,"<div class=""header_links""><a href=""/Restaurant...",879,Frederiksberg Allé 102 We are located inside t...,1021820
32,Bastard Café,4.5,6,,--$------,"55.67645,12.574973",698.0,4.5,4.0,4.5,,"<div class=""header_links""><a href=""/Restaurant...",242,"Rådhusstræde 13, København 1466 Danmark",131466
4,The Pescatarian,5.0,6,597.0,--$$$$------,"55.68799,12.596316",1171.0,4.5,5.0,5.0,,"<div class=""header_links""><a href=""/Restaurant...",201,Amaliegade 49 On the corner of Amaliegade & Es...,491256


In [25]:
# Pull the four-digit postal code that follows the city name in each address,
# e.g. "Halmtorvet 1, København 1700 Danmark" -> "1700".
# Matching "København" followed by four digits (rather than splitting on the
# words) keeps the value free of surrounding spaces, ignores street names that
# merely start with "København", and yields NaN instead of raising IndexError
# when an address does not contain the pattern.
trip_df['Postal code'] = (
    trip_df['Address']
    .astype(str)
    .str.extract(r'København\s*(\d{4})\b', expand=False)
)
trip_df.head()

Unnamed: 0,Restaurant,Main rating,Ranking on list,Price range,Price class,Location,Distance from Kgs. Nytorv (m),Good price,Food,Service,Atmosphere,Type of food,Number of reviews,Address,Postal code
22,Grams Lækkerier,5.0,1,101.0,--$---------,"55.671036,12.562724",1673.0,4.5,5.0,5.0,4.5,"<div class=""header_links""><a href=""/Restaurant...",721,"Halmtorvet 1, København 1700 Danmark",1700
2,Enomania,4.5,1,,--$$-$$$------,"55.670773,12.531963",3431.0,4.5,5.0,5.0,4.5,"<div class=""header_links""><a href=""/Restaurant...",245,"Vesterbrogade 187, Frederiksberg, København 18...",1800
3,Restaurant Krebsegaarden,5.0,2,396.0,--$$$$---,"55.67872,12.569877",908.0,4.5,5.0,5.0,4.5,"<div class=""header_links""><a href=""/Restaurant...",1311,"Studiestraede 17, København 1455 Danmark",1455
6,Mielcke & Hurtigkarl,4.5,2,2.615,--$$$$---,"55.67468,12.530938",3390.0,4.5,4.5,4.5,4.5,"<div class=""header_links""><a href=""/Restaurant...",253,"Fr.Berg Runddel 1, Frederiksberg, København 20...",2000
20,Frederiks Have,4.5,3,536.0,--$$$$---,"55.679256,12.524991",3712.0,4.5,4.5,4.5,4.5,"<div class=""header_links""><a href=""/Restaurant...",529,"Smallegade 41, Frederiksberg, København 2000 D...",2000
