## Importing packages and attributes

In [1]:
import scraping_class
import pandas as pd
import requests,os,time, re
from datetime import datetime
from time import gmtime, strftime
import bs4 as bs
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
import urllib.request
from geopy.distance import great_circle
#import uuid

## Running of connector for HTML scraper - Created by Snorre Rasund

In [2]:
def ratelimit(delay=0.5):
    """Pause between calls so the target server is not hammered.

    Keyword arguments:
    delay -- float, seconds to sleep before returning (default 0.5).
    """
    time.sleep(delay)

class Connector():
  def __init__(self,logfile,overwrite_log=False,connector_type='requests',session=False,path2selenium='',n_tries = 5,timeout=30):
    """This class implements a method for reliable connection to the internet and monitoring.
    It handles simple errors due to connection problems, and logs a range of information for basic quality assessment.

    Keyword arguments:
    logfile -- path to the logfile
    overwrite_log -- bool, defining if logfile should be cleared (rarely the case).
    connector_type -- use the 'requests' module or 'selenium'. The two behave differently, since the selenium webdriver does not return a response object from its get method, so monitoring cannot be automated in the same way.
    session -- requests.session object. For defining custom headers and proxies.
    path2selenium -- str, sets the path to the geckodriver needed when using selenium.
    n_tries -- int, defines the number of times the *get* method will try, to avoid random connection errors.
    timeout -- int, seconds the get request will wait for the server to respond, again to avoid connection errors.
    """

    self.n_tries = n_tries # how many times a failing call is attempted before giving up.
    self.timeout = timeout # maximum time to wait for a server response.
    if connector_type=='selenium':
      assert path2selenium!='', "You need to specify the path to you geckodriver if you want to use Selenium"
      from selenium import webdriver
      ## HINT: download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases

      assert os.path.isfile(path2selenium),'You need to insert a valid path2selenium the path to your geckodriver. You can download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases'
      self.browser = webdriver.Firefox(executable_path=path2selenium) # start the browser with a path to the geckodriver.

    self.connector_type = connector_type

    if session: # use the caller's custom session when one is given
      self.session = session
    else:
      self.session = requests.session()
    self.logfilename = logfile
    ## column names of the ';'-separated log
    header = ['id','project','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']

    self.id = 0 # next call id; stays 0 for a fresh or overwritten log
    if os.path.isfile(logfile) and not overwrite_log==True:
      ## Continue numbering after the last logged call. The id is the whole
      ## first ';'-field (not just its first character), and lines that do not
      ## start with an integer (header, blank lines) are skipped.
      with open(logfile,'r') as f:
        lines = f.read().split('\n')
      for line in reversed(lines):
        try:
          self.id = int(line.split(';')[0])+1
          break
        except ValueError:
          continue
      self.log = open(logfile,'a')
    else:
      self.log = open(logfile,'w')
      self.log.write(';'.join(header))
      self.log.flush()

  def _log_row(self,row):
    """Append one ';'-joined row to the log on its own line and flush it to disk."""
    self.log.write('\n'+';'.join(map(str,row)))
    self.log.flush() # a long scrape should not lose its log if the process dies

  def get(self,url,project_name):
    """Method for connecting reliably to the internet, with multiple tries and simple error handling, as well as default logging.
    Input url and the project name for the log (i.e. is it part of mapping the domain, or is it part of the final stage in the data collection).

    Keyword arguments:
    url -- str, url
    project_name -- str, name used for analyzing the log. Use case could be the 'Mapping of domain','Meta_data_collection','main data collection'.

    Returns:
    'requests' connector -- tuple (response, call_id) for the first attempt that does not raise.
                            If all n_tries attempts raise, the method falls through and returns None.
    other connector types -- call_id only; read the page from connector.browser.page_source.
    """

    project_name = project_name.replace(';','-') # make sure the log separator is not in the project_name.
    if self.connector_type=='requests':
      for _ in range(self.n_tries): # retry loop for the requests method.
        ratelimit()
        t = time.time()
        try:
          response = self.session.get(url,timeout = self.timeout)

          err = '' # no python error on success.
          success = True
          redirect_url = response.url # current url, after potential redirects
          dt = time.time() - t # seconds spent waiting for the server and downloading content.
          size = len(response.text) # size of the html content of the response.
          response_code = response.status_code
          call_id = self.id # unique identifier for this call
          self.id+=1
          #['id','project_name','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
          row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err]
          self._log_row(row)
          return response,call_id

        except Exception as e: # log the failed attempt and try again
          err = ' '.join(str(e).splitlines()) # keep the error on one log line
          response_code = ''
          success = False
          size = 0
          redirect_url = ''
          dt = time.time() - t

          call_id = self.id
          self.id+=1

          row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err]
          self._log_row(row)
    else:
      t = time.time()
      ratelimit()
      self.browser.get(url) # use selenium get method
      call_id = self.id
      self.id+=1
      err = ''
      success = '' # selenium gives no response object, so success is left blank
      redirect_url = self.browser.current_url
      dt = time.time() - t # time for the get method ... NOTE: not necessarily the complete load time.
      size = len(self.browser.page_source) # NOTE: not necessarily correct, since selenium works in the background and could still be loading.
      response_code = ''
      row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err]
      self._log_row(row)
      # Using selenium it will not return a response object, instead you should call the browser object of the connector.
      ## connector.browser.page_source will give you the html.
      return call_id

# Path of the ';'-separated request log that Connector writes one row per call to.
logfile="tripadvisor_scraper.txt"
# Shared connector instance; with no other arguments it uses the 'requests' connector type.
connector = Connector(logfile)

## HTML scraper - 1 step
The code below collects the overview pages that list all the restaurants in the Copenhagen area.

In [3]:
url_init = 'https://www.tripadvisor.dk' # TripAdvisor base url
max_restaurant = 2324 # Number of restaurants the webpage states for the Copenhagen area; bounds the paging loop below.
init_number = 0 # result offset inserted into the search url
final_urls = []
today = datetime.now() # one snapshot, so year/month/day cannot straddle midnight
year = today.year
month = today.month
day = today.day

# The url parts below do not change between pages, so build them once.
url_search = '/RestaurantSearch-g189541-o' # first part of url
url_search2 = ('-a_date.' + str(year) + '__2D__' + str(month) + '__2D__' + str(day)
               + '-a_people.2-a_time.20%3A00%3A00-a_zur.' + str(year) + '__5F__' + str(month) + '__5F__' + str(day)
               + '-Copenhag.html') # second part of url

while init_number <= max_restaurant:
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') # timestamp in the format yyyy-mm-dd h:m:s

    url_search3 = url_init + url_search + str(init_number) + url_search2 # combined dynamic url
    data, call_id = connector.get(url_search3, 'exam_init') # fetch the overview page
    response = data.ok # True when the status code is below 400
    final_urls.append(url_search3)
    # Always assign status, so a failed page is reported instead of reusing the
    # previous value (or raising NameError on the very first page).
    status = 'Ok' if response else 'Failed'

    print('Status:', status , 'Timestamp:', timestamp, 'links processed:', len(final_urls)) # prints response, timestamp and amount of links processed.
    init_number += 30 # the search url pages through the results in steps of 30

Status: Ok Timestamp: 2019-08-27 17:43:51 links processed: 1
Status: Ok Timestamp: 2019-08-27 17:43:53 links processed: 2
Status: Ok Timestamp: 2019-08-27 17:43:54 links processed: 3
Status: Ok Timestamp: 2019-08-27 17:43:56 links processed: 4
Status: Ok Timestamp: 2019-08-27 17:43:58 links processed: 5
Status: Ok Timestamp: 2019-08-27 17:43:59 links processed: 6
Status: Ok Timestamp: 2019-08-27 17:44:01 links processed: 7
Status: Ok Timestamp: 2019-08-27 17:44:03 links processed: 8
Status: Ok Timestamp: 2019-08-27 17:44:04 links processed: 9
Status: Ok Timestamp: 2019-08-27 17:44:06 links processed: 10
Status: Ok Timestamp: 2019-08-27 17:44:08 links processed: 11
Status: Ok Timestamp: 2019-08-27 17:44:09 links processed: 12
Status: Ok Timestamp: 2019-08-27 17:44:12 links processed: 13
Status: Ok Timestamp: 2019-08-27 17:44:14 links processed: 14
Status: Ok Timestamp: 2019-08-27 17:44:16 links processed: 15
Status: Ok Timestamp: 2019-08-27 17:44:17 links processed: 16
Status: Ok Timest

### Saving to CSV file

In [4]:
# Name the column at construction time: assigning .columns afterwards raises
# a length-mismatch ValueError when final_urls is empty.
overview_urls = pd.DataFrame(final_urls, columns=['Init_links'])
#pd.options.display.max_colwidth = 200
overview_urls.to_csv('overview_urls.csv', index=False, header=True) # write the links to a csv file without the row index

## HTML scraper - 2 step
The code below takes the links generated in step 1 as input, and gives us the individual links to all the restaurants in the Copenhagen area.

In [15]:
# Reload the overview links saved in step 1. Note that this rebinds final_urls
# from the list built in step 1 to a DataFrame with an 'Init_links' column.
final_urls = pd.read_csv("overview_urls.csv")

In [16]:
page_list = []

for url in final_urls['Init_links']: # loops over all overview sites
    response,call_id = connector.get(url,'exam_links')
    # response is a Response object, so it never equals True; use .ok for the status.
    status = 'Ok' if response.ok else 'Failed'
    link_locations = response.text.split('href="')[1:] # every chunk starts right after an href="

    # Keep the part before the first space of every chunk that mentions a restaurant review.
    # NOTE(review): partitioning on a space may leave the closing quote of the href
    # attribute on the link - confirm against the saved urls.
    link_list = [i.partition(" ")[0] for i in link_locations if "Restaurant_Review" in i]

    # Drop the two alternative versions of each link (review anchor, button) so only one is stored.
    review_list = [i for i in link_list
                   if "Restaurant_Review" in i and "#REVIEWS" not in i and "button" not in i]

    # Rebuilt unconditionally for every page, so a page without matching links
    # cannot re-add the links of the previous page.
    review_list_u = set(review_list) # remove duplicates

    for i in review_list_u:
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') # timestamp in the format yyyy-mm-dd h:m:s
        page_list.append(url_init+i) # make list of final urls

        print('Status:', status , 'Timestamp:', timestamp, 'links processed:', len(page_list)) # prints response, timestamp and amount of links processed.

Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 1
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 2
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 3
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 4
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 5
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 6
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 7
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 8
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 9
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 10
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 11
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 12
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 13
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 14
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 15
Status: Ok Timestamp: 2019-08-27 18:03:11 links processed: 16
Status: Ok Timest

In [14]:
# Collect the restaurant urls gathered in step 2 into a single-column DataFrame.
indvidual_urls = pd.DataFrame(page_list)

Unnamed: 0,0
0,https://www.tripadvisor.dk/Restaurant_Review-g...
1,https://www.tripadvisor.dk/Restaurant_Review-g...
2,https://www.tripadvisor.dk/Restaurant_Review-g...
3,https://www.tripadvisor.dk/Restaurant_Review-g...
4,https://www.tripadvisor.dk/Restaurant_Review-g...
5,https://www.tripadvisor.dk/Restaurant_Review-g...
6,https://www.tripadvisor.dk/Restaurant_Review-g...
7,https://www.tripadvisor.dk/Restaurant_Review-g...
8,https://www.tripadvisor.dk/Restaurant_Review-g...
9,https://www.tripadvisor.dk/Restaurant_Review-g...


In [54]:
# Name the single column and save the links; step 3 reads this file back in.
indvidual_urls.columns = ['Restaurant_links']
indvidual_urls.to_csv('indvidual_urls.csv', index = None, header = True) # writes df to csv file

### HTML scraper - step 3
The code below takes the links from step 2 as input and retrieves all the data we ask for from the individual restaurant pages.

In [65]:
# Restaurant links produced in step 2, plus an empty frame for the per-restaurant ratings.
df_urls = pd.read_csv("indvidual_urls.csv")
ratings_df = pd.DataFrame()


# One independent, empty accumulator list per scraped field.
(loc_list, reviewCount_list, distance_list, unique_list, price_class_list,
 main_rating_list, ranking_list, price_class_value_list, type_food_list,
 extented_adress_list) = ([] for _ in range(10))

In [69]:
# Main scraping loop: one iteration per restaurant page.

# Loop invariants, built once instead of on every iteration.
p = re.compile(r'"coords":"(.*?)"') # coordinates embedded in the page source
# Cuisine labels as they appear (in Danish) on tripadvisor.dk.
kokken_list = [
    'Café', 'Afrikansk','Amerikansk','Arabisk','Argentinsk','Armensk','Aserbajdsjansk','Asiatisk','Bar','Belgisk',
    'Brasiliansk','Britisk','Cajun og kreolsk','Cambodjansk','Canadisk','Caribisk','Centralasiatisk','Centraleuropæisk',
    'Centralitaliensk','Dansk','Delikatesseforretning','Egyptisk','Etiopisk','Europæisk','Fastfood','Filippinsk',
    'Fisk og skaldyr','Fra Lazio','Fra Shanghai','Fransk','Fusion','Gademad','Gastropub','Grill','Grillmad','Græsk',
    'Hawaiiansk','Hollandsk','Hongkong','Indiansk','Indisk','Indonesisk','International','Irsk','Israelsk','Italiensk',
    'Japansk','Kantonesisk','Kinesisk','Koreansk','Kroatisk','Latinamerikansk','Libanesisk','Malaysisk','Marokkansk',
    'Mellemamerikansk','Mellemøstlig','Mexicansk','Middelhavsområdet','Moderne','Mongolsk','Nepalesisk','New Zealand',
    'Norditaliensk','Norsk','Pakistansk','Persisk','Peruviansk','Pizza',
    'Portugisisk','Pub', # a missing comma used to merge these two into 'PortugisiskPub'
    'Russisk','Schweizisk','Siciliansk','Singaporeansk','Skandinavisk','Spansk','Specialiteter fra Beijing','Spisested',
    'Steakhouse','Sund','Supper','Sushi','Svensk','Sydamerikansk','Syditaliensk','Szechuan','Taiwansk','Thai','Tibetansk',
    'Toscansk','Tyrkisk','Tysk','Venezuelansk','Vietnamesisk','Vinstue','Xinjiang','Ølpub','Østeuropæisk','Østrigsk',
]

for url in df_urls["Restaurant_links"][0:1000]:

    with ureq(url) as trip: # closes the connection even if read() fails
        trip_html = trip.read()
    trip_soup = soup(trip_html, "lxml")
    if len(trip_soup.text) > 0:
        status = 'Ok'
    else:
        status = 'Failed'
        print(status)


    # Rating labels and their bubble scores; the slicing below assumes they alternate (label, score, ...).
    test = trip_soup.find_all(True, {"class":["restaurants-detail-overview-cards-RatingsOverviewCard__ratingText--1P1Lq", "restaurants-detail-overview-cards-RatingsOverviewCard__ratingBubbles--1kQYC"]})
    name = trip_soup.find_all(True, {"class":["ui_header h1"]})
    name = str(name)
    name = name[26:] # strip the leading '[<h1 class="ui_header h1">'
    name = name.replace("</h1>]", '')

    elements = [str(x) for x in test]

    keys = elements[0::2]
    values = elements[1::2]

    keys[:] = [s.replace('<span class="restaurants-detail-overview-cards-RatingsOverviewCard__ratingText--1P1Lq">', '') for s in keys]
    keys[:] = [s.replace('</span>', '') for s in keys]
    values[:] = [s.replace('<span class="restaurants-detail-overview-cards-RatingsOverviewCard__ratingBubbles--1kQYC"><span class="ui_bubble_rating bubble_', '') for s in values]
    values[:] = [s.replace('"></span></span>', '') for s in values]

    # zip() pairs labels with scores and ignores an unpaired trailing label instead of
    # raising IndexError; the name is added afterwards so it can never be mispaired.
    ratings_dict = dict(zip(keys, values))
    ratings_dict["Name"] = str(name)

    # DataFrame.append was removed in pandas 2.0; concat gives the same row-wise growth.
    ratings_df = pd.concat([ratings_df, pd.DataFrame([ratings_dict])], ignore_index=True)

    # Location: the second "coords" match in the page source is used.
    r = requests.get(url)
    try:
        coords = p.findall(r.text)[1]
    except IndexError:
        # Same placeholder as the other fields, so one page without coordinates
        # does not abort the whole run and leave the lists out of sync.
        coords = 'NaN'
    loc_list.append(coords)

    # Review count
    # `except Exception` (not a bare except) so Ctrl-C still stops the loop.
    try:
        reviewCount = str(trip_soup.find(class_="reviewCount"))
        reviewCount = reviewCount.split(">")[1].split("<")[0]
        reviewCount_list.append(reviewCount)
    except Exception:
        reviewCount_list.append('NaN')

    # Extracting price class ($ symbols)
    try:
        price_class_number = str(trip_soup.find('div', class_="header_links"))
        price_class = re.sub('[^$-]', '', price_class_number)
        price_class_list.append(price_class)
    except Exception:
        price_class_list.append('NaN')

    # Extracting number of bubbles (overall rating)
    try:
        bubbles = str(trip_soup.find(class_="restaurants-detail-overview-cards-RatingsOverviewCard__overallRating--nohTl"))
        main_rating = re.sub('[^0-9,.]', '', bubbles) # strip everything but the rating digits
        main_rating_list.append(main_rating)
    except Exception:
        main_rating_list.append('NaN')

    # Extracting ranking on the list
    try:
        list_ranking =str(trip_soup.find(class_="restaurants-detail-overview-cards-RatingsOverviewCard__ranking--17CmN").find('span', class_=""))
        ranking = re.sub('[^0-9,]', '', list_ranking)
        ranking_list.append(ranking)
    except Exception:
        ranking_list.append('NaN')

    # Extracting price range
    try:
        price_class_value = str(trip_soup.find(class_="restaurants-detail-overview-cards-DetailsSectionOverviewCard__tagText--1OH6h"))
        price_class_value = re.sub('[^0-9,.-]', '', price_class_value)
        price_class_value_list.append(price_class_value)
    except Exception:
        price_class_value_list.append('NaN')

    # Extracting type of food
    try:
        type_food = str(trip_soup.find(class_="prw_rup prw_restaurants_restaurant_detail_tags tagsContainer").find(class_="header_links"))
        for i in kokken_list:
            if i in type_food:
                type_food = i # first label (in list order) found in the markup wins
                break
        # NOTE(review): when no label matches, the raw markup is stored unchanged.
        type_food_list.append(type_food)
    except Exception:
        type_food_list.append('NaN')

    # Extracting address
    try:
        extented_adress = str(trip_soup.find(class_="restaurants-detail-overview-cards-LocationOverviewCard__detailLinkText--co3ei"))
        extented_adress = extented_adress.split(">")[1].split("<")[0]
        extented_adress_list.append(extented_adress)
    except Exception:
        extented_adress_list.append('NaN')


    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    print('Status:', status , 'Timestamp:', timestamp, 'pages scraped:', len(ratings_df)) # prints response, timestamp and amount of links processed.


# After the loop: attach the per-field lists as columns.
ratings_df["Location"] = loc_list
ratings_df["Number of reviews"] = reviewCount_list
ratings_df["Price class"] = price_class_list
ratings_df["Main rating"] = main_rating_list
ratings_df["Ranking on list"] = ranking_list
ratings_df["Price range"] = price_class_value_list
ratings_df["Type of food"] = type_food_list
ratings_df["Address"] = extented_adress_list

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Status: Ok Timestamp: 2019-08-28 06:23:40 pages scraped: 11
Status: Ok Timestamp: 2019-08-28 06:23:44 pages scraped: 12
Status: Ok Timestamp: 2019-08-28 06:23:48 pages scraped: 13
Status: Ok Timestamp: 2019-08-28 06:23:51 pages scraped: 14
Status: Ok Timestamp: 2019-08-28 06:23:55 pages scraped: 15
Status: Ok Timestamp: 2019-08-28 06:24:00 pages scraped: 16
Status: Ok Timestamp: 2019-08-28 06:24:06 pages scraped: 17
Status: Ok Timestamp: 2019-08-28 06:24:10 pages scraped: 18
Status: Ok Timestamp: 2019-08-28 06:24:14 pages scraped: 19
Status: Ok Timestamp: 2019-08-28 06:24:18 pages scraped: 20


In [68]:
# Snapshot ratings_df into trip_data (an independent copy) and display it.
trip_data = ratings_df.copy()
#trip_data['Type of food']
trip_data

Unnamed: 0,God pris,Mad,Name,Service,Stemning,Location,Number of reviews,Price class,Main rating,Ranking on list,Price range,Type of food,Address
0,35,40,Restaurant BASALT,40,,"55.63971,12.577826",176 anmeldelser,--$$-$$$---,40,206,------1694.-396.,Dansk,"Center Boulevard 5, København 2300 Danmark"
1,45,45,Madenitaly,45,,"55.67844,12.589964",76 anmeldelser,--$---------,45,203,------1647.-107.,Europæisk,"Holbergsgade 22, st. th Nyhavn, København 1057..."
2,45,45,Duck and Cover,45,40.0,"55.671978,12.552464",92 anmeldelser,--$$-$$$,45,190,------16,"<div class=""header_links""><a href=""/Restaurant...","Dannebrogsgade 6, Basement, København Danmark"
3,40,40,Pintxos,40,40.0,"55.684498,12.566181",475 anmeldelser,--$$-$$$---------,40,201,"------16,,",Europæisk,"Nansensgade 63, København 1366 Danmark"
4,40,40,San Marco Junior,45,,"55.67917,12.5336",122 anmeldelser,--$$-$$$------,45,11,------16369.-5.029.,Centraleuropæisk,"Falkoner Allé 10 Over for falkonersalen, Frede..."
5,40,45,FAMO,45,45.0,"55.672592,12.550527",351 anmeldelser,--$$-$$$------,40,195,------16201.-382.,Europæisk,"Saxogade 3, København 1662 Danmark"
6,45,50,Døp,45,40.0,"55.681282,12.576093",93 anmeldelser,--$---,45,186,"------16,",Dansk,Den Økologiske Pølsemand Next to Rundetårn in ...
7,35,40,Restaurant Brdr. Price,40,40.0,"55.683926,12.574471",443 anmeldelser,--$$-$$$---------,40,199,------16201.-503.,Europæisk,"Rosenborggade 15, København 1130 Danmark"
8,35,40,Laundromat Cafe,40,40.0,"55.689873,12.558576",261 anmeldelser,--$$-$$$------,40,200,"------16,,,",Café,"15 Elmegade, København Danmark"
9,45,45,Sale é Pepe,45,45.0,"55.680313,12.521658",97 anmeldelser,--$$-$$$---------,45,10,"------16,",Europæisk,"Ndr. Fasanvej 9, Frederiksberg, København 2000..."


In [220]:
# Save the raw scrape for this batch of links.
# NOTE(review): the file name encodes a link range - confirm it matches the slice used in the scraping loop.
trip_data.to_csv("Tripadvisordata_raw_1200_1300.csv", index=False)

#ratings_df = pd.read_csv("Tripadvisordata_raw_xxx.csv")
#trip_data.head()

In [221]:
#ratings_df = ratings_df.drop(['Distance from Kgs. Nytorv'], axis=1)

In [199]:
# Preview the first rows before the distance column is added.
ratings_df.head()

Unnamed: 0,God pris,Mad,Name,Service,Stemning,Location,Number of reviews,Price class,Main rating,Ranking on list,Price range
0,40.0,40.0,Bistro Central,45.0,,"55.68168,12.582247",50 anmeldelser,--$$-$$$------,40,973,------1654.-448.
1,35.0,40.0,Louises Fiskebar,45.0,,"55.677296,12.567593",20 anmeldelser,--$$$$---,45,987,------1674.-696.
2,35.0,35.0,Café René,40.0,40.0,"55.676224,12.56407",52 anmeldelser,--$$-$$$---------,40,980,"------16,,,"
3,40.0,45.0,Herkules Pavillonen,45.0,,"55.68627,12.580382",10 anmeldelser,--$$-$$$,45,981,
4,35.0,35.0,Galathea Kroen,35.0,,"55.6766,12.57455",34 anmeldelser,--$$-$$$------,40,964,------16120.-221.


In [200]:
# Reset the accumulator that distance() appends to.
distance_list = []

In [201]:
Kgs_Nytorv = '55.679977,12.5841893' # "latitude,longitude" of Kongens Nytorv

def distance(x):
    """Append the great-circle distance in metres from row x to Kongens Nytorv.

    x -- index label of a row in ratings_df; its "Location" is a "lat,lon" string.

    The result is appended to the module-level distance_list. Rows without a
    usable location (the 'NaN' placeholder, or a non-string value) get NaN.
    """
    Start = ratings_df["Location"][x]
    Stop = Kgs_Nytorv
    if not isinstance(Start, str) or Start == 'NaN':
        distance_list.append(float('nan'))
        return
    distance_list.append(great_circle(Start, Stop).meters)

# Start from an empty list here as well, so re-running this cell cannot
# accumulate stale entries and break the column assignment below.
distance_list = []
for x in ratings_df.index:
    distance(x)

# attach the distances as a new column
ratings_df["Distance from Kgs. Nytorv"] = distance_list

In [202]:
# Preview the first rows, now including the distance column.
ratings_df.head()

Unnamed: 0,God pris,Mad,Name,Service,Stemning,Location,Number of reviews,Price class,Main rating,Ranking on list,Price range,Distance from Kgs. Nytorv
0,40.0,40.0,Bistro Central,45.0,,"55.68168,12.582247",50 anmeldelser,--$$-$$$------,40,973,------1654.-448.,225.136269
1,35.0,40.0,Louises Fiskebar,45.0,,"55.677296,12.567593",20 anmeldelser,--$$$$---,45,987,------1674.-696.,1082.378143
2,35.0,35.0,Café René,40.0,40.0,"55.676224,12.56407",52 anmeldelser,--$$-$$$---------,40,980,"------16,,,",1328.647
3,40.0,45.0,Herkules Pavillonen,45.0,,"55.68627,12.580382",10 anmeldelser,--$$-$$$,45,981,,739.334846
4,35.0,35.0,Galathea Kroen,35.0,,"55.6766,12.57455",34 anmeldelser,--$$-$$$------,40,964,------16120.-221.,711.505212


In [203]:
# Keep and order the columns used in the analysis.
ratings_df = ratings_df[['Name', 'Main rating', 'Ranking on list', 'Price range', 'Price class', 'Number of reviews', 'Location', 'Distance from Kgs. Nytorv' , 'God pris', 'Mad', 'Service', 'Stemning']]
# Names are taken from raw markup, so HTML-escaped ampersands are restored here.
ratings_df = ratings_df.replace(regex=['&amp;'], value='&')
# Decimal comma -> decimal point in the overall rating.
ratings_df['Main rating'] = ratings_df['Main rating'].replace(regex=[','], value='.')
# The same replacement on `ratings_df_new` was dropped: that name is never
# defined in this notebook, so the line raised NameError on a fresh run.
ratings_df['Distance from Kgs. Nytorv'] = ratings_df['Distance from Kgs. Nytorv'].round() # whole metres

In [204]:
# Divide the sub-ratings by 10 and store them under English column names;
# 'Service' keeps its name and is rescaled in place.
for english_name, source_name in (('Good price', 'God pris'),
                                  ('Food', 'Mad'),
                                  ('Service', 'Service'),
                                  ('Atmosphere', 'Stemning')):
    ratings_df[english_name] = ratings_df[source_name] / 10
# The Danish-named source columns are no longer needed.
ratings_df = ratings_df.drop(columns=['God pris', 'Mad', 'Stemning'])

In [205]:
# Save the cleaned batch and preview it.
# NOTE(review): the file name encodes a link range - confirm it matches the scraped slice.
ratings_df.to_csv("Tripadvisordata_1000_1200.csv", index=False)
ratings_df.head()

Unnamed: 0,Name,Main rating,Ranking on list,Price range,Price class,Number of reviews,Location,Distance from Kgs. Nytorv,Service,Good price,Food,Atmosphere
0,Bistro Central,4.0,973,------1654.-448.,--$$-$$$------,50 anmeldelser,"55.68168,12.582247",225.0,4.5,4.0,4.0,
1,Louises Fiskebar,4.5,987,------1674.-696.,--$$$$---,20 anmeldelser,"55.677296,12.567593",1082.0,4.5,3.5,4.0,
2,Café René,4.0,980,"------16,,,",--$$-$$$---------,52 anmeldelser,"55.676224,12.56407",1329.0,4.0,3.5,3.5,4.0
3,Herkules Pavillonen,4.5,981,,--$$-$$$,10 anmeldelser,"55.68627,12.580382",739.0,4.5,4.0,4.5,
4,Galathea Kroen,4.0,964,------16120.-221.,--$$-$$$------,34 anmeldelser,"55.6766,12.57455",712.0,3.5,3.5,3.5,


In [215]:
# Merge the two cleaned batches into one file.
ratings_df_1 = pd.read_csv("Tripadvisordata_1000.csv")
ratings_df_2 = pd.read_csv("Tripadvisordata_1000_1200.csv")
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
ratings_df_1 = pd.concat([ratings_df_1, ratings_df_2])
ratings_df_1.to_csv("Tripadvisordata_1200.csv", index=False)