In [2]:
import os
import requests
import json
import re
import sys
import lxml.html 
from lxml import html
from lxml import etree
import csv
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

print('All required libraries have been successfully imported.')

All required libraries have been successfully imported.


In [3]:
def download_only_page(url, filename):
    """""
    Input: 
    -----
        url- URL of the webpage to be downloaded to local file system.
        filename- Desired filename to be associated with the downloaded webpage.
    
    Output:
    ------
        return nothing.
    
    Functionality:
    -------------
        Request for the webpage based on URL and download it to a local directory.
    """""
    # Request to download the html page
    r = requests.get(url)
    
    # Save downloaded page as a text file
    with open(filename, mode='wb') as file:     
        file.write(r.content)

    print('TripAdvisor- hotel HTML page downloaded successfully..')

In [4]:
def download_html_page(url, count_file):
    """""
    Input: 
    -----
        url- URL of the webpage to be downloaded to local file system.
        count_file- file number or file count, used to create a filename to be stored in local directory.
    
    Output:
    ------
        return lxml.etree instance(html) and the updated count aka file number.
    
    Functionality:
    -------------
        - Request for the webpage based on URL and download it to a local directory.
        - Create lxml.etree instance of the webpage downloaded, inorder to help parse it.
    """""
     # Request to download the html page
    r = requests.get(url)
    count_file += 1
    # Save downloaded page as a text file
    filename = 'file:///home/atharvas/Desktop/NIAGARA.html'+ str(count_file) + '.txt'
    with open(filename, mode='wb') as file:     
        file.write(r.content)

    print('TripAdvisor- hotel HTML page downloaded successfully..')
    
    # Open saved file to parse it.
    with open(filename,'r') as fileread:
        html = etree.HTML(fileread.read())
    
    # Parse the HTML page as a tree structure
    result = etree.tostring(html, pretty_print=True, method="html")
    print('File read successfully..')
    
    return html, count_file

In [5]:
def read_parse_file(filename):
    """""
    Input: 
    -----
        filename- Filename used to store the desired webpage in local directory.
    
    Output:
    ------
        return lxml.etree instance(html) and the updated count/file number.
    
    Functionality:
    -------------
        - Read the desired file from local directory.
        - Create lxml.etree instance of the webpage downloaded inorder to help parse it.
    """""
    # Open saved file to parse it.
    with open(filename,'r') as fileread:
        html = etree.HTML(fileread.read())
    
    # Parse the HTML page as a tree structure
    result = etree.tostring(html, pretty_print=True, method="html")
    print('File read successfully..')
    
    return html

In [6]:
def get_hotel_url(count, html):
    """""
    Input: 
    -----
        count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
        html- lxml.etree instance(html), used to parse the file.
    
    Output:
    ------
        return the extracted full hotel URL(hotel_url) from the base webpage.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
        - This function considers only exact search results for a given category of hotels.
    """""
    hotel_url = []
    double_count = 0
    num_ad = 2 # to counter ads intbetween
    iter_val = 0
    
    # Parse html page to get the urls for nested webpages.
    for element in range(int(count)):
        iter_val = element + 1
        # Adjust numbering based on webpage structure.
        if iter_val <= 4 :
            iter_val = iter_val
        elif iter_val == int(count):
            iter_val += int(double_count * num_ad) 
        else:
            if (iter_val % 5 == 0) & (iter_val != int(count)):
                double_count += 1
                iter_val += int(double_count * num_ad)
            else:
                iter_val += int(double_count * num_ad)
                
        # Use xpath to retrieve the necessary content.
        XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div/div[' + str(iter_val) + ']/div/div[1]/@data-url')

        if str(XPATH)[2:-2] == '':
            XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div[' + str(iter_val) + ']/div/div[1]/@data-url')
        
        if str(XPATH)[2:-2] == '':
            XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div/div[' + str(iter_val) + ']/div/div/div/div[1]/@data-url')
        
        hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
    return hotel_url

In [7]:
def get_hotel_url_related(count, html):
    """""
    Input: 
    -----
        count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
        html- lxml.etree instance(html), used to parse the file.
    
    Output:
    ------
        return the extracted full hotel URL(hotel_url) from the base webpage.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
        - This function considers only related search results (if exact results are nil, this will be called)
          for a given category of hotels.
    """""
    hotel_url = []
    double_count = 0
    num_ad = 2 # to counter ads intbetween
    iter_val = 0
    
    # Parse html page to get the urls for nested webpages.
    for element in range(int(count)):
        iter_val = element + 1
        # Adjust numbering based on webpage structure.
        if iter_val <= 4 :
            iter_val = iter_val
        elif iter_val == int(count):
            iter_val += int(double_count * num_ad) 
        else:
            if (iter_val % 5 == 0) & (iter_val != int(count)):
                double_count += 1
                iter_val += int(double_count * num_ad)
            else:
                iter_val += int(double_count * num_ad)
        value = iter_val + 1
    
        # Use xpath to retrieve the necessary content.
        XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_filtered_out_hotels_0"]/div/div[' + str(value) + ']/div/div[1]/@data-url')
        
        if str(XPATH)[2:-2] == '':
            XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_filtered_out_hotels_0"]/div[' + str(value) + ']/div/div[1]/@data-url')

        hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
    return hotel_url

In [8]:
def get_hotel_url_nomatches(count, html):
    """""
    Input: 
    -----
        count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
        html- lxml.etree instance(html), used to parse the file.
    
    Output:
    ------
        return the extracted full hotel URL(hotel_url) from the base webpage.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
        - This function considers only other search results (if exact results and related results for a page are nil, 
          this will be called) for a given category of hotels.
    """""
    hotel_url = []
    double_count = 0
    num_ad = 2 # to counter ads intbetween
    iter_val = 0
    
    # Parse html page to get the urls for nested webpages.
    for element in range(int(count)):
        iter_val = element + 1
        # Adjust numbering based on webpage structure.
        if iter_val <= 4 :
            iter_val = iter_val
        elif iter_val == int(count):
            iter_val += int(double_count * num_ad) 
        else:
            if (iter_val % 5 == 0) & (iter_val != int(count)):
                double_count += 1
                iter_val += int(double_count * num_ad)
            else:
                iter_val += int(double_count * num_ad)
    
        # Use xpath to retrieve the necessary content.
        XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_ab_hotels_sponsored_0"]/div[' + str(iter_val) + ']/div/div[1]/@data-url')
        
        if str(XPATH)[2:-2] == '':
            XPATH =  html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_ab_hotels_sponsored_0"]/div[' + str(iter_val) + ']/div/div/div/div[1]/@data-url')

        hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
    return hotel_url

In [9]:
# Extract related content from the tree using XPath for the MainPage of TripAdvisor hotels canada.
count_file = 0

# Request html page
url = str('https://www.tripadvisor.ca/Hotels-g153339-Canada-Hotels.html')
html, count_file = download_html_page(url, count_file)

# Create xpath to access necessary content
XPATH_MAINPAGE = '//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"]'
hotel_main_page = html.xpath(XPATH_MAINPAGE)

# Get count of elements of interest in html page
count = html.xpath('count(//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"])')
# print(count)

# Get the parsed html etree
hotel_url_returned = get_hotel_url(count, html)
print('Content Extracted..')

FileNotFoundError: [Errno 2] No such file or directory: 'file:///home/atharvas/Desktop/NIAGARA.html1.txt'

In [None]:
# # import libraries
# import os
# import requests
# import json
# import re
# import sys
# import lxml.html 
# from lxml import html
# from lxml import etree
# import csv
# import pandas as pd
# import numpy as np
# from geopy.geocoders import Nominatim

# print('All required libraries have been successfully imported.')
# All required libraries have been successfully imported.
# In [4]:
# def download_only_page(url, filename):
#     """""
#     Input: 
#     -----
#         url- URL of the webpage to be downloaded to local file system.
#         filename- Desired filename to be associated with the downloaded webpage.
    
#     Output:
#     ------
#         return nothing.
    
#     Functionality:
#     -------------
#         Request for the webpage based on URL and download it to a local directory.
#     """""
#     # Request to download the html page
#     r = requests.get(url)
    
#     # Save downloaded page as a text file
#     with open(filename, mode='wb') as file:     
#         file.write(r.content)

#     print('TripAdvisor- hotel HTML page downloaded successfully..')
# In [5]:
# def download_html_page(url, count_file):
#     """""
#     Input: 
#     -----
#         url- URL of the webpage to be downloaded to local file system.
#         count_file- file number or file count, used to create a filename to be stored in local directory.
    
#     Output:
#     ------
#         return lxml.etree instance(html) and the updated count aka file number.
    
#     Functionality:
#     -------------
#         - Request for the webpage based on URL and download it to a local directory.
#         - Create lxml.etree instance of the webpage downloaded, inorder to help parse it.
#     """""
#      # Request to download the html page
#     r = requests.get(url)
#     count_file += 1
#     # Save downloaded page as a text file
#     filename = 'Project-Dataset/tripadvisor_canada_hotels'+ str(count_file) + '.txt'
#     with open(filename, mode='wb') as file:     
#         file.write(r.content)

#     print('TripAdvisor- hotel HTML page downloaded successfully..')
    
#     # Open saved file to parse it.
#     with open(filename,'r') as fileread:
#         html = etree.HTML(fileread.read())
    
#     # Parse the HTML page as a tree structure
#     result = etree.tostring(html, pretty_print=True, method="html")
#     print('File read successfully..')
    
#     return html, count_file
# In [8]:
# def read_parse_file(filename):
#     """""
#     Input: 
#     -----
#         filename- Filename used to store the desired webpage in local directory.
    
#     Output:
#     ------
#         return lxml.etree instance(html) and the updated count/file number.
    
#     Functionality:
#     -------------
#         - Read the desired file from local directory.
#         - Create lxml.etree instance of the webpage downloaded inorder to help parse it.
#     """""
#     # Open saved file to parse it.
#     with open(filename,'r') as fileread:
#         html = etree.HTML(fileread.read())
    
#     # Parse the HTML page as a tree structure
#     result = etree.tostring(html, pretty_print=True, method="html")
#     print('File read successfully..')
    
#     return html
# In [7]:
# def get_hotel_url(count, html):
#     """""
#     Input: 
#     -----
#         count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
#         html- lxml.etree instance(html), used to parse the file.
    
#     Output:
#     ------
#         return the extracted full hotel URL(hotel_url) from the base webpage.
    
#     Functionality:
#     -------------
#         - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
#         - This function considers only exact search results for a given category of hotels.
#     """""
#     hotel_url = []
#     double_count = 0
#     num_ad = 2 # to counter ads intbetween
#     iter_val = 0
    
#     # Parse html page to get the urls for nested webpages.
#     for element in range(int(count)):
#         iter_val = element + 1
#         # Adjust numbering based on webpage structure.
#         if iter_val <= 4 :
#             iter_val = iter_val
#         elif iter_val == int(count):
#             iter_val += int(double_count * num_ad) 
#         else:
#             if (iter_val % 5 == 0) & (iter_val != int(count)):
#                 double_count += 1
#                 iter_val += int(double_count * num_ad)
#             else:
#                 iter_val += int(double_count * num_ad)
                
#         # Use xpath to retrieve the necessary content.
#         XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div/div[' + str(iter_val) + ']/div/div[1]/@data-url')

#         if str(XPATH)[2:-2] == '':
#             XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div[' + str(iter_val) + ']/div/div[1]/@data-url')
        
#         if str(XPATH)[2:-2] == '':
#             XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div/div[' + str(iter_val) + ']/div/div/div/div[1]/@data-url')
        
#         hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
#     return hotel_url
# In [8]:
# def get_hotel_url_related(count, html):
#     """""
#     Input: 
#     -----
#         count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
#         html- lxml.etree instance(html), used to parse the file.
    
#     Output:
#     ------
#         return the extracted full hotel URL(hotel_url) from the base webpage.
    
#     Functionality:
#     -------------
#         - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
#         - This function considers only related search results (if exact results are nil, this will be called)
#           for a given category of hotels.
#     """""
#     hotel_url = []
#     double_count = 0
#     num_ad = 2 # to counter ads intbetween
#     iter_val = 0
    
#     # Parse html page to get the urls for nested webpages.
#     for element in range(int(count)):
#         iter_val = element + 1
#         # Adjust numbering based on webpage structure.
#         if iter_val <= 4 :
#             iter_val = iter_val
#         elif iter_val == int(count):
#             iter_val += int(double_count * num_ad) 
#         else:
#             if (iter_val % 5 == 0) & (iter_val != int(count)):
#                 double_count += 1
#                 iter_val += int(double_count * num_ad)
#             else:
#                 iter_val += int(double_count * num_ad)
#         value = iter_val + 1
    
#         # Use xpath to retrieve the necessary content.
#         XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_filtered_out_hotels_0"]/div/div[' + str(value) + ']/div/div[1]/@data-url')
        
#         if str(XPATH)[2:-2] == '':
#             XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_filtered_out_hotels_0"]/div[' + str(value) + ']/div/div[1]/@data-url')

#         hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
#     return hotel_url
# In [9]:
# def get_hotel_url_nomatches(count, html):
#     """""
#     Input: 
#     -----
#         count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
#         html- lxml.etree instance(html), used to parse the file.
    
#     Output:
#     ------
#         return the extracted full hotel URL(hotel_url) from the base webpage.
    
#     Functionality:
#     -------------
#         - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
#         - This function considers only other search results (if exact results and related results for a page are nil, 
#           this will be called) for a given category of hotels.
#     """""
#     hotel_url = []
#     double_count = 0
#     num_ad = 2 # to counter ads intbetween
#     iter_val = 0
    
#     # Parse html page to get the urls for nested webpages.
#     for element in range(int(count)):
#         iter_val = element + 1
#         # Adjust numbering based on webpage structure.
#         if iter_val <= 4 :
#             iter_val = iter_val
#         elif iter_val == int(count):
#             iter_val += int(double_count * num_ad) 
#         else:
#             if (iter_val % 5 == 0) & (iter_val != int(count)):
#                 double_count += 1
#                 iter_val += int(double_count * num_ad)
#             else:
#                 iter_val += int(double_count * num_ad)
    
#         # Use xpath to retrieve the necessary content.
#         XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_ab_hotels_sponsored_0"]/div[' + str(iter_val) + ']/div/div[1]/@data-url')
        
#         if str(XPATH)[2:-2] == '':
#             XPATH =  html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_ab_hotels_sponsored_0"]/div[' + str(iter_val) + ']/div/div/div/div[1]/@data-url')

#         hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
#     return hotel_url
# In [8]:
# # Extract related content from the tree using XPath for the MainPage of TripAdvisor hotels canada.
# count_file = 0

# # Request html page
# url = str('https://www.tripadvisor.ca/Hotels-g153339-Canada-Hotels.html')
# html, count_file = download_html_page(url, count_file)

# # Create xpath to access necessary content
# XPATH_MAINPAGE = '//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"]'
# hotel_main_page = html.xpath(XPATH_MAINPAGE)

# # Get count of elements of interest in html page
# count = html.xpath('count(//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"])')
# # print(count)

# # Get the parsed html etree
# hotel_url_returned = get_hotel_url(count, html)
# print('Content Extracted..')
# TripAdvisor- hotel HTML page downloaded successfully..
# File read successfully..
# Content Extracted..
# In [9]:
# # Convert list to dataframe
# hotel_url_df = pd.DataFrame(np.array(hotel_url_returned))
# print(hotel_url_df[:30])
# print('30 Hotel URLs retrieved..')

# #  Write to csv file
# hotel_url_df.to_csv('Project-Dataset/final-data/file_mainpage_url.txt', index=False)
# print('Written on to file..')

In [None]:
#!/usr/bin/env python
from datetime import datetime
from time import time
from lxml import html,etree
import requests,re
import os,sys
import unicodecsv as csv
import argparse

def parse(locality,checkin_date,checkout_date,sort):
    checkIn = checkin_date.strftime("%Y/%m/%d")
    checkOut = checkout_date.strftime("%Y/%m/%d")
    print ("Scraper Inititated for Locality:%s"%locality)
    # TA rendering the autocomplete list using this API
    print ("Finding search result page URL")
    geo_url = 'https://www.tripadvisor.com/TypeAheadJson?action=API&startTime='+str(int(time()))+'&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true&query='+locality
    api_response  = requests.get(geo_url, verify=False).json()
    #getting the TA url for th equery from the autocomplete response
    url_from_autocomplete = "http://www.tripadvisor.com"+api_response['results'][0]['url']
    print ('URL found %s'%url_from_autocomplete)
    geo = api_response['results'][0]['value']   
    #Formating date for writing to file 
    
    date = checkin_date.strftime("%Y_%m_%d")+"_"+checkout_date.strftime("%Y_%m_%d")
    #form data to get the hotels list from TA for the selected date
    form_data = {'changeSet': 'TRAVEL_INFO',
            'showSnippets': 'false',
            'staydates':date,
            'uguests': '2',
            'sortOrder':sort
    }
    #Referrer is necessary to get the correct response from TA if not provided they will redirect to home page
    headers = {
                            'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
                            'Accept-Encoding': 'gzip,deflate',
                            'Accept-Language': 'en-US,en;q=0.5',
                            'Cache-Control': 'no-cache',
                            'Connection': 'keep-alive',
                            'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
                            'Host': 'www.tripadvisor.com',
                            'Pragma': 'no-cache',
                            'Referer': url_from_autocomplete,
                            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
                            'X-Requested-With': 'XMLHttpRequest'
                        }
    cookies=  {"SetCurrency":"USD"}
    print ("Downloading search results page")
    page_response  = requests.post(url = url_from_autocomplete,data=form_data,headers = headers, cookies = cookies, verify=False)
    print( "Parsing results ")
    parser = html.fromstring(page_response.text)
    hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]')
    hotel_data = []
    if not hotel_lists:
        hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[@class="listing "]')

    for hotel in hotel_lists:
        XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
        XPATH_REVIEWS  = './/a[@class="review_count"]//text()'
        XPATH_RANK = './/div[@class="popRanking"]//text()'
        XPATH_RATING = './/span[contains(@class,"rating")]/@alt'
        XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
        XPATH_HOTEL_FEATURES = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
        XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
        XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()' 
        XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()'

        raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
        raw_no_of_deals =  hotel.xpath(XPATH_VIEW_DEALS)
        raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
        raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
        raw_rank = hotel.xpath(XPATH_RANK)
        raw_rating = hotel.xpath(XPATH_RATING)
        raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
        raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
        raw_hotel_price_per_night  = hotel.xpath(XPATH_HOTEL_PRICE)

        url = 'http://www.tripadvisor.com'+raw_hotel_link[0] if raw_hotel_link else  None
        reviews = ''.join(raw_no_of_reviews).replace("reviews","").replace(",","") if raw_no_of_reviews else 0 
        rank = ''.join(raw_rank) if raw_rank else None
        rating = ''.join(raw_rating).replace('of 5 bubbles','').strip() if raw_rating else None
        name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
        hotel_features = ','.join(raw_hotel_features)
        price_per_night = ''.join(raw_hotel_price_per_night).encode('utf-8').replace('\n','') if raw_hotel_price_per_night else None
        no_of_deals = re.findall("all\s+?(\d+)\s+?",''.join(raw_no_of_deals))
        booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None

        if no_of_deals:
            no_of_deals = no_of_deals[0]
        else:
            no_of_deals = 0
            
        data = {
                    'hotel_name':name,
                    'url':url,
                    'locality':locality,
                    'reviews':reviews,
                    'tripadvisor_rating':rating,
                    'checkOut':checkOut,
                    'checkIn':checkIn,
                    'hotel_features':hotel_features,
                    'price_per_night':price_per_night,
                    'no_of_deals':no_of_deals,
                    'booking_provider':booking_provider

        }
        hotel_data.append(data)
    return hotel_data

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('checkin_date',help = 'Hotel Check In Date (Format: YYYY/MM/DD')
    parser.add_argument('checkout_date',help = 'Hotel Chek Out Date (Format: YYYY/MM/DD)')
    sortorder_help = """
    available sort orders are :\n
    priceLow - hotels with lowest price,
    distLow : Hotels located near to the search center,
    recommended: highest rated hotels based on traveler reviews,
    popularity :Most popular hotels as chosen by Tipadvisor users 
    """
    parser.add_argument('sort',help = sortorder_help,default ='popularity ')
    parser.add_argument('locality',help = 'Search Locality')
    args = parser.parse_args()
    locality = args.locality
    checkin_date = datetime.strptime(args.checkin_date,"%Y/%m/%d")
    checkout_date = datetime.strptime(args.checkout_date,"%Y/%m/%d")
    sort= args.sort
    checkIn = checkin_date.strftime("%Y/%m/%d")
    checkOut = checkout_date.strftime("%Y/%m/%d")
    today = datetime.now()
   
    if today<datetime.strptime(checkIn,"%Y/%m/%d") and datetime.strptime(checkIn,"%Y/%m/%d")<datetime.strptime(checkOut,"%Y/%m/%d"):
        data = parse(locality,checkin_date,checkout_date,sort)
        print ("Writing to output file tripadvisor_data.csv")
        with open('tripadvisor_data.csv','w')as csvfile:
            fieldnames = ['hotel_name','url','locality','reviews','tripadvisor_rating','checkIn','checkOut','price_per_night','booking_provider','no_of_deals','hotel_features']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in  data:
                writer.writerow(row)
    #checking whether the entered date is already passed
    elif today>datetime.strptime(checkIn,"%Y/%m/%d") or today>datetime.strptime(checkOut,"%Y/%m/%d"):
        print ("Invalid Checkin date: Please enter a valid checkin and checkout dates,entered date is already passed")
    
    elif datetime.strptime(checkIn,"%Y/%m/%d")>datetime.strptime(checkOut,"%Y/%m/%d"):
        print ("Invalid Checkin date: CheckIn date must be less than checkOut date")

In [None]:
import re
import selenium
import io
import requests
import bs4
import urllib.request
import urllib.parse
from selenium import webdriver
import csv
from selenium.webdriver.common.action_chains import ActionChains
import time
from _datetime import datetime
from selenium.webdriver.common.keys import Keys

options=webdriver.ChromeOptions()
options.headless=False
prefs={"profile.default_content_setting_values.notofications" :2}
options.add_experimental_option("prefs",prefs)
driver=webdriver.Chrome("D:\Python Exercise\chromedriver.exe")
driver.maximize_window()
time.sleep(5)
driver.get("https://www.tripadvisor.in/")
#driver.find_element_by_id("brand-quick-links-QuickLinkTileItem__link--1k5lE").click()
#driver.find_element_by_id("userId").send_keys(email)
#driver.find_element_by_id("pwd").send_keys(pswd)
time.sleep(20)

#driver.find_element_by_xpath('//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/span/div[2]').click()
#time.sleep(5)
driver.find_element_by_xpath('//*[@id="component_4"]/div/div/div/span[1]/div/div/a').click()
driver.find_element_by_xpath('//*[@id="c_targeted_flyout_1"]/div/div/div[1]/div[1]/div/input').send_keys("kolkata",Keys.ENTER)
time.sleep(1)

url = driver.current_url
print(url)

responce=requests.get(url)
responce=responce.text
data=bs4.BeautifulSoup(responce,'lxml')

read1=data.select(".listing_title")
# print(len(read1))
# print(read1[0].text)

read2=data.select(".price-wrap ")
price=[]
for i in read2:
    x=i.text
    x=x.replace("₹\xa0"," ")
    x=x.lstrip()
    x=x.split(" ")

    if (len(x)>1):
        price.append(str(x[1]))
    else:
        price.append(str(x[0]))
print(price)


name=[]
for i in range(len(read1)):
    x=read1[i].text
    name.append(x)


import pandas as pd
df=pd.DataFrame(name,price)
print (df)

In [1]:
from  geopy.geocoders import Nominatim
geolocator = Nominatim()
city ="Alappuzha"
country ="India"
loc = geolocator.geocode(city+','+ country)
print("latitude is :-" ,loc.latitude,"\nlongtitude is:-" ,loc.longitude)

  


latitude is :- 9.48870055 
longtitude is:- 76.41256410969626


In [2]:
from  geopy.geocoders import Nominatim


In [3]:
locator = Nominatim(user_agent="myGeocoder")
location = locator.geocode("North Aryad,Alappuzha, India")

print("Latitude = {}, Longitude = {}".format(location.latitude, location.longitude))


Latitude = 9.5401618, Longitude = 76.33118867409746


In [5]:
location = locator.geocode("Alappuzha")
print("Latitude = {}, Longitude = {}".format(location.latitude, location.longitude))

Latitude = 9.48870055, Longitude = 76.41256410969626
