# Scrape Visa Requirement -- US Citizen

# 1. Dependencies

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from matplotlib import pyplot as plt

#from splinter import Browser

In [5]:
# Locate the chromedriver binary on PATH. The path printed below is the one
# hard-coded in the "Launch Driver" cell, so update that cell if it differs.
!which chromedriver

/usr/local/bin/chromedriver


# 2. Scrape Data

## 2.1 Launch Driver

In [6]:
# Launch a visible (non-headless) Chrome window through splinter; it opens on
# an empty page.
# Browser is imported here because the top-level splinter import is commented
# out, so on a fresh kernel the name would otherwise raise NameError.
from splinter import Browser

# Path reported by `which chromedriver`; adjust if the driver lives elsewhere.
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

## 2.2 Scrape Wiki Page for Visa Requirements for US Citizens

In [7]:
# Open the Wikipedia article on visa requirements for US citizens in the
# running browser session so its rendered HTML can be parsed afterwards.
url = "https://en.wikipedia.org/wiki/Visa_requirements_for_United_States_citizens"
browser.visit(url)

In [94]:
# Parse every HTML table on the rendered page; the first one is used as the
# country-by-country visa requirement table.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated since pandas 2.1.
tables = pd.read_html(StringIO(browser.html))
visa_df = tables[0]

In [95]:
# Drop the unnecessary columns and rename the remaining headers to snake_case.
# Written as a chain that rebinds `visa_df` instead of mutating it in place;
# errors="ignore" keeps the cell safe to re-run once 'Allowed stay' is gone.
col_to_drop = ['Allowed stay']
visa_df = (
    visa_df
    .drop(columns=col_to_drop, errors="ignore")
    .rename(columns={'Country': 'country',
                     "Visa requirement": 'visa_requirement',
                     "Notes (excluding departure fees)": 'notes'})
)
visa_df.head()

Unnamed: 0,country,visa_requirement,notes
0,Afghanistan,Visa required[2][3],Visitors born in Afghanistan do not require a ...
1,Albania,Visa not required[5][6],
2,Algeria,Visa required[8][9],Persons may be denied entry if entering with a...
3,Andorra,Visa not required[10],
4,Angola,eVisa[13][14][15],Visitors who have been granted an online pre-v...


In [96]:
# Clean up the citation marks: keep only the text before the first "[" so
# that e.g. "Visa required[2][3]" becomes "Visa required".
# Vectorised through the .str accessor and addressed by column name rather
# than by position, so missing values stay NaN instead of raising.
visa_df['visa_requirement'] = visa_df['visa_requirement'].str.split('[', n=1).str[0]

In [97]:
# Count how many rows carry each distinct requirement label.
visa_df['visa_requirement'].value_counts()

Visa not required                  107
Visa required                       30
Visa on arrival                     19
eVisa / Visa on arrival             18
eVisa                               11
Travel restricted                    1
Entry Permit on arrival              1
Visitor's Permit on arrival          1
Visitor's permit on arrival          1
eVisa / Tourist card on arrival      1
Tourist Card required                1
Electronic Travel Authority          1
Online Visa                          1
Name: visa_requirement, dtype: int64

In [98]:
# Collapse the many requirement labels into a few coarse categories.
auto_visa_key_words = ["on arrival", 'eVisa', 'Electronic', 'Online']
restrict_list = ["Tourist Card required", "Travel restricted"]

requirement = visa_df['visa_requirement']

# A label containing any keyword (case-sensitive, literal substring) is
# grouped as "eVisa/Visa on arrival". na=False leaves missing labels untouched.
is_auto_visa = pd.Series(False, index=visa_df.index)
for word in auto_visa_key_words:
    is_auto_visa |= requirement.str.contains(word, regex=False, na=False)

# Labels that exactly match the restricted list; rows already classified by
# the keyword rule are excluded so that rule takes precedence.
is_restricted = requirement.isin(restrict_list) & ~is_auto_visa

# Assign through .loc with boolean masks. Chained indexing
# (visa_df[col][i] = ...) triggers SettingWithCopyWarning and writes to a
# temporary under pandas Copy-on-Write, leaving the frame unchanged.
visa_df.loc[is_auto_visa, 'visa_requirement'] = "eVisa/Visa on arrival"
visa_df.loc[is_restricted, 'visa_requirement'] = "Travel restricted"

In [99]:
# Re-count the requirement labels to inspect the resulting categories.
visa_df['visa_requirement'].value_counts()

Visa not required        107
eVisa/Visa on arrival     54
Visa required             30
Travel restricted          2
Name: visa_requirement, dtype: int64

In [87]:
# Preview the first rows of the visa table.
visa_df.head()

Unnamed: 0,country,visa_requirement,notes
0,Afghanistan,Visa required,Visitors born in Afghanistan do not require a ...
1,Albania,Visa not required,
2,Algeria,Visa required,Persons may be denied entry if entering with a...
3,Andorra,Visa not required,
4,Angola,eVisa/Visa on arrival,Visitors who have been granted an online pre-v...


## 2.3 Read the GDP_by_country.csv file

In [40]:
# The CSV file is in the resources directory
# The source website is https://data.worldbank.org/indicator/ny.gdp.mktp.cd
# A forward slash is used as the separator: a backslash here forms the invalid
# escape "\G" and only resolves as a path on Windows, while "/" works on
# every platform.
gdp_filename = "resources/GDP_by_country.csv"
gdp_df = pd.read_csv(gdp_filename)
gdp_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2498883000.0,2390503000.0,2549721000.0,2534637000.0,2581564000.0,2649721000.0,2691620000.0,2646927000.0,2700559000.0,
1,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777811.1,548888895.6,546666677.8,751111191.1,800000044.4,1006667000.0,...,12439090000.0,15856570000.0,17804280000.0,20001620000.0,20561050000.0,20484870000.0,19907110000.0,19362640000.0,20191760000.0,19362970000.0
2,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,70307160000.0,83799500000.0,111790000000.0,128053000000.0,136710000000.0,145712000000.0,116194000000.0,101124000000.0,122124000000.0,105751000000.0
3,Albania,ALB,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,12044210000.0,11926950000.0,12890870000.0,12319780000.0,12776280000.0,13228250000.0,11386930000.0,11861350000.0,13025060000.0,15058880000.0
4,Andorra,AND,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,3660531000.0,3355695000.0,3442063000.0,3164615000.0,3281585000.0,3350736000.0,2811489000.0,2877312000.0,3013387000.0,3236544000.0


In [41]:
# Remove the code/indicator columns, leaving the country name and the yearly
# GDP columns. Rebinding with errors="ignore" (instead of an in-place drop)
# keeps the cell safe to re-run after the columns are already gone.
gdp_df = gdp_df.drop(
    columns=["Country Code", "Indicator Name", "Indicator Code"],
    errors="ignore",
)
gdp_df.head()

Unnamed: 0,Country Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Aruba,,,,,,,,,,...,2498883000.0,2390503000.0,2549721000.0,2534637000.0,2581564000.0,2649721000.0,2691620000.0,2646927000.0,2700559000.0,
1,Afghanistan,537777811.1,548888895.6,546666677.8,751111191.1,800000044.4,1006667000.0,1400000000.0,1673333000.0,1373333000.0,...,12439090000.0,15856570000.0,17804280000.0,20001620000.0,20561050000.0,20484870000.0,19907110000.0,19362640000.0,20191760000.0,19362970000.0
2,Angola,,,,,,,,,,...,70307160000.0,83799500000.0,111790000000.0,128053000000.0,136710000000.0,145712000000.0,116194000000.0,101124000000.0,122124000000.0,105751000000.0
3,Albania,,,,,,,,,,...,12044210000.0,11926950000.0,12890870000.0,12319780000.0,12776280000.0,13228250000.0,11386930000.0,11861350000.0,13025060000.0,15058880000.0
4,Andorra,,,,,,,,,,...,3660531000.0,3355695000.0,3442063000.0,3164615000.0,3281585000.0,3350736000.0,2811489000.0,2877312000.0,3013387000.0,3236544000.0


In [42]:
# Reduce the GDP table to a single "GDP 2018" column indexed by country.
# Column labels for 1960 through 2017 are generated and removed, leaving 2018.
years = np.arange(1960, 2018)
years_str = list(map(str, years))
gdp_df.drop(labels=years_str, axis="columns", inplace=True)

# Discard rows that have a missing value in any remaining column.
gdp_df.dropna(axis="index", how="any", inplace=True)

# Normalise the headers, then promote the country name to the index.
gdp_df.rename(columns={"Country Name": "country", "2018": "GDP 2018"}, inplace=True)
gdp_df.set_index(keys="country", inplace=True)
gdp_df.head()

Unnamed: 0_level_0,GDP 2018
country,Unnamed: 1_level_1
Afghanistan,19362970000.0
Angola,105751000000.0
Albania,15058880000.0
Andorra,3236544000.0
Arab World,2781330000000.0


## 2.4 Scrape Brilliant Maps page for Top 100 Tourist Destinations

In [14]:
# URL of page to be scraped
url = "https://brilliantmaps.com/top-100-tourist-destinations/"

# Retrieve page. The timeout stops the cell from hanging on an unresponsive
# host, and raise_for_status() surfaces HTTP errors here instead of as a
# confusing parsing failure further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(response.text, 'html.parser')

# Examine the start of the document, then determine the element that contains
# the sought info. Only the first 2000 characters are printed so the notebook
# is not flooded with the whole page.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en-US" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article#">
 <head>
  <meta charset="utf-8"/>
  <meta content="Which city is the most popular international tourist destination? Surprisingly, it's not Paris, London or New York. Click to find out which city came out on top." name="description"/>
  <meta content="noodp,noydir" name="robots"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Top 100 International Tourist Destination Cities By Country – Brilliant Maps
  </title>
  <link href="//fonts.googleapis.com" rel="dns-prefetch"/>
  <link href="https://brilliantmaps.com/feed/" rel="alternate" title="Brilliant Maps » Feed" type="application/rss+xml"/>
  <link href="https://brilliantmaps.com/comments/feed/" rel="alternate" title="Brilliant Maps » Comments Feed" type="application/rss+xml"/>
  <link href="https://brilliantmaps.com/top-100-tourist-destinations/feed/" rel="alternate" title

In [15]:
# Get the Top Destinations table, show the top 20 destinations.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated since pandas 2.1.
destinations = pd.read_html(StringIO(response.text))
top_destinations = destinations[0]
top_destinations.head(20)

Unnamed: 0,Rank,City,Country,Tourists (Millions)
0,1,Hong Kong,Hong Kong,23.7
1,2,Singapore,Singapore,21.3
2,3,Bangkok,Thailand,15.8
3,4,London,UK,15.5
4,5,Macau,Macau,13.4
5,6,Kuala Lumpur,Malaysia,13.3
6,7,Shenzhen,China,12.1
7,8,New York City,USA,11.6
8,9,Antalya,Turkey,10.3
9,10,Paris,France,9.8


In [35]:
# Note that some countries have more than one city in the Global Top 100.
# Find the most popular city to visit in each country: within each country,
# keep the row with the smallest Rank.
# Written as one chain so no in-place method runs on a .loc selection and
# `top_destinations` itself is left untouched.
top_cities_by_country = (
    top_destinations
    .loc[top_destinations.groupby(["Country"])["Rank"].idxmin()]
    .rename(columns={"Country": "country",
                     "City": "city",
                     "Tourists (Millions)": "tourists (mm)"})
    .set_index("country")
    .drop(columns="Rank")
)
top_cities_by_country.head(20)

Unnamed: 0_level_0,city,tourists (mm)
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,Buenos Aires,3.2
Australia,Sydney,2.7
Austria,Vienna,4.3
Azerbaijan,Baku,1.7
Bahrain,Manama,2.0
Belgium,Brussels,2.3
Brazil,Rio de Janeiro,1.8
Bulgaria,Sofia,4.1
Cambodia,Siem Reap,1.9
Canada,Toronto,2.8


## 2.5 Read a JSON file with national dishes by country

In [2]:
# This json file comes from someone's github directory
# https://github.com/samayo/country-json/blob/master/src/country-by-national-dish.json

# A forward slash is used as the separator: a backslash here forms the invalid
# escape "\c" and only resolves as a path on Windows, while "/" works on
# every platform.
filepath = "resources/country-by-national-dish.json"

# NOTE(review): the preview shows a garbled accented character ("Tav‰ kosi"),
# so the local file may not be UTF-8 encoded -- confirm and pass `encoding=`
# to read_json if needed.
yummy_df = pd.read_json(filepath).set_index("country")
yummy_df.head()

Unnamed: 0_level_0,dish
country,Unnamed: 1_level_1
Afghanistan,Kabuli Palaw
Albania,Tav‰ kosi
Algeria,Couscous
American Samoa,
Andorra,
